## Upload Files to Cloud Storage

In [None]:
import os
from glob import glob

import boto3
from dotenv import find_dotenv, load_dotenv

## About

This notebook uploads files to private, personal cloud storage (an AWS S3 bucket with all public access blocked).

The following files will be uploaded; they must exist before running this notebook:
- `data/raw/combo_batched_listings.zip`
  - zipped file with listings data retrieved using Selenium
  - consists of multiple `.zip` files (e.g. `batched_listings__<page-number>_<listing-range>__*.zip`)
- `data/raw/combo_batched_listings_requests.zip`
  - zipped file with listings data retrieved using the Python `requests` library
  - consists of multiple `.zip` files (e.g. `batched_listings__<page-number>_<listing-range>__*.zip`)
- `data/raw/combo_batched_search_results.zip`
  - zipped file with search results retrieved using Selenium
- `data/raw/combo_batched_search_results_requests.zip`
  - zipped file with search results retrieved using the Python `requests` library
- `data/raw/listings.parquet.gzip`
  - combined contents of all listings data retrieved using Selenium
  - taken from files in `data/raw/combo_batched_listings.zip`
- `data/raw/listings_requests.parquet.gzip`
  - combined contents of all listings data retrieved using the Python `requests` library
  - taken from files in `data/raw/combo_batched_listings_requests.zip`
- `data/raw/search_results.parquet.gzip`
  - combined contents of all search results retrieved using Selenium
  - taken from files in `data/raw/combo_batched_search_results.zip`
- `data/raw/search_results_requests.parquet.gzip`
  - combined contents of all search results retrieved using the Python `requests` library
  - taken from files in `data/raw/combo_batched_search_results_requests.zip`

All files will be deleted from cloud storage on December 31, 2022.

## User Inputs

In [None]:
# Local directory whose contents are uploaded to cloud storage.
data_dir = "data/raw"

In [None]:
# Load settings from the nearest `.env` file into the process environment.
load_dotenv(find_dotenv())
aws_region = os.getenv("AWS_REGION", default="us-east-2")
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME")
# Fail fast with a clear message: without this check a missing variable leaves
# `s3_bucket_name` as None and the error only surfaces later inside boto3.
if not s3_bucket_name:
    raise ValueError(
        "Environment variable AWS_S3_BUCKET_NAME is not set. Define it in the "
        "environment or in a .env file before running this notebook."
    )

In [None]:
def block_public_access_to_s3_bucket(
    s3_bucket_name: str,
    aws_region: str = "us-east-2",
) -> None:
    """Block all public access to an S3 bucket.

    Turns on all four public-access-block settings (``BlockPublicAcls``,
    ``IgnorePublicAcls``, ``BlockPublicPolicy`` and ``RestrictPublicBuckets``)
    for the bucket and prints whether the request returned HTTP status 200.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the bucket whose public access is blocked.
    aws_region : str, optional
        Region used to create the S3 client (default ``"us-east-2"``).

    Returns
    -------
    None
        The outcome is reported with ``print`` only.

    Notes
    -----
    A non-200 status is reported but not raised. Exceptions raised by the
    boto3 call itself are not caught here and propagate to the caller.
    """
    s3_client = boto3.client("s3", region_name=aws_region)
    set_public_access_response = s3_client.put_public_access_block(
        Bucket=s3_bucket_name,
        PublicAccessBlockConfiguration={
            "BlockPublicAcls": True,
            "IgnorePublicAcls": True,
            "BlockPublicPolicy": True,
            "RestrictPublicBuckets": True,
        },
    )
    http_status_code = set_public_access_response["ResponseMetadata"][
        "HTTPStatusCode"
    ]
    # Compare explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would make the success message unconditional.
    if http_status_code == 200:
        print(f"Bucket {s3_bucket_name} access blocked successfully")
    else:
        print(
            f"Bucket {s3_bucket_name} access was not successfully blocked. "
            f"Got error message:\n"
            f"Expected HTTP status code 200, got {http_status_code}"
        )

## Create AWS Python SDK Objects

In [None]:
# S3 client bound to the configured region.
s3_client = boto3.client("s3", region_name=aws_region)

## Create Cloud Storage Object

Create AWS S3 bucket

In [None]:
%%time
bucket_creation_response = s3_client.create_bucket(
    ACL='private',
    Bucket=s3_bucket_name,
    CreateBucketConfiguration={"LocationConstraint": aws_region},
)
assert bucket_creation_response['ResponseMetadata']['HTTPStatusCode'] == 200

Block public access to S3 bucket

In [None]:
%%time
block_public_access_to_s3_bucket(s3_bucket_name, aws_region)

## Upload Files to Cloud Storage

In [None]:
%%time
for f in glob(f"{data_dir}/*"):
    print(f"Uploading file from {f} to bucket {s3_bucket_name}...")
    s3_client.upload_file(f, s3_bucket_name, os.path.basename(f))
    print("Done.")

## List All Files in Cloud Storage

In [None]:
%%time
obj_list = [f['Key'] for f in s3_client.list_objects_v2(Bucket=s3_bucket_name)['Contents']]
obj_list