# Get Data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import io
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from multiprocessing import cpu_count
from typing import List
from urllib.request import urlretrieve

import boto3
import pandas as pd
# from contexttimer import Timer
# from joblib import Parallel, delayed

In [None]:
# Make the project's `src` directory importable from this notebook.
# Both paths are relative, so they resolve against the current working directory.
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
# Guard the append so that re-running this cell does not add duplicate entries
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
%aimport pandas_utils
import pandas_utils as pu

## About

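Download monthly `green_tripdata` parquet files, filter and convert them, and upload the results to a private S3 bucket, using a notebook running inside a dev container.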

## User Inputs

In [None]:
# --- Dataset ---
# URL prefix shared by every data file
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata"
# year-month (YYYY-MM) identifiers of the data files to load
ym_list = ["2023-01", "2023-02", "2023-03"]
# columns to read from each data file
cols_to_load = ["lpep_pickup_datetime", "RatecodeID", "trip_distance"]

# --- AWS ---
# name of the private S3 bucket
s3_bucket_name = "oss-shared-scratchp"
# profile name as listed in .aws/credentials
aws_profile_name = "default"

In [None]:
# One parquet URL per requested year-month
urls = ["{}_{}.parquet".format(base_url, ym) for ym in ym_list]

## Use `boto3`

Verify the following:

1. The `~/.aws` folder exists inside the container.
2. The AWS profile specified in the **User Inputs** section is found in the `~/.aws/credentials` file.

In [None]:
# verify .aws folder exists
assert '.aws' in os.listdir(os.path.expanduser("~"))
# verify required profile is found in .aws/credentials
assert aws_profile_name in boto3.session.Session().available_profiles

Define S3 client

In [None]:
# Session bound to the configured credentials profile, and an S3 client created from it
session = boto3.Session(profile_name=aws_profile_name)
s3_client = session.client(service_name="s3")

Verify S3 buckets can be accessed

In [None]:
# List the buckets once and reuse the result for both checks
bucket_names = [b['Name'] for b in s3_client.list_buckets()['Buckets']]
assert len(bucket_names) >= 1, "No S3 buckets are visible to this AWS profile"
assert s3_bucket_name in bucket_names, (
    f"S3 bucket '{s3_bucket_name}' not found among the accessible buckets"
)

## Get Data

### Run ETL Pipeline to Process Data

In [None]:
def run_etl_process(
    url: str, columns: List[str], s3_bucket_name: str
) -> pd.DataFrame:
    """Extract one parquet file, convert its dtypes and upload it to S3.

    The file at ``url`` is read with only ``columns`` loaded and only the
    rows whose ``RatecodeID`` is 2, 3, 4 or 5. The result is passed through
    ``pu.convert_dtypes_auto``, serialized as gzip-compressed parquet and
    uploaded under the ``raw/`` prefix of ``s3_bucket_name`` using the
    notebook-level ``s3_client``. A one-line summary with row counts and
    the elapsed time is printed.

    Parameters
    ----------
    url : str
        Location of the source parquet file.
    columns : List[str]
        Columns to read from the source file.
    s3_bucket_name : str
        Name of the S3 bucket that receives the processed file.

    Returns
    -------
    pd.DataFrame
        The transformed data that was uploaded.

    Raises
    ------
    AssertionError
        If the S3 upload response does not report HTTP status 200.
    """
    start_time = time.perf_counter()

    # extract
    df_raw = pd.read_parquet(
        url,
        columns=columns,
        filters=[('RatecodeID', 'in', [2, 3, 4, 5])],
    )

    # transform
    df = df_raw.pipe(pu.convert_dtypes_auto)

    # load
    curr_dtime = datetime.now().strftime("%Y%m%d_%H%M%S")
    src_name = os.path.basename(url).replace('.parquet', '').replace('-', '_')
    proc_fname = f"{src_name}_{curr_dtime}.parquet.gzip"
    # with no path given, to_parquet() returns the serialized bytes directly
    parquet_bytes = df.to_parquet(
        engine="pyarrow", index=False, compression="gzip"
    )
    response = s3_client.put_object(
        Bucket=s3_bucket_name,
        # S3 keys use "/" as separator regardless of the local OS
        Key=f"raw/{proc_fname}",
        Body=parquet_bytes,
    )
    status_code = response['ResponseMetadata']['HTTPStatusCode']
    assert status_code == 200, (
        f"Upload of {proc_fname} returned HTTP status {status_code}"
    )

    duration = time.perf_counter() - start_time

    print(
        f"Loaded {len(df_raw):,} rows and exported {len(df):,} rows of "
        f"data to {proc_fname} in {duration:.3f}s\n"
    )
    return df

In [None]:
%%time
executor = Parallel(n_jobs=cpu_count(), backend='multiprocessing')
tasks = (
    delayed(run_etl_process)(
        f, cols_to_load, s3_bucket_name
    )
    for f in urls
)
df = pd.concat(executor(tasks), ignore_index=True)
print(f"Loaded {len(df):,} rows of data")
pu.show_df(df)
df.info()