In [None]:
# Install (or upgrade) the third-party packages this notebook needs.
# `%pip` is used so the install targets the running kernel's environment.
# NOTE(review): versions are not pinned, so `--upgrade` may pull different
# releases on each run — consider pinning for reproducibility.
%pip install xgboost boto3 awswrangler seaborn --upgrade

## Downloading the data

In [None]:
import pandas as pd
import numpy as np
import boto3

# To handle importing a certain number of rows from parquet files
import awswrangler as wr

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib

# Default size (width, height) applied to every matplotlib figure in this notebook
matplotlib.rcParams['figure.figsize'] = (10, 10)


In [None]:
# Presumably the maximum number of records to work with.
# NOTE(review): this constant is not referenced by any cell visible in this
# part of the notebook — confirm it is used further on, or remove it.
NUMBER_OF_RECORDS_USED = 1_000_000

In [None]:
import boto3
def list_s3_files_in_folder_using_client(bucket_name, prefix):
    """
    Print how many objects live under an S3 prefix and their combined size.

    The listing is paginated, so prefixes that hold more keys than a single
    ``list_objects_v2`` response can carry are counted in full. A prefix that
    matches no objects reports zero files instead of raising.

    :param bucket_name: name of the S3 bucket to inspect
    :param prefix: key prefix ("folder") whose objects are counted
    :return: None
    """
    s3_client = boto3.client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")

    file_count = 0
    size = 0
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # "Contents" is missing from a page when nothing matches the prefix.
        for file in page.get("Contents") or []:
            file_count += 1
            size += file['Size']
    # The byte total is reported in decimal gigabytes (bytes / 10^9).
    print(f"File counts: {file_count}; Total size: {size/1_000_000_000:,} GB")
    

In [None]:
# Print the object count and total size found under the NYC taxi parquet prefix of the `dsoaws` bucket
list_s3_files_in_folder_using_client('dsoaws', 'nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files')

In [None]:
%%time

# Using aws wrangler
nyc_df_fare = wr.s3.read_parquet(path="s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-fare/year=2019", dataset=True)
nyc_df_fare

In [None]:
# (rows, columns) of the ride-fare frame
nyc_df_fare.shape

In [None]:
%%time

# Using aws wrangler
nyc_df_info = wr.s3.read_parquet(path="s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-info/year=2019", dataset=True)
nyc_df_info


In [None]:
%%time

# Join different tables
nyc_df = nyc_df_fare.merge(nyc_df_info, on='ride_id')

In [None]:
%%time

nyc_df.head(10)
nyc_df.info()


## Data Preprocessing

In [None]:
# Count the missing entries in each column
nyc_df.isna().sum()

## EDA

In [None]:
# List the column names of the merged frame
nyc_df.columns

In [None]:
# Add a log(total_amount + 1) column and plot its histogram using 100 bins
nyc_df['log_total_amount'] = np.log(nyc_df['total_amount'].values + 1)

sns.histplot(nyc_df['log_total_amount'].values, bins=100)
plt.title('Distribution of total amount')
plt.show()

In [None]:
# Mean total_amount for every passenger_count value
df = nyc_df.groupby('passenger_count')['total_amount'].mean()

# One bar per passenger_count, bar height = mean total_amount
sns.barplot(x=df.index, y=df.values)
plt.title('Distribution of total_amount with respect to the passenger_count')
plt.show()

In [None]:
# Summary statistics of total_amount (the minimum reveals any negative amounts)
nyc_df['total_amount'].describe()

It looks like there are some negative values in `total_amount`; we remove those rows.

In [None]:
# Keep only the rides whose total_amount is zero or positive
nyc_df = nyc_df.loc[nyc_df['total_amount'].ge(0)]
nyc_df

In [None]:
# Print the frame summary again now that rows with a negative total_amount are gone
nyc_df.info()

### Drop low-value columns and move the target column to the front

In [None]:
# Columns excluded from the dataset that is split and saved below
dropped_columns = ['ride_id', 'vendor_id', 'year_x', 'pickup_at', 'dropoff_at', 'store_and_fwd_flag', 'year_y', 'log_total_amount']
columns_to_keep = [col for col in nyc_df.columns.tolist() if col not in dropped_columns]
# Put the target column (total_amount) first, followed by the remaining kept columns
rearranged_cols = ['total_amount'] + [col for col in columns_to_keep if col != 'total_amount']
df = nyc_df[rearranged_cols]
# display() is needed: a bare df.head(5) that is not the last expression is never rendered
display(df.head(5))
df.info()

## Splitting the data

Since we need separate data to train and validate the model, let's split the data into a training set and a validation set using `train_test_split` from scikit-learn's `sklearn.model_selection` module.

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable
train_df, validation_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training split
train_df

In [None]:
%%time

# Print the column/dtype summary of the training split
train_df.info()

In [None]:
# display() is needed: a bare head(5) that is not the last expression of the cell is never rendered
display(validation_df.head(5))
# Print the column/dtype summary of the validation split
validation_df.info()

### Save to S3

In [None]:
# Persist the train / validation splits as parquet datasets in the session's default bucket
import sagemaker
import awswrangler as wr

sagemaker_session = sagemaker.Session()
output_bucket = sagemaker_session.default_bucket()

# Key prefix under which both splits are stored
prefix = 'nyc-taxi-2019-single-notebook-parquet'
output_key_prefix = prefix
print(f'Output S3: s3://{output_bucket}/{output_key_prefix}')

train_output_s3_uri = f's3://{output_bucket}/{output_key_prefix}/train/'
validation_output_s3_uri = f's3://{output_bucket}/{output_key_prefix}/validation/'

# API reference:
# https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
# mode='overwrite' replaces whatever is already stored under each URI;
# each output file holds at most 5,000,000 rows.
wr.s3.to_parquet(
    df=train_df.copy(),
    path=train_output_s3_uri,
    dataset=True,
    mode='overwrite',
    max_rows_by_file=5_000_000,
)
wr.s3.to_parquet(
    df=validation_df.copy(),
    path=validation_output_s3_uri,
    dataset=True,
    mode='overwrite',
    max_rows_by_file=5_000_000,
)

print(f'Output train data s3 URI: {train_output_s3_uri}')
print(f'Output validation data s3 URI: {validation_output_s3_uri}')