# Proposed Solution 1:  SageMaker + Glue Interactive Sessions

In [None]:
%stop_session

In [None]:
%help

In [None]:
%glue_version 3.0
%etl
%additional_python_modules aws-glue-sessions,xgboost, sagemaker, matplotlib, seaborn,psutil
%number_of_workers 50
%worker_type G.2X

In [None]:
# Display the `spark` session object (not defined in this notebook; presumably provided by the Glue kernel - confirm)
spark

In [None]:
import sys

# Print the Python interpreter version string of the environment executing this cell
print(sys.version)

In [None]:
# Source dataset: bucket, key prefix, and the two Parquet locations read below
data_s3_bucket = "dsoaws"
data_s3_prefix = "nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files"
ride_fare_s3_uri = "s3://{}/{}/ride-fare".format(data_s3_bucket, data_s3_prefix)
ride_info_s3_uri = "s3://{}/{}/ride-info".format(data_s3_bucket, data_s3_prefix)

In [None]:
import boto3
def list_s3_files_in_folder_using_client(bucket_name, prefix, s3_client=None):
    """
    Print the number of objects stored under an S3 prefix and their combined size.

    The listing is read page by page, so prefixes holding more keys than a single
    ``list_objects_v2`` response returns (at most 1,000) are counted in full. A
    prefix with no objects is reported as zero files instead of raising.

    :param bucket_name: name of the S3 bucket to inspect
    :param prefix: key prefix ("folder") whose objects are counted
    :param s3_client: optional pre-built S3 client; a default ``boto3`` S3 client
        is created when omitted
    :return: None
    """
    if s3_client is None:
        s3_client = boto3.client("s3")

    file_count = 0
    size = 0
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page for an empty prefix carries no "Contents" key.
        for file in page.get("Contents", []):
            file_count += 1
            size += file["Size"]
    print(f"File counts: {file_count}; Total size: {size/1_000_000_000:,} GB")
    

In [None]:
# Report file count and total size for everything under the dataset prefix (covers both ride-fare and ride-info)
list_s3_files_in_folder_using_client(data_s3_bucket, data_s3_prefix)

In [None]:
# Read the ride-fare Parquet files; recursiveFileLookup makes Spark search nested folders under the URI
df_fare = spark.read.option("recursiveFileLookup", "true").parquet(ride_fare_s3_uri)

# Caching keeps the DataFrame in memory, so subsequent calls can read from memory instead of re-reading the data.
# It is applied to df_fare in the next cell.

In [None]:
# Mark df_fare for caching so the show/count/join calls that follow can reuse it rather than re-reading from S3
df_fare.cache()


In [None]:
# Preview the first 10 ride-fare rows
df_fare.show(10)

In [None]:
# Print the ride-fare row count with thousands separators
print(f"The dataset (ride-fare) has {df_fare.count():,} rows.")

In [None]:
# Load the ride-info Parquet files (searching nested folders), preview ten rows, and report the row count
ride_info_reader = spark.read.option("recursiveFileLookup", "true")
df_info = ride_info_reader.parquet(ride_info_s3_uri)
df_info.show(n=10)
ride_info_rows = df_info.count()
print(f"The dataset (ride-info) has {ride_info_rows:,} rows.")

In [None]:
# Combine fares with ride details on their shared ride_id key (join's default type is inner)
df = df_fare.join(df_info, on='ride_id')
df.show(n=10)
joined_rows = df.count()
print(f"The dataset has {joined_rows:,} rows.")

## Preprocess data

The goal is to predict the `total_amount` (typically called the `fare`) of each ride.  To simplify the pre-processing, we may want to drop certain features like `pickup_at` and `dropoff_at` since taxi fares do not depend on the time of day, typically (unlike ride-share fares like Uber and Lyft).

We may also want to drop unused fields such as `store_and_fwd_flag`, which marks the edge case where the taxi meter was disconnected during the trip.  This should not impact the fare.

TODO:  Describe why we should drop the `rate_code_id` - or otherwise explain how it could be used.

Lastly, `payment_type` is not useful for this predictive model, as the fare should not depend on how the rider pays.

In [None]:
#Count of null for all columns
from pyspark.sql.functions import col, when, count
# One aggregate per column: count() only counts the rows where when(...) yields a non-null value,
# i.e. the rows in which that column is null.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_count_exprs).show()

In [None]:
# Print the column names and types of the joined DataFrame
df.printSchema()

### VISUALIZATION

#### Set Seaborn Parameters


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Apply the white-grid style and the notebook's rc overrides in a single call.
# Assigning a string to `sns.set_style` only replaces the function object without
# styling anything, and `sns.set(...)` without a `style` argument applies seaborn's
# default style, so the style is passed here explicitly.
sns.set(
    style="whitegrid",
    rc={
        "font.style": "normal",
        "axes.facecolor": "white",
        "grid.color": ".8",
        "grid.linestyle": "-",
        "figure.facecolor": "white",
        "figure.titlesize": 20,
        "text.color": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
        "axes.grid": True,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "font.size": 10,
        "ytick.labelsize": 10,
    },
)


In [None]:
# Helper Code to Display Values on Bars
def show_values_barplot(axs, space):
    """Write each bar's width, rounded to two decimals, as a text label beside the bar.

    :param axs: a single axes object, or a NumPy array of axes objects
    :param space: extra horizontal offset (anything ``float()`` accepts) added
        after the bar's far edge
    :return: None
    """

    def _label_bars(axis):
        for bar in axis.patches:
            # Anchor: x at get_x() + get_width() plus the offset, y at get_y() + get_height()
            label_x = bar.get_x() + bar.get_width() + float(space)
            label_y = bar.get_y() + bar.get_height()
            axis.text(label_x, label_y, round(float(bar.get_width()), 2), ha="left")

    if not isinstance(axs, np.ndarray):
        _label_bars(axs)
        return

    for _, axis in np.ndenumerate(axs):
        _label_bars(axis)
        

In [None]:
plt.clf()


# df_viz = df.limit(1000)
df_viz = df.sample(fraction=0.0001)
print(f'Number of sample(s): {df_viz.count():,}')


# Store number of categories for later
num_categories = df_viz.toPandas().shape[0]

# Create plot
barplot = sns.barplot(y="total_amount", x="passenger_count", data=df_viz.toPandas(), saturation=1)

if num_categories < 10:
    sns.set(rc={"figure.figsize": (10.0, 5.0)})

# Set title and x-axis ticks
plt.title("Total Amount by Passenger Count")

# Show graphic
#plt.show(barplot)
plt.show()

# This is how to display the plot with this notebook
%matplot plt

In [None]:
import matplotlib.pyplot as plt

# %matplot inline 

# plt.switch_backend('agg')

df.limit(10).createOrReplaceTempView("passenger")
df.limit(10).toPandas().plot.bar(x='passenger_count',y='total_amount')

plt.show()
%matplot plt

In [None]:
# Columns removed from the data set (mta_tax, payment_type and rate_code_id stay for now)
columns_to_drop = [
    'ride_id',
    'vendor_id',
    'pickup_at',
    'dropoff_at',
    'store_and_fwd_flag',
    # 'mta_tax',
    # 'payment_type', 'rate_code_id'
]
df = df.drop(*columns_to_drop)
df.show(n=10)
df.printSchema()

#### Move the target column to the first position

In [None]:
# Put the target first, followed by every other column in its existing order
df_columns = df.columns
feature_columns = [name for name in df_columns if name != 'total_amount']
df = df.select('total_amount', *feature_columns)
df.show(n=10)
df.printSchema()

#### Examine 'total_amount'

Some rows have a negative `total_amount`.

In [None]:
# Show Spark's default summary statistics for the target column
df.select('total_amount').summary().show()

Drop rows with negative 'total_amount'

In [None]:
# Keep only rows whose total_amount is zero or positive
df = df.filter(df['total_amount'] >= 0)


#### Split data into training and validation sets

Randomly split the data into training and validation sets. This lets you train and tune the model using only the training subset, and then evaluate the model's performance on the held-out validation set to get a sense of how the model will perform on new data.

In [None]:
# Random 70/30 split into training and validation sets; the fixed seed keeps the split deterministic
split_weights = [0.7, 0.3]
df_train, df_validation = df.randomSplit(split_weights, seed=42)
train_rows = df_train.count()
validation_rows = df_validation.count()
print(f"There are {train_rows:,} training examples and {validation_rows:,} validation examples.")

#### Visualize the data


In [None]:
# Preview passenger_count and total_amount for the training split (show() prints 20 rows by default)
df_train.select("passenger_count", "total_amount").show()

In [None]:
# Write out the data
# import boto3
import sagemaker
import time

# Use the SageMaker session's default bucket as the output location
sagemaker_session = sagemaker.Session()
output_bucket = sagemaker_session.default_bucket()
# role = sagemaker.get_execution_role()
# region = boto3.Session().region_name

# The UTC timestamp in the prefix gives each run its own output location
prefix = f'gsml-nyc-taxi-all-years-glue-is/{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
output_key_prefix = f"{prefix}/output"
print(f'Output S3: s3://{output_bucket}/{output_key_prefix}')

train_output_s3_uri = f's3://{output_bucket}/{output_key_prefix}/train'
validation_output_s3_uri = f's3://{output_bucket}/{output_key_prefix}/validation'

# The save mode is set with .mode(); .option("mode", ...) does not set it and would leave
# the writer on its default behaviour of failing when the target path already exists.
df_train.write.mode("overwrite").parquet(train_output_s3_uri)
df_validation.write.mode("overwrite").parquet(validation_output_s3_uri)

print(f'Output train data s3 URI: {train_output_s3_uri}')
print(f'Output validation data s3 URI: {validation_output_s3_uri}')

In [None]:
# Read both written data sets back from S3 and print their row counts as a sanity check
output_df_train, output_df_validataion = (
    spark.read.option("recursiveFileLookup", "true").parquet(written_uri)
    for written_uri in (train_output_s3_uri, validation_output_s3_uri)
)
written_train_rows = output_df_train.count()
written_validation_rows = output_df_validataion.count()
print(f"There are {written_train_rows:,} training examples and {written_validation_rows:,} validation examples.")
