# Assignment 1: NYC Taxi Data
## Machine Learning Model
Predict the **total fare amount** of a trip for the last _3_ months of data (train on the remaining data). The field _fare_amount_ cannot be used as a feature in the model.

The final model will be assessed using the **RMSE** score.

In [1]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName('nyc-taxi-model')
    .getOrCreate()
)

In [3]:
# Notebook-wide configuration: label, feature columns and data location
target_field = "total_amount"

# Categorical features (one-hot encoded by the model functions)
category_columns = ["trip_duration_category"]

# Numeric columns selected from the source data
number_columns = [
    "trip_distance_km",
    "trip_duration_seconds",
    "pickup_hour",
    "month",
    "year",
    "passenger_count",
]

# Reduced numeric feature set for the gradient-boosted tree model
gbt_num_cols = [
    "trip_distance_km",
    "trip_duration_seconds",
]

# Parquet location read by create_model_dataframe
file_loc = "./output"

stages = []

In [4]:
def create_model_dataframe(file_loc, cat_cols, num_cols, tgt_col):
    """Read the parquet data and keep only the columns needed for modelling.

    Args:
        file_loc: Location of the parquet dataset to read.
        cat_cols: List of categorical column names to keep.
        num_cols: List of numeric column names to keep.
        tgt_col: Name of the target column, or a list/tuple of names. When a
            sequence is given, its first entry is the one renamed to "target".

    Returns:
        A DataFrame containing ``cat_cols + num_cols + target column(s)``, with
        the target column renamed to "target".
    """
    # Read data from parquet
    df = spark.read.parquet(file_loc)

    # Normalise the target argument so both call styles share one code path
    if isinstance(tgt_col, (list, tuple)):
        tgt_cols = list(tgt_col)
    else:
        tgt_cols = [tgt_col]

    # Select only required fields from source
    select_cols = list(cat_cols) + list(num_cols) + tgt_cols
    df = df.select(select_cols)

    # Rename the column that was actually passed in, not a module-level global,
    # so the function works for any target column.
    if tgt_cols:
        df = df.withColumnRenamed(tgt_cols[0], "target")

    return df

In [5]:
# https://www.timlrx.com/blog/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator
def get_feature_importance(importance, dataset, features):
    """Tabulate feature-importance scores against the feature metadata.

    Args:
        importance: Object indexable by feature position that yields the score.
        dataset: Object whose ``schema[features].metadata["ml_attr"]["attrs"]``
            maps attribute groups to lists of attribute records; each record
            must carry an ``idx`` entry.
        features: Name of the assembled feature column in ``dataset.schema``.

    Returns:
        A pandas DataFrame of the attribute records with an added ``score``
        column, sorted by score in descending order.
    """
    attr_groups = dataset.schema[features].metadata["ml_attr"]["attrs"]

    # Flatten every attribute group into one list of records
    records = [record for group in attr_groups for record in attr_groups[group]]

    scored = pd.DataFrame(records)
    scored['score'] = scored['idx'].apply(lambda position: importance[position])
    return scored.sort_values('score', ascending=False)

In [6]:
def get_train_dataset(df):
    """Return the training rows: all of 2017 plus January-September 2018."""
    year = F.col("year")
    first_nine_months = F.col("month").isin(list(range(1, 10)))

    all_of_2017 = year == 2017
    # Explicit grouping: the month restriction applies to 2018 only
    early_2018 = (year == 2018) & first_nine_months

    return df.filter(all_of_2017 | early_2018)

In [7]:
def get_test_dataset(df):
    """Return the hold-out rows: October-December 2018."""
    in_2018 = F.col("year") == 2018
    in_last_three_months = F.col("month").isin([10, 11, 12])
    return df.filter(in_2018 & in_last_three_months)

In [8]:
def run_random_forest_model(df, cat_cols, num_cols, trees=20, sub_sample=0.7, feature_imp=True):
    """Fit a random forest regressor and print its RMSE on the test split.

    Args:
        df: Spark DataFrame containing the feature columns, a ``target`` label
            column, and the ``year``/``month`` columns that the train/test
            split helpers filter on.
        cat_cols: List of categorical column names; each is string-indexed and
            one-hot encoded.
        num_cols: List of numeric column names used as features as-is.
        trees: Number of trees in the forest (``numTrees``).
        sub_sample: Fraction of the training data used per tree
            (``subsamplingRate``).
        feature_imp: When True, display the ten highest-scoring features.

    Returns:
        None. The RMSE is printed and the feature importances are displayed.
    """
    stages = []

    # For category columns implement One Hot Encoding for each field
    for name in cat_cols:
        stages += [
            StringIndexer(inputCol=name, outputCol=f"{name}_ind"),
            OneHotEncoderEstimator(inputCols=[f"{name}_ind"], outputCols=[f"{name}_ohe"]),
        ]

    # Names of the one-hot encoded outputs produced above
    cat_cols_ohe = [f"{name}_ohe" for name in cat_cols]

    # Assemble all categorical and number columns into one feature vector
    stages.append(VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol='features'))

    # NOTE(review): the preprocessing pipeline is fit on the full frame, so the
    # category indexing also sees test-period rows -- confirm this is intended.
    pipeline_model = Pipeline(stages=stages).fit(df)

    # Split into train/test and apply the fitted preprocessing to each
    train_data = pipeline_model.transform(get_train_dataset(df))
    test_data = pipeline_model.transform(get_test_dataset(df))

    # Train a Random Forest model to predict the target field
    rf = RandomForestRegressor(featuresCol='features', labelCol='target', seed=77,
                               numTrees=trees, subsamplingRate=sub_sample)
    rf_model = rf.fit(train_data)

    # Test the model using 3 months of trip data
    rf_test = rf_model.transform(test_data)

    # Evaluate predictions using the RMSE
    evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(rf_test)

    print("The RMSE on the test data is {}".format(rmse))

    # If required get feature importance
    if feature_imp:
        df_imp = get_feature_importance(rf_model.featureImportances, train_data, "features").head(10)
        display(df_imp)

In [284]:
# Build the modelling frame from the parquet data
df_model = create_model_dataframe(
    file_loc=file_loc,
    cat_cols=category_columns,
    num_cols=number_columns,
    tgt_col=target_field,
)

# Fit and evaluate the random forest, showing feature importances
run_random_forest_model(
    df=df_model,
    cat_cols=category_columns,
    num_cols=number_columns,
    feature_imp=True,
)

The RMSE on the test data is 5.57140677904781


Unnamed: 0,idx,name,score
0,4,trip_distance_km,0.512466
1,5,trip_duration_seconds,0.363379
9,3,trip_duration_category_ohe_20-30 mins,0.037776
6,0,trip_duration_category_ohe_10-20 mins,0.032827
8,2,trip_duration_category_ohe_Under 5 mins,0.028285
7,1,trip_duration_category_ohe_5-10 mins,0.023351
2,6,pickup_hour,0.001379
4,8,year,0.000314
5,9,passenger_count,0.000171
3,7,month,5.1e-05


In [9]:
def run_gb_tree_model(df, cat_cols, num_cols, max_depth=5, max_iterations=20):
    """Fit a gradient-boosted tree regressor and print its RMSE on the test split.

    Args:
        df: Spark DataFrame containing the feature columns, a ``target`` label
            column, and the ``year``/``month`` columns that the train/test
            split helpers filter on.
        cat_cols: List of categorical column names; each is string-indexed and
            one-hot encoded.
        num_cols: List of numeric column names used as features as-is.
        max_depth: Maximum depth of each tree (``maxDepth``).
        max_iterations: Number of boosting iterations (``maxIter``).

    Returns:
        None. The RMSE is printed.
    """
    stages = []

    # For category columns implement One Hot Encoding for each field
    for name in cat_cols:
        stages += [
            StringIndexer(inputCol=name, outputCol=f"{name}_ind"),
            OneHotEncoderEstimator(inputCols=[f"{name}_ind"], outputCols=[f"{name}_ohe"]),
        ]

    # Names of the one-hot encoded outputs produced above
    cat_cols_ohe = [f"{name}_ohe" for name in cat_cols]

    # Assemble all categorical and number columns into one feature vector
    stages.append(VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol='features'))

    # NOTE(review): the preprocessing pipeline is fit on the full frame, so the
    # category indexing also sees test-period rows -- confirm this is intended.
    pipeline_model = Pipeline(stages=stages).fit(df)

    # Split into train/test and apply the fitted preprocessing to each
    train_data = pipeline_model.transform(get_train_dataset(df))
    test_data = pipeline_model.transform(get_test_dataset(df))

    # Train a gradient-boosted tree model to predict the target field
    gbt = GBTRegressor(featuresCol='features', labelCol='target', seed=77,
                       maxDepth=max_depth, maxIter=max_iterations)

    # Only the fitted model can transform data; the estimator itself has no
    # transform method, so predictions must come from gbt_model.
    gbt_model = gbt.fit(train_data)

    # Test the model using 3 months of trip data
    gbt_test = gbt_model.transform(test_data)

    # Evaluate predictions using the RMSE
    evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(gbt_test)

    print("The RMSE on the test data is {}".format(rmse))

In [None]:
# Read data from parquet. Select the full numeric column list so that the
# year/month columns used by the train/test split are present in the frame;
# the model itself is restricted to gbt_num_cols below.
df_model = create_model_dataframe(file_loc, category_columns, number_columns, target_field)

# Run gradient-boosted tree model on the reduced numeric feature set
run_gb_tree_model(df_model, category_columns, gbt_num_cols)