In [1]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create a local spark session
spark = SparkSession.builder \
  .appName('nyc-taxi-model') \
  .getOrCreate()

In [34]:
# Specify data lists/terms

cols_list = ['RatecodeID','passenger_count','trip_distance','payment_type','extra',
            'mta_tax','improvement_surcharge','tip_amount','tolls_amount','taxi_type',
             'trip_duration_seconds','trip_distance_km',
            'total_amount']
cat_cols = [
    'RatecodeID','payment_type','taxi_type',
]
num_cols = ['passenger_count','trip_distance','extra',
            'mta_tax','improvement_surcharge','tip_amount','tolls_amount',
             'trip_duration_seconds','trip_distance_km']
target_col = ['total_amount']
file_loc = "./output"
stages = []

In [3]:
# Read parquet file and select cols
df = spark.read.load("./output")

In [29]:
# Read parquet file and select cols
def load_data(file_loc,cols_list):
    # Read parquet file
    df = spark.read.load(file_loc)
    
    # select cols
    df = df.select(cols_list)
    #rename target variabel to label
    df = df.withColumnRenamed("total_amount","label")
    
    return df
    

In [53]:
def get_train_dataset(df):
    df = df.filter((F.col("year") == 2019) | 
                   (F.col("year") == 2020) & (F.col("month").isin([1,2,3,4,5,6,7,8,9])))
    return df

In [54]:
def get_test_dataset(df):
    df = df.filter((F.col("year") == 2020) & (F.col("month").isin([10,11,12])))
    return df

In [55]:
# Data preprocessing
def data_preprocessing(df,target_col,cat_cols,num_cols):
    stages = []
           
    # For category columns implement One Hot Encoding for each field
    for col in cat_cols:
        column_indexer = StringIndexer(inputCol=col, outputCol=f"{col}_ind")
        column_encoder = OneHotEncoderEstimator(inputCols=[f"{col}_ind"], outputCols=[f"{col}_ohe"])
        stages += [column_indexer, column_encoder]
    
    # Create a list of category fields that have been OHE
    cat_cols_ohe = [f"{col}_ohe" for col in cat_cols]
    
    # Instantiate a VectorAssembler of all categorical and number columns
    assembler = VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol='features')
    
    # Add to stages list
    stages += [assembler]
    
    # Instantiate a pipeline with stages
    pipeline = Pipeline(stages=stages)
    
    # Fit the pipeline with model data frame
    pipeline_model = pipeline.fit(df)
    
    # Transform the data
    df = pipeline_model.transform(df)
    
    # Fit the pipeline with model data frame
    pipeline_model = pipeline.fit(df)
    
    # Get test/train data
    train_data = get_train_dataset(df)
    test_data = get_test_dataset(df)
    
    # Apply the pipeline to the dataframe
    test_data = pipeline_model.transform(test_data)
    train_data = pipeline_model.transform(train_data)
    
    return test_data, train_data

In [56]:
# https://www.timlrx.com/blog/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator
def get_feature_importance(importance, dataset, features):
    list_extract = []
    
    for i in dataset.schema[features].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[features].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: importance[x])
    return(varlist.sort_values('score', ascending = False))

In [57]:
def train_rf_model(train_data, test_data, trees=13, sub_sample=0.5,feature_imp=True):
    # instantiate rf model
    rf = RandomForestRegressor(featuresCol='features', labelCol='label', seed=77,
                               numTrees=trees, subsamplingRate=sub_sample)
    # train rf on training data
    rf_model = rf.fit(train_data)
    
    # predict on the training set
    train_preds = rf_model.transform(train_data)
    
    # predict on testing set
    test_preds = rf_model.transform(test_data)
    
    # Instantiate regression evaluater
    evaluator_reg = RegressionEvaluator()
    
    # specify evaluations
    rmse_metric = {evaluator_reg.metricName: "rmse"}
    mse_metric = {evaluator_reg.metricName: "mse"}
    mae_metric = {evaluator_reg.metricName: "mae"}
    
    # Evaluate predictions using the RMSE
    #evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="rmse")
    #rmse = evaluator.evaluate(rf_test)
    
    print(f"[RMSE] train:{evaluator_reg.evaluate(train_preds, rmse_metric)} - test: {evaluator_reg.evaluate(test_preds, rmse_metric)}")
    print(f"[MSE] train:{evaluator_reg.evaluate(train_preds, mse_metric)} - test: {evaluator_reg.evaluate(test_preds, mse_metric)}")
    print(f"[MAE] train:{evaluator_reg.evaluate(train_preds, mae_metric)} - test: {evaluator_reg.evaluate(test_preds, mae_metric)}")
    
    # Optional generate feature importance
    if feature_imp:
        df_imp = get_feature_importance(rf_model.featureImportances, train_data, "features").head(10)
        display(df_imp)

In [51]:
# Load data
df = load_data(file_loc,cols_list)


In [52]:
# preprocess data
train_data, test_data = data_preprocessing(df,target_col,cat_cols,num_cols)

IllegalArgumentException: 'requirement failed: Output column RatecodeID_ind already exists.'

In [41]:
# train test split data
train_data, test_data = train_test_split(df,0.8)

In [42]:
train_data.show(20)

+----------+---------------+-------------+------------+-----+-------+---------------------+----------+------------+---------+---------------------+----------------+-----+--------------+--------------+----------------+----------------+-------------+-------------+--------------------+
|RatecodeID|passenger_count|trip_distance|payment_type|extra|mta_tax|improvement_surcharge|tip_amount|tolls_amount|taxi_type|trip_duration_seconds|trip_distance_km|label|RatecodeID_ind|RatecodeID_ohe|payment_type_ind|payment_type_ohe|taxi_type_ind|taxi_type_ohe|            features|
+----------+---------------+-------------+------------+-----+-------+---------------------+----------+------------+---------+---------------------+----------------+-----+--------------+--------------+----------------+----------------+-------------+-------------+--------------------+
|         1|              0|          0.0|           1|  0.0|    0.0|                  0.0|       0.0|         0.0|   yellow|                    2| 

In [46]:
# train random forest model
train_rf_model(train_data, test_data, trees=13, sub_sample=0.5,feature_imp=True)

Py4JJavaError: An error occurred while calling o1052.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 18 in stage 28.0 failed 1 times, most recent failure: Lost task 18.0 in stage 28.0 (TID 454, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1817)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1213)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1213)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:251)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1213)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:106)
	at org.apache.spark.ml.regression.RandomForestRegressor$$anonfun$train$1.apply(RandomForestRegressor.scala:133)
	at org.apache.spark.ml.regression.RandomForestRegressor$$anonfun$train$1.apply(RandomForestRegressor.scala:119)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:119)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:46)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1817)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1213)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1213)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:251)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 20 more
