### Creating Model 1

* This notebook builds the first model for the NFL play data. The goal is to predict the play type on 4th down.
* This notebook fits a Random Forest classifier; gradient boosted trees are among the other models.

In [2]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import RandomForestClassifier
import nfl_data_py as nfl
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC
import pandas as pd
from imblearn.combine import SMOTEENN
from DataPipelineFxn import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

In [3]:
# Build the Spark session and the data frames in a single call; the result is
# unpacked in the next cell.
# NOTE(review): GetSparkDF presumably comes from the DataPipelineFxn star import - confirm.
data = GetSparkDF(include_undersample=True)

2000 done.
2001 done.
2002 done.
2003 done.
2004 done.
2005 done.
2006 done.
2007 done.
2008 done.
2009 done.
2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.


/opt/conda/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/04 01:05:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/04 01:05:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/04 01:05:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/04 01:05:51 WARN TaskSetManager: Stage 0 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:05:53 WARN TaskSetManager: Stage 3 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:05:59 WARN TaskSetManager: Stage 4 contains a task of very large size (2197 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:06:03 WARN TaskSetManager: Stage 5 contains a task of very large size (1216 KiB). The maximum recommended task size is 1000 KiB.


In [4]:
# Split the GetSparkDF result into the session and the three data frames.
(spark, df, test_df, undersample_df) = data

### Create numeric columns list for the pipeline

In [5]:
# Every non-string column of df is treated as a numeric feature, except
# "season" and "week": those are handled as categorical columns by the
# StringIndexer stage of the pipeline instead.
#
# typeName() returns "string" for StringType; comparing against it avoids
# depending on the exact text of str(dataType), which is an implementation
# detail of the installed PySpark version.
#
# NOTE(review): the resulting list includes "epa". If that is the expected
# points added by the play itself, it is only known after the play and would
# leak the outcome into the features - confirm before trusting the scores.
numeric_cols = [
    field.name
    for field in df.schema.fields
    if field.dataType.typeName() != "string"
    and field.name not in ("season", "week")
]

In [6]:
# Show which columns were selected as numeric features.
print(numeric_cols)

['yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'down', 'drive', 'qtr', 'ydstogo', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'posteam_score', 'defteam_score', 'score_differential', 'ep', 'epa', 'wind', 'temp']


### Create the Model Pipeline

In [7]:

# Columns that go through the StringIndexer.  "week" and "season" are indexed
# here rather than being used as numeric features, and "play_type" is the
# label the classifier predicts.
str_col = ["home_team", "away_team", "season_type", "posteam", "posteam_type", "defteam", "week",
           "side_of_field", "game_half", "play_type", "season", "roof", "surface"]

# StringIndexer and OneHotEncoder pair inputCols[i] with outputCols[i], so the
# output names are derived from the input names instead of being typed out a
# second time.  The hand-written lists had "week", "side_of_field" and
# "game_half" in different positions on the input and output side, which
# stored each of those columns under another column's name.
str_col_output = [name + "_idx" for name in str_col]

# One-hot encode every indexed column except the label.
ohe_col_input = [idx_name for idx_name in str_col_output if idx_name != "play_type_idx"]
ohe_col_vec = [idx_name[:-len("_idx")] + "_vec" for idx_name in ohe_col_input]

# Combine numeric and one-hot encoded columns
va_input_cols = numeric_cols + ohe_col_vec

# Index the categorical (and label) columns first
stringIndexer = StringIndexer(inputCols=str_col, outputCols=str_col_output)

# One-hot encode the indexed categorical features
ohe = OneHotEncoder(inputCols=ohe_col_input, outputCols=ohe_col_vec)

# Assemble the features column; rows with invalid values are skipped
va = VectorAssembler(inputCols=va_input_cols, outputCol="features", handleInvalid="skip")

# Scale with MaxAbsScaler; not necessarily important for trees, kept for consistency
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Random Forest classifier on the scaled features, seeded for repeatability
rf = RandomForestClassifier(labelCol='play_type_idx',
                            featuresCol='scaledFeatures',
                            seed=42)

# Build the pipeline (fitting happens later through the train/validation split)
pipeline = Pipeline(stages=[stringIndexer, ohe, va, scaler, rf])

In [8]:

# Hyperparameter grid for the random forest: 3 x 3 x 2 = 18 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 50, 100])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .addGrid(rf.impurity, ["gini", "entropy"])
    .build()
)

# Score each candidate by accuracy of "prediction" against the indexed label.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="play_type_idx",
    metricName="accuracy"
)

# Single train/validation split over the whole pipeline.
tvs = TrainValidationSplit(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8,
    # Fix the random split so the selected hyperparameters are reproducible
    # across runs (same seed value as the classifier).
    seed=42
)

In [None]:
# Inspect the distinct values present in the roof column.
df.select("roof").distinct().show()

In [9]:
# The surface column contains empty strings; relabel them as 'Undefined'
# before the data reaches the pipeline.
# NOTE(review): only df is patched here - test_df and undersample_df presumably
# need the same replacement before they are transformed; confirm.
surface_col = F.col('surface')
df = df.withColumn(
    'surface',
    F.when(surface_col == '', 'Undefined').otherwise(surface_col)
)

# Run the hyperparameter search and keep the fitted train/validation result.
rf_model = tvs.fit(df)

24/12/04 01:06:52 WARN TaskSetManager: Stage 6 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:06:54 WARN TaskSetManager: Stage 9 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:06:56 WARN TaskSetManager: Stage 12 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:06:56 WARN TaskSetManager: Stage 13 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:06:57 WARN TaskSetManager: Stage 14 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:06:59 WARN TaskSetManager: Stage 16 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:01 WARN TaskSetManager: Stage 18 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:02 WARN TaskSetManager: Stage 20 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:02 WARN TaskSetManager: Stage 22 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:03 WARN TaskSetManager: Stage 24 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:04 WARN TaskSetManager: Stage 26 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:05 WARN TaskSetManager: Stage 28 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:05 WARN TaskSetManager: Stage 31 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:06 WARN TaskSetManager: Stage 34 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:06 WARN TaskSetManager: Stage 35 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:07 WARN TaskSetManager: Stage 36 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:08 WARN TaskSetManager: Stage 38 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:09 WARN TaskSetManager: Stage 40 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:09 WARN TaskSetManager: Stage 42 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:09 WARN TaskSetManager: Stage 44 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:10 WARN TaskSetManager: Stage 46 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:10 WARN TaskSetManager: Stage 48 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:11 WARN TaskSetManager: Stage 50 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:11 WARN TaskSetManager: Stage 53 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/

                                                                                

24/12/04 01:07:13 WARN TaskSetManager: Stage 60 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 62 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 64 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 66 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 68 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 70 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:14 WARN TaskSetManager: Stage 72 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/

                                                                                

24/12/04 01:07:19 WARN TaskSetManager: Stage 92 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:07:20 WARN TaskSetManager: Stage 94 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:20 WARN TaskSetManager: Stage 96 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:20 WARN TaskSetManager: Stage 98 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:20 WARN TaskSetManager: Stage 100 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:20 WARN TaskSetManager: Stage 102 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:20 WARN TaskSetManager: Stage 104 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:21 WARN TaskSetManager: Stage 106 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24

                                                                                

24/12/04 01:07:40 WARN TaskSetManager: Stage 210 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:40 WARN TaskSetManager: Stage 212 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:41 WARN TaskSetManager: Stage 214 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:41 WARN TaskSetManager: Stage 216 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:41 WARN TaskSetManager: Stage 218 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:42 WARN TaskSetManager: Stage 220 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:42 WARN TaskSetManager: Stage 223 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

24/12/04 01:07:44 WARN TaskSetManager: Stage 232 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:45 WARN TaskSetManager: Stage 234 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:45 WARN TaskSetManager: Stage 236 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:45 WARN TaskSetManager: Stage 238 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:46 WARN TaskSetManager: Stage 240 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:46 WARN TaskSetManager: Stage 242 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:46 WARN TaskSetManager: Stage 245 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

24/12/04 01:07:48 WARN TaskSetManager: Stage 254 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:48 WARN TaskSetManager: Stage 256 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:49 WARN TaskSetManager: Stage 258 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:49 WARN TaskSetManager: Stage 260 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:49 WARN TaskSetManager: Stage 262 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:50 WARN DAGScheduler: Broadcasting large task binary with size 1323.7 KiB
24/12/04 01:07:50 WARN TaskSetManager: Stage 264 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:50 WARN DAGScheduler: Broadcasting lar

                                                                                

24/12/04 01:07:53 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
24/12/04 01:07:53 WARN TaskSetManager: Stage 272 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:54 WARN TaskSetManager: Stage 274 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:54 WARN TaskSetManager: Stage 277 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:54 WARN TaskSetManager: Stage 280 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:55 WARN TaskSetManager: Stage 281 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:55 WARN TaskSetManager: Stage 282 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:07:55 WARN TaskSetManager: Stage 284 contain

                                                                                

24/12/04 01:08:00 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
24/12/04 01:08:00 WARN TaskSetManager: Stage 302 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:02 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
24/12/04 01:08:02 WARN TaskSetManager: Stage 304 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:02 WARN TaskSetManager: Stage 306 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:02 WARN TaskSetManager: Stage 309 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:03 WARN TaskSetManager: Stage 312 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:03 WARN TaskSetManager: Stage 313 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:03 WARN TaskSetManager: Stage 314 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:04 WARN TaskSetManager: Stage 316 contain

                                                                                

24/12/04 01:08:05 WARN TaskSetManager: Stage 318 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:05 WARN TaskSetManager: Stage 320 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:05 WARN TaskSetManager: Stage 322 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:05 WARN TaskSetManager: Stage 324 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:06 WARN TaskSetManager: Stage 326 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:06 WARN DAGScheduler: Broadcasting large task binary with size 1323.7 KiB
24/12/04 01:08:06 WARN TaskSetManager: Stage 328 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:07 WARN DAGScheduler: Broadcasting lar

                                                                                

24/12/04 01:08:08 WARN DAGScheduler: Broadcasting large task binary with size 4.9 MiB
24/12/04 01:08:08 WARN TaskSetManager: Stage 334 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:10 WARN DAGScheduler: Broadcasting large task binary with size 7.1 MiB
24/12/04 01:08:10 WARN TaskSetManager: Stage 336 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 336:>                                                        (0 + 3) / 3]

24/12/04 01:08:11 WARN DAGScheduler: Broadcasting large task binary with size 1037.3 KiB


                                                                                

24/12/04 01:08:11 WARN DAGScheduler: Broadcasting large task binary with size 9.9 MiB
24/12/04 01:08:11 WARN TaskSetManager: Stage 338 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:08:12 WARN DAGScheduler: Broadcasting large task binary with size 1271.3 KiB


                                                                                

24/12/04 01:08:13 WARN DAGScheduler: Broadcasting large task binary with size 13.3 MiB
24/12/04 01:08:13 WARN TaskSetManager: Stage 340 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 340:>                                                        (0 + 3) / 3]

24/12/04 01:08:15 WARN DAGScheduler: Broadcasting large task binary with size 1494.8 KiB


                                                                                

24/12/04 01:08:16 WARN DAGScheduler: Broadcasting large task binary with size 17.2 MiB
24/12/04 01:08:16 WARN TaskSetManager: Stage 342 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 342:>                                                        (0 + 3) / 3]

24/12/04 01:08:17 WARN DAGScheduler: Broadcasting large task binary with size 1696.7 KiB


                                                                                

24/12/04 01:08:19 WARN DAGScheduler: Broadcasting large task binary with size 21.6 MiB
24/12/04 01:08:19 WARN TaskSetManager: Stage 344 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 344:>                                                        (0 + 3) / 3]

24/12/04 01:08:20 WARN DAGScheduler: Broadcasting large task binary with size 1853.1 KiB


                                                                                

24/12/04 01:08:22 WARN DAGScheduler: Broadcasting large task binary with size 13.8 MiB
24/12/04 01:08:22 WARN TaskSetManager: Stage 346 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:23 WARN TaskSetManager: Stage 348 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:23 WARN TaskSetManager: Stage 351 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:24 WARN TaskSetManager: Stage 354 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:24 WARN TaskSetManager: Stage 355 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:24 WARN TaskSetManager: Stage 356 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:25 WARN TaskSetManager: Stage 358 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:26 WARN TaskSetManager: Stage 360 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:26 WARN TaskSetManager: Stage 362 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:26 WARN TaskSetManager: Stage 364 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:26 WARN TaskSetManager: Stage 366 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:27 WARN TaskSetManager: Stage 368 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:27 WARN DAGScheduler: Broadcasting large task binary with size 1354.4 KiB
24/12/04 01:08:27 WARN TaskSetManager: Stage 370 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:28 WARN DAGScheduler: Broadcasting lar

                                                                                

24/12/04 01:08:30 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
24/12/04 01:08:30 WARN TaskSetManager: Stage 376 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:31 WARN DAGScheduler: Broadcasting large task binary with size 7.7 MiB
24/12/04 01:08:31 WARN TaskSetManager: Stage 378 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 378:>                                                        (0 + 3) / 3]

24/12/04 01:08:32 WARN DAGScheduler: Broadcasting large task binary with size 1141.7 KiB


                                                                                

24/12/04 01:08:33 WARN DAGScheduler: Broadcasting large task binary with size 10.8 MiB
24/12/04 01:08:33 WARN TaskSetManager: Stage 380 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:08:34 WARN DAGScheduler: Broadcasting large task binary with size 1425.9 KiB


                                                                                

24/12/04 01:08:35 WARN DAGScheduler: Broadcasting large task binary with size 14.6 MiB
24/12/04 01:08:35 WARN TaskSetManager: Stage 382 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 382:>                                                        (0 + 3) / 3]

24/12/04 01:08:36 WARN DAGScheduler: Broadcasting large task binary with size 1700.2 KiB


                                                                                

24/12/04 01:08:38 WARN DAGScheduler: Broadcasting large task binary with size 19.1 MiB
24/12/04 01:08:38 WARN TaskSetManager: Stage 384 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:08:40 WARN DAGScheduler: Broadcasting large task binary with size 1937.7 KiB


                                                                                

24/12/04 01:08:42 WARN DAGScheduler: Broadcasting large task binary with size 24.0 MiB
24/12/04 01:08:42 WARN TaskSetManager: Stage 386 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:08:43 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

24/12/04 01:08:45 WARN DAGScheduler: Broadcasting large task binary with size 15.3 MiB
24/12/04 01:08:45 WARN TaskSetManager: Stage 388 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:46 WARN TaskSetManager: Stage 390 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:46 WARN TaskSetManager: Stage 393 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:47 WARN TaskSetManager: Stage 396 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:47 WARN TaskSetManager: Stage 397 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:47 WARN TaskSetManager: Stage 398 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:48 WARN TaskSetManager: Stage 400 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:08:49 WARN TaskSetManager: Stage 402 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:49 WARN TaskSetManager: Stage 404 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:49 WARN TaskSetManager: Stage 406 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:50 WARN TaskSetManager: Stage 408 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:51 WARN TaskSetManager: Stage 410 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:51 WARN TaskSetManager: Stage 412 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:51 WARN TaskSetManager: Stage 415 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

24/12/04 01:08:54 WARN TaskSetManager: Stage 424 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:54 WARN TaskSetManager: Stage 426 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:54 WARN TaskSetManager: Stage 428 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:55 WARN TaskSetManager: Stage 430 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:56 WARN TaskSetManager: Stage 432 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:56 WARN TaskSetManager: Stage 434 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:56 WARN TaskSetManager: Stage 437 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

24/12/04 01:08:59 WARN TaskSetManager: Stage 446 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:08:59 WARN TaskSetManager: Stage 448 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:00 WARN TaskSetManager: Stage 450 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:00 WARN TaskSetManager: Stage 452 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:01 WARN DAGScheduler: Broadcasting large task binary with size 1424.2 KiB
24/12/04 01:09:01 WARN TaskSetManager: Stage 454 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:02 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/04 01:09:02 WARN TaskSetManager: Stage 456 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:03 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/12/04 01:09:03 WARN TaskSetManager: Stage 458 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:05 WARN DAGScheduler: Broadcasting large task binary with size 6.3 MiB
24/12/04 01:09:05 WARN TaskSetManager: Stage 460 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 460:>                                                        (0 + 3) / 3]

24/12/04 01:09:06 WARN DAGScheduler: Broadcasting large task binary with size 1186.0 KiB


                                                                                

24/12/04 01:09:07 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
24/12/04 01:09:07 WARN TaskSetManager: Stage 462 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 462:>                                                        (0 + 3) / 3]

24/12/04 01:09:09 WARN DAGScheduler: Broadcasting large task binary with size 1619.2 KiB


                                                                                

24/12/04 01:09:10 WARN DAGScheduler: Broadcasting large task binary with size 6.9 MiB
24/12/04 01:09:10 WARN TaskSetManager: Stage 464 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:11 WARN TaskSetManager: Stage 466 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:11 WARN TaskSetManager: Stage 469 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:11 WARN TaskSetManager: Stage 472 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:11 WARN TaskSetManager: Stage 473 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:12 WARN TaskSetManager: Stage 474 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:12 WARN TaskSetManager: Stage 476 contain

                                                                                

24/12/04 01:09:13 WARN TaskSetManager: Stage 478 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:14 WARN TaskSetManager: Stage 480 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:14 WARN TaskSetManager: Stage 482 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:14 WARN TaskSetManager: Stage 484 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:15 WARN DAGScheduler: Broadcasting large task binary with size 1437.3 KiB
24/12/04 01:09:15 WARN TaskSetManager: Stage 486 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:16 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/04 01:09:16 WARN TaskSetManager: Stage 488 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:17 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/04 01:09:17 WARN TaskSetManager: Stage 490 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
24/12/04 01:09:19 WARN TaskSetManager: Stage 492 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:21 WARN DAGScheduler: Broadcasting large task binary with size 1288.1 KiB


                                                                                

24/12/04 01:09:22 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/12/04 01:09:22 WARN TaskSetManager: Stage 494 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:25 WARN DAGScheduler: Broadcasting large task binary with size 1781.0 KiB


                                                                                

24/12/04 01:09:26 WARN DAGScheduler: Broadcasting large task binary with size 7.1 MiB
24/12/04 01:09:26 WARN TaskSetManager: Stage 496 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:27 WARN TaskSetManager: Stage 498 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:27 WARN TaskSetManager: Stage 501 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:27 WARN TaskSetManager: Stage 504 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:27 WARN TaskSetManager: Stage 505 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:28 WARN TaskSetManager: Stage 506 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:28 WARN TaskSetManager: Stage 508 contain

                                                                                

24/12/04 01:09:29 WARN TaskSetManager: Stage 510 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:30 WARN TaskSetManager: Stage 512 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:30 WARN TaskSetManager: Stage 514 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:09:30 WARN TaskSetManager: Stage 516 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:31 WARN DAGScheduler: Broadcasting large task binary with size 1424.2 KiB
24/12/04 01:09:31 WARN TaskSetManager: Stage 518 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:32 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/04 01:09:32 WARN TaskSetManager: Stage 520 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:33 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/12/04 01:09:33 WARN TaskSetManager: Stage 522 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:09:35 WARN DAGScheduler: Broadcasting large task binary with size 6.3 MiB
24/12/04 01:09:35 WARN TaskSetManager: Stage 524 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:37 WARN DAGScheduler: Broadcasting large task binary with size 1186.0 KiB


                                                                                

24/12/04 01:09:37 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
24/12/04 01:09:37 WARN TaskSetManager: Stage 526 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:39 WARN DAGScheduler: Broadcasting large task binary with size 1619.2 KiB


                                                                                

24/12/04 01:09:40 WARN DAGScheduler: Broadcasting large task binary with size 14.2 MiB
24/12/04 01:09:40 WARN TaskSetManager: Stage 528 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:43 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


                                                                                

24/12/04 01:09:44 WARN DAGScheduler: Broadcasting large task binary with size 19.9 MiB
24/12/04 01:09:44 WARN TaskSetManager: Stage 530 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:09:47 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB


                                                                                

24/12/04 01:09:49 WARN DAGScheduler: Broadcasting large task binary with size 26.7 MiB
24/12/04 01:09:49 WARN TaskSetManager: Stage 532 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 532:>                                                        (0 + 3) / 3]

24/12/04 01:09:52 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB


[Stage 534:>                                                        (0 + 0) / 3]

24/12/04 01:09:55 WARN DAGScheduler: Broadcasting large task binary with size 34.6 MiB
24/12/04 01:09:55 WARN TaskSetManager: Stage 534 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 534:>                                                        (0 + 3) / 3]

24/12/04 01:09:59 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB


[Stage 536:>                                                        (0 + 0) / 3]

24/12/04 01:10:02 WARN DAGScheduler: Broadcasting large task binary with size 43.5 MiB
24/12/04 01:10:02 WARN TaskSetManager: Stage 536 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:06 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB


                                                                                

24/12/04 01:10:10 WARN DAGScheduler: Broadcasting large task binary with size 27.8 MiB
24/12/04 01:10:10 WARN TaskSetManager: Stage 538 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:12 WARN TaskSetManager: Stage 540 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:12 WARN TaskSetManager: Stage 543 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:13 WARN TaskSetManager: Stage 546 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:13 WARN TaskSetManager: Stage 547 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:13 WARN TaskSetManager: Stage 548 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:14 WARN TaskSetManager: Stage 550 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:15 WARN TaskSetManager: Stage 552 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:15 WARN TaskSetManager: Stage 554 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:15 WARN TaskSetManager: Stage 556 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:16 WARN TaskSetManager: Stage 558 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:10:17 WARN DAGScheduler: Broadcasting large task binary with size 1437.3 KiB
24/12/04 01:10:17 WARN TaskSetManager: Stage 560 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:18 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/04 01:10:18 WARN TaskSetManager: Stage 562 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:19 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/04 01:10:19 WARN TaskSetManager: Stage 564 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:10:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
24/12/04 01:10:21 WARN TaskSetManager: Stage 566 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 566:>                                                        (0 + 3) / 3]

24/12/04 01:10:22 WARN DAGScheduler: Broadcasting large task binary with size 1288.1 KiB


                                                                                

24/12/04 01:10:23 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB
24/12/04 01:10:23 WARN TaskSetManager: Stage 568 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:26 WARN DAGScheduler: Broadcasting large task binary with size 1781.0 KiB


                                                                                

24/12/04 01:10:27 WARN DAGScheduler: Broadcasting large task binary with size 15.3 MiB
24/12/04 01:10:27 WARN TaskSetManager: Stage 570 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:29 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

24/12/04 01:10:31 WARN DAGScheduler: Broadcasting large task binary with size 21.6 MiB
24/12/04 01:10:31 WARN TaskSetManager: Stage 572 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:34 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

24/12/04 01:10:37 WARN DAGScheduler: Broadcasting large task binary with size 29.2 MiB
24/12/04 01:10:37 WARN TaskSetManager: Stage 574 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:40 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB


[Stage 576:>                                                        (0 + 0) / 3]

24/12/04 01:10:43 WARN DAGScheduler: Broadcasting large task binary with size 38.1 MiB
24/12/04 01:10:43 WARN TaskSetManager: Stage 576 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:47 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB


[Stage 578:>                                                        (0 + 0) / 3]

24/12/04 01:10:51 WARN DAGScheduler: Broadcasting large task binary with size 48.0 MiB
24/12/04 01:10:51 WARN TaskSetManager: Stage 578 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:10:56 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


                                                                                

24/12/04 01:10:59 WARN DAGScheduler: Broadcasting large task binary with size 30.2 MiB
24/12/04 01:10:59 WARN TaskSetManager: Stage 580 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:01 WARN TaskSetManager: Stage 582 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:02 WARN TaskSetManager: Stage 585 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:03 WARN TaskSetManager: Stage 588 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:03 WARN TaskSetManager: Stage 589 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:04 WARN TaskSetManager: Stage 590 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:06 WARN TaskSetManager: Stage 592 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:07 WARN TaskSetManager: Stage 594 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:08 WARN TaskSetManager: Stage 596 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:08 WARN TaskSetManager: Stage 598 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:09 WARN TaskSetManager: Stage 600 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:09 WARN TaskSetManager: Stage 602 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:10 WARN DAGScheduler: Broadcasting large task binary with size 1334.8 KiB
24/12/04 01:11:10 WARN TaskSetManager: Stage 604 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:10 WARN DAGScheduler: Broadcasting lar

                                                                                

24/12/04 01:11:12 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/12/04 01:11:12 WARN TaskSetManager: Stage 608 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:13 WARN DAGScheduler: Broadcasting large task binary with size 5.1 MiB
24/12/04 01:11:13 WARN TaskSetManager: Stage 610 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

24/12/04 01:11:15 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/04 01:11:15 WARN TaskSetManager: Stage 612 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:17 WARN DAGScheduler: Broadcasting large task binary with size 1147.7 KiB


                                                                                

24/12/04 01:11:18 WARN DAGScheduler: Broadcasting large task binary with size 10.7 MiB
24/12/04 01:11:18 WARN TaskSetManager: Stage 614 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:20 WARN DAGScheduler: Broadcasting large task binary with size 1435.4 KiB


                                                                                

24/12/04 01:11:21 WARN DAGScheduler: Broadcasting large task binary with size 14.5 MiB
24/12/04 01:11:21 WARN TaskSetManager: Stage 616 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:23 WARN DAGScheduler: Broadcasting large task binary with size 1721.0 KiB


                                                                                

24/12/04 01:11:25 WARN DAGScheduler: Broadcasting large task binary with size 19.0 MiB
24/12/04 01:11:25 WARN TaskSetManager: Stage 618 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:27 WARN DAGScheduler: Broadcasting large task binary with size 1980.3 KiB


                                                                                

24/12/04 01:11:30 WARN DAGScheduler: Broadcasting large task binary with size 24.2 MiB
24/12/04 01:11:30 WARN TaskSetManager: Stage 620 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:33 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

### Evaluate the Model

In [10]:
# Pull the winning pipeline out of the fitted tuning result (rf_model comes from an earlier cell).
best_model = rf_model.bestModel

# The final pipeline stage carries the estimator's parameters; grab all of them as a map.
best_params = best_model.stages[-1].extractParamMap()

# Report every parameter as "name: value"
print("Tuned Parameters of the Best Model:")
for hyperparam in best_params:
    print(f"{hyperparam.name}: {best_params[hyperparam]}")

Tuned Parameters of the Best Model:
bootstrap: True
cacheNodeIds: False
checkpointInterval: 10
featureSubsetStrategy: auto
featuresCol: scaledFeatures
impurity: gini
labelCol: play_type_idx
leafCol: 
maxBins: 32
maxDepth: 15
maxMemoryInMB: 256
minInfoGain: 0.0
minInstancesPerNode: 1
minWeightFractionPerNode: 0.0
numTrees: 50
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
seed: 42
subsamplingRate: 1.0


In [11]:
# Blank surface strings in the test frame are relabelled 'Undefined' before scoring.
# NOTE(review): presumably this mirrors the cleanup applied to the training data - confirm.
surface_col = F.col('surface')
test_df = test_df.withColumn(
    'surface',
    F.when(surface_col == '', 'Undefined').otherwise(surface_col)
)

# Run the cleaned test frame through the best pipeline to get predictions
prediction = best_model.transform(test_df)


In [12]:
def _build_evaluator(metric_name, **extra_params):
    """Return a MulticlassClassificationEvaluator for `metric_name`.

    Every evaluator in this cell compares the 'play_type_idx' label column
    with the 'prediction' column; only the metric (and, for per-label
    metrics, the label of interest) differs.

    Parameters:
        metric_name: value passed to the evaluator's `metricName`.
        **extra_params: any further evaluator params, e.g. `metricLabel`.
    """
    return MulticlassClassificationEvaluator(
        labelCol='play_type_idx', predictionCol="prediction",
        metricName=metric_name, **extra_params
    )


f1_evaluator = _build_evaluator('f1')
accuracy_evaluator = _build_evaluator('accuracy')

# precisionByLabel scores ONE class only, chosen by metricLabel. The value
# 0.0 is the evaluator's default, spelled out here so it is clear that
# `precision_by_label` below is the precision for class index 0.0 alone,
# not a per-class breakdown.
precision_by_label_eval = _build_evaluator('precisionByLabel', metricLabel=0.0)

weighted_precision_eval = _build_evaluator('weightedPrecision')
weighted_recall_eval = _build_evaluator('weightedRecall')

# Score the predictions with each evaluator
f1_score = f1_evaluator.evaluate(prediction)
accuracy = accuracy_evaluator.evaluate(prediction)
precision_by_label = precision_by_label_eval.evaluate(prediction)
precision = weighted_precision_eval.evaluate(prediction)
recall = weighted_recall_eval.evaluate(prediction)

24/12/04 01:11:42 WARN TaskSetManager: Stage 622 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:43 WARN TaskSetManager: Stage 623 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 626:>                                                        (0 + 3) / 3]

24/12/04 01:11:46 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

24/12/04 01:11:49 WARN TaskSetManager: Stage 632 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:49 WARN TaskSetManager: Stage 633 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:52 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

24/12/04 01:11:54 WARN TaskSetManager: Stage 642 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:54 WARN TaskSetManager: Stage 643 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:11:57 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

24/12/04 01:11:58 WARN TaskSetManager: Stage 652 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:11:59 WARN TaskSetManager: Stage 653 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:12:01 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

24/12/04 01:12:02 WARN TaskSetManager: Stage 662 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:12:03 WARN TaskSetManager: Stage 663 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.




24/12/04 01:12:04 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

In [None]:
# Peek at the first 10 (label index, predicted index) pairs
prediction.select('play_type_idx', 'prediction').show(10)

In [None]:
# Row count per predicted class index
prediction.groupBy('prediction').count().show()

In [None]:
# Row count per label class index
prediction.groupBy('play_type_idx').count().show()

In [13]:
# Collect the distinct play_type names sorted by their play_type_idx, so that
# position i in `labels` is the name belonging to index i.
label_rows = (
    prediction
    .select('play_type', 'play_type_idx')
    .distinct()
    .orderBy('play_type_idx')
    .select('play_type')
    .collect()
)
labels = [row[0] for row in label_rows]

24/12/04 01:12:10 WARN TaskSetManager: Stage 672 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:12:10 WARN TaskSetManager: Stage 673 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [None]:
# Row count per play_type in the test frame
test_df.groupBy('play_type').count().show()

In [14]:
# Assemble the headline metrics into one printable report.
# `precision_by_label` comes from a precisionByLabel evaluator whose
# metricLabel is 0.0 (the evaluator's default), so it is the precision of
# class index 0.0 only - the report line says so explicitly.
results = f'''
F1: {f1_score}
Accuracy: {accuracy}
Precision (label 0.0 only): {precision_by_label}
Weighted Precision: {precision}
Weighted Recall: {recall}
'''

print(results)


F1: 0.8791488975343991
Accuracy: 0.9015169628862145
Precision by Label: 0.939925224146067
Weighted Precision: 0.8651043570532826
Weighted Recall: 0.9015169628862145



In [15]:
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics

#Adapting code from https://www.kaggle.com/code/ashokkumarpalivela/multiclass-classification-using-pyspark for confusion matrix
# MulticlassMetrics consumes (prediction, label) pairs, so the column order
# in select() matters. No sorting is needed: the confusion matrix is a set of
# per-(label, prediction) counts and does not depend on row order.
# NOTE(review): the float cast presumably guards against a non-floating label
# column - confirm it is still required.
preds_and_labels = (
    prediction
    .select('prediction', 'play_type_idx')
    .withColumn('play_type_idx', col('play_type_idx').cast(FloatType()))
)

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Rows are actual classes, columns are predicted classes
print(metrics.confusionMatrix().toArray())

24/12/04 01:12:22 WARN TaskSetManager: Stage 697 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.
24/12/04 01:12:23 WARN TaskSetManager: Stage 698 contains a task of very large size (6176 KiB). The maximum recommended task size is 1000 KiB.


[Stage 701:>                                                        (0 + 3) / 3]

24/12/04 01:12:25 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB


                                                                                

24/12/04 01:12:28 WARN DAGScheduler: Broadcasting large task binary with size 15.6 MiB




24/12/04 01:12:30 WARN DAGScheduler: Broadcasting large task binary with size 15.5 MiB


                                                                                

24/12/04 01:12:31 WARN DAGScheduler: Broadcasting large task binary with size 15.5 MiB


[Stage 719:>                                                        (0 + 1) / 1]

[[2.5894e+04 2.3200e+02 5.4000e+01 7.0000e+00 0.0000e+00]
 [9.5000e+01 6.1030e+03 5.7000e+01 8.0000e+00 0.0000e+00]
 [3.7300e+02 5.2100e+02 1.3160e+03 1.6200e+02 0.0000e+00]
 [2.7200e+02 3.7800e+02 1.7700e+02 6.2100e+02 0.0000e+00]
 [9.1500e+02 2.7800e+02 1.2100e+02 5.7000e+01 0.0000e+00]]


                                                                                

In [20]:
# NOTE(review): the saved output of this cell is
# "TypeError: 'Figure' object is not callable". That presumably comes from
# `plt.figure` having been rebound to a Figure earlier in the same kernel
# session - restart the kernel and re-run to confirm.

# Use an explicit figure/axes pair so the heatmap, labels and saved file all
# refer to the same plot.
fig, ax = plt.subplots(figsize=(10, 10))

# The matrix comes back as floats; convert to ints so each cell is annotated
# as a whole count (e.g. 25894 rather than 25894.0).
confusion_counts = metrics.confusionMatrix().toArray().astype(int)

# `labels` must be ordered by play_type_idx so the tick names line up with
# the matrix rows/columns.
sns.heatmap(confusion_counts,
            cmap='viridis',
            annot=True, fmt='d',
            cbar=False,
            xticklabels=labels,
            yticklabels=labels,
            ax=ax)

ax.set_title('Random Forest Regular Sampling Confusion Matrix', fontsize=20)  # title with fontsize 20
ax.set_xlabel('Predicted', fontsize=15)  # x-axis label with fontsize 15
ax.set_ylabel('Actual', fontsize=15)  # y-axis label with fontsize 15

# Save under a random-forest name: the previous 'naive-bayes-...' file name was
# a copy-paste leftover and would overwrite another model's figure.
fig.savefig('random-forest-normal-confusion-matrix.png')

TypeError: 'Figure' object is not callable

### Conclusions

Some quick observations:

* Performance is much better when we don't undersample, possibly because nearly everything is a punt.
* We need to be careful with the labels for the confusion matrix: initially they did not line up with their corresponding idxs.
* I added temp, wind, roof, surface to the predictors