In [5]:
from pydataset import data
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

import wrangle
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Use the .randomSplit method to split the 311 data into training and test sets.

In [7]:
df = wrangle.wrangle_311(spark)

[wrangle.py] reading case.csv
[wrangle.py] handling data types
[wrangle.py] parsing dates
[wrangle.py] adding features
[wrangle.py] joining departments


In [8]:
# split our data into train, test

train, test = df.randomSplit([0.8, 0.2], 19)

In [11]:
def shape(df: pyspark.sql.DataFrame):
    return df.count(), len(df.columns)

In [12]:
shape(train)

(673349, 20)

In [13]:
shape(test)

(168355, 20)

In [15]:
train.show(1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
 zipcode              | 78207                
 case_age             | 219                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Animal Care Services 
 dept_subject_to_SLA  | true                 
only showing top 1 row



In [41]:
def spark_df_nan_count(df):
    return df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns])

spark_df_nan_count(df).show(vertical=True, truncate=False)

AnalysisException: "cannot resolve 'isnan(`case_opened_date`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`case_opened_date`' is of timestamp type.;;\n'Aggregate [count(CASE WHEN (isnan(cast(case_id#10 as double)) || isnull(case_id#10)) THEN case_id END) AS case_id#4009L, count(CASE WHEN (isnan(case_opened_date#98) || isnull(case_opened_date#98)) THEN case_opened_date END) AS case_opened_date#4011, count(CASE WHEN (isnan(case_closed_date#113) || isnull(case_closed_date#113)) THEN case_closed_date END) AS case_closed_date#4013, count(CASE WHEN (isnan(case_due_date#128) || isnull(case_due_date#128)) THEN case_due_date END) AS case_due_date#4015, count(CASE WHEN (isnan(case_late#68) || isnull(case_late#68)) THEN case_late END) AS case_late#4017, count(CASE WHEN (isnan(num_days_late#15) || isnull(num_days_late#15)) THEN num_days_late END) AS num_days_late#4019L, count(CASE WHEN (isnan(case_closed#53) || isnull(case_closed#53)) THEN case_closed END) AS case_closed#4021, count(CASE WHEN (isnan(cast(service_request_type#18 as double)) || isnull(service_request_type#18)) THEN service_request_type END) AS service_request_type#4023L, count(CASE WHEN (isnan(SLA_days#19) || isnull(SLA_days#19)) THEN SLA_days END) AS SLA_days#4025L, count(CASE WHEN (isnan(cast(case_status#20 as double)) || isnull(case_status#20)) THEN case_status END) AS case_status#4027L, count(CASE WHEN (isnan(cast(source_id#21 as double)) || isnull(source_id#21)) THEN source_id END) AS source_id#4029L, count(CASE WHEN (isnan(cast(request_address#22 as double)) || isnull(request_address#22)) THEN request_address END) AS request_address#4031L, count(CASE WHEN (isnan(cast(council_district#166 as double)) || isnull(council_district#166)) THEN council_district END) AS council_district#4033L, count(CASE WHEN (isnan(num_weeks_late#150) || isnull(num_weeks_late#150)) THEN num_weeks_late END) AS num_weeks_late#4035L, count(CASE WHEN (isnan(cast(zipcode#182 as double)) || isnull(zipcode#182)) THEN zipcode END) AS zipcode#4037L, count(CASE WHEN (isnan(cast(case_age#199 as double)) || isnull(case_age#199)) THEN case_age END) AS case_age#4039L, count(CASE WHEN (isnan(cast(days_to_closed#217 as double)) || isnull(days_to_closed#217)) THEN days_to_closed END) AS days_to_closed#4041L, count(CASE WHEN (isnan(cast(case_lifetime#236 as double)) || isnull(case_lifetime#236)) THEN case_lifetime END) AS case_lifetime#4043L, count(CASE WHEN (isnan(cast(department#359 as double)) || isnull(department#359)) THEN department END) AS department#4045L, count(CASE WHEN (isnan(dept_subject_to_SLA#380) || isnull(dept_subject_to_SLA#380)) THEN dept_subject_to_SLA END) AS dept_subject_to_SLA#4047]\n+- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, department#359, (dept_subject_to_SLA#269 = YES) AS dept_subject_to_SLA#380]\n   +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, standardized_dept_name#268 AS department#359, dept_subject_to_SLA#269]\n      +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, standardized_dept_name#268, dept_subject_to_SLA#269]\n         +- Project [dept_division#17, case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, standardized_dept_name#268, dept_subject_to_SLA#269]\n            +- Project [dept_division#17, case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, dept_name#267, standardized_dept_name#268, dept_subject_to_SLA#269]\n               +- Project [dept_division#17, case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, case_lifetime#236, dept_name#267, standardized_dept_name#268, dept_subject_to_SLA#269]\n                  +- Join LeftOuter, (dept_division#17 = dept_division#266)\n                     :- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, days_to_closed#217, CASE WHEN NOT case_closed#53 THEN case_age#199 ELSE days_to_closed#217 END AS case_lifetime#236]\n                     :  +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, case_age#199, datediff(cast(case_closed_date#113 as date), cast(case_opened_date#98 as date)) AS days_to_closed#217]\n                     :     +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, zipcode#182, datediff(cast(1533742680000000 as date), cast(case_opened_date#98 as date)) AS case_age#199]\n                     :        +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#166, num_weeks_late#150, regexp_extract(request_address#22, \\d+$, 0) AS zipcode#182]\n                     :           +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, format_string(%03d, cast(council_district#83 as int)) AS council_district#166, num_weeks_late#150]\n                     :              +- Project [case_id#10, case_opened_date#98, case_closed_date#113, case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#83, (num_days_late#15 / cast(7 as double)) AS num_weeks_late#150]\n                     :                 +- Project [case_id#10, case_opened_date#98, case_closed_date#113, to_timestamp('case_due_date, Some(M/d/yy H:mm)) AS case_due_date#128, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#83]\n                     :                    +- Project [case_id#10, case_opened_date#98, to_timestamp('case_closed_date, Some(M/d/yy H:mm)) AS case_closed_date#113, case_due_date#38, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#83]\n                     :                       +- Project [case_id#10, to_timestamp('case_opened_date, Some(M/d/yy H:mm)) AS case_opened_date#98, case_closed_date#12, case_due_date#38, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#83]\n                     :                          +- Project [case_id#10, case_opened_date#11, case_closed_date#12, case_due_date#38, case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, cast(council_district#23 as string) AS council_district#83]\n                     :                             +- Project [case_id#10, case_opened_date#11, case_closed_date#12, case_due_date#38, (case_late#14 = YES) AS case_late#68, num_days_late#15, case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#23]\n                     :                                +- Project [case_id#10, case_opened_date#11, case_closed_date#12, case_due_date#38, case_late#14, num_days_late#15, (case_closed#16 = YES) AS case_closed#53, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#23]\n                     :                                   +- Project [case_id#10, case_opened_date#11, case_closed_date#12, SLA_due_date#13 AS case_due_date#38, case_late#14, num_days_late#15, case_closed#16, dept_division#17, service_request_type#18, SLA_days#19, case_status#20, source_id#21, request_address#22, council_district#23]\n                     :                                      +- Relation[case_id#10,case_opened_date#11,case_closed_date#12,SLA_due_date#13,case_late#14,num_days_late#15,case_closed#16,dept_division#17,service_request_type#18,SLA_days#19,case_status#20,source_id#21,request_address#22,council_district#23] csv\n                     +- Relation[dept_division#266,dept_name#267,standardized_dept_name#268,dept_subject_to_SLA#269] csv\n"

## Create a classification model to predict whether a case will be late or not (i.e. predict case_late). Experiment with different combinations of features and different classification algorithms.

In [19]:
train.show(1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
 zipcode              | 78207                
 case_age             | 219                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Animal Care Services 
 dept_subject_to_SLA  | true                 
only showing top 1 row



In [31]:
train.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('case_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string'),
 ('num_weeks_late', 'double'),
 ('zipcode', 'string'),
 ('case_age', 'int'),
 ('days_to_closed', 'int'),
 ('case_lifetime', 'int'),
 ('department', 'string'),
 ('dept_subject_to_SLA', 'boolean')]

In [28]:
# Using department and council_district first

In [20]:
rf = pyspark.ml.feature.RFormula(formula='case_late ~ department + council_district').fit(train)
train_input = rf.transform(train)
train_input.show(1)

+----------+-------------------+-------------------+-------------------+---------+-------------------+-----------+--------------------+------------------+-----------+---------+--------------------+----------------+--------------------+-------+--------+--------------+-------------+--------------------+-------------------+--------------------+-----+
|   case_id|   case_opened_date|   case_closed_date|      case_due_date|case_late|      num_days_late|case_closed|service_request_type|          SLA_days|case_status|source_id|     request_address|council_district|      num_weeks_late|zipcode|case_age|days_to_closed|case_lifetime|          department|dept_subject_to_SLA|            features|label|
+----------+-------------------+-------------------+-------------------+---------+-------------------+-----------+--------------------+------------------+-----------+---------+--------------------+----------------+--------------------+-------+--------+--------------+-------------+-------------------

In [21]:
lr = pyspark.ml.classification.LogisticRegression()

In [22]:
lr_fit = lr.fit(train_input)

In [23]:
lr_fit.summary.areaUnderROC

0.6360295499748985

In [24]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.6369172734209141

In [25]:
test_input = rf.transform(test)

In [27]:
(lr_fit.transform(test_input)
 .select('case_late', 'department', 'council_district', 'label', 'probability', 'prediction')
 .groupby('prediction') # predicted == rows
 .pivot('label') # actual values are columns
 .count()
 .show())

+----------+------+-----+
|prediction|   0.0|  1.0|
+----------+------+-----+
|       0.0|149412|18402|
|       1.0|   149|  392|
+----------+------+-----+



In [32]:
rf = pyspark.ml.feature.RFormula(formula='case_late ~ department + source_id').fit(train)
train_input = rf.transform(train)
train_input.show(1)

+----------+-------------------+-------------------+-------------------+---------+------------------+-----------+--------------------+--------+-----------+---------+--------------------+----------------+--------------+-------+--------+--------------+-------------+--------------------+-------------------+--------------------+-----+
|   case_id|   case_opened_date|   case_closed_date|      case_due_date|case_late|     num_days_late|case_closed|service_request_type|SLA_days|case_status|source_id|     request_address|council_district|num_weeks_late|zipcode|case_age|days_to_closed|case_lifetime|          department|dept_subject_to_SLA|            features|label|
+----------+-------------------+-------------------+-------------------+---------+------------------+-----------+--------------------+--------+-----------+---------+--------------------+----------------+--------------+-------+--------+--------------+-------------+--------------------+-------------------+--------------------+-----+
|

In [33]:
lr = pyspark.ml.classification.LogisticRegression()

In [34]:
lr_fit = lr.fit(train_input)

In [35]:
lr_fit.summary.areaUnderROC

0.6368578325642402

In [38]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
# test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
# test_auc

In [39]:
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

Py4JJavaError: An error occurred while calling o817.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 238.0 failed 1 times, most recent failure: Lost task 0.0 in stage 238.0 (TID 1282, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:191)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Unseen label: 141954.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:309)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:171)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:151)
	at org.apache.spark.rdd.OrderedRDDFunctions$$anonfun$sortByKey$1.apply(OrderedRDDFunctions.scala:62)
	at org.apache.spark.rdd.OrderedRDDFunctions$$anonfun$sortByKey$1.apply(OrderedRDDFunctions.scala:61)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.OrderedRDDFunctions.sortByKey(OrderedRDDFunctions.scala:61)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$4$lzycompute(BinaryClassificationMetrics.scala:155)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$4(BinaryClassificationMetrics.scala:146)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions$lzycompute(BinaryClassificationMetrics.scala:148)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions(BinaryClassificationMetrics.scala:148)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.createCurve(BinaryClassificationMetrics.scala:226)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.roc(BinaryClassificationMetrics.scala:86)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.areaUnderROC(BinaryClassificationMetrics.scala:97)
	at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate(BinaryClassificationEvaluator.scala:87)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:191)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: 141954.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 17 more


## Create a regression model to predict how many days late a case will be (i.e. predict num_days_late). Experiment with different combinations of features and different regression algorithms.