# Model

In [1]:
%matplotlib inline
from pydataset import data

import pyspark
import pyspark.ml
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate()
from wrangle import wrangle_data

In [2]:
df = wrangle_data()

## 1. Use the `.randomSplit` method to split the 311 data into training and test sets.

In [3]:
train, test = df.randomSplit([0.8, 0.2], seed=123)

In [4]:
def shape(df: pyspark.sql.DataFrame):
    return df.count(), len(df.columns)

shape(train), shape(test)

((684496, 21), (170773, 21))

## 2. Create a classification model to predict whether a case will be late or not (i.e. predict `case_late`). Experiment with different combinations of features and different classification algorithms.

In [5]:
train.show(1, vertical = True)

-RECORD 0------------------------------------
 source_id            | 100137               
 dept_division        | 311 Call Center      
 case_id              | 1014263399           
 case_opened_date     | 2018-02-21 15:36:00  
 case_closed_date     | 2018-02-21 15:38:00  
 SLA_due_date         | 2018-03-02 15:36:00  
 case_late            | false                
 num_days_late        | -8.998854167000001   
 case_closed          | true                 
 service_request_type | Compliment           
 SLA_days             | 9.0                  
 case_status          | Closed               
 request_address      | 927  donaldson av... 
 council_district     | 7                    
 num_weeks_late       | -1.2855505952857145  
 case_age             | 168                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Customer Service     
 dept_subject_to_SLA  | true                 
 source_username      | Merlene Bl

Given that there are very little number of numerical values, we will likely have to use categorical values, along with a lot of HotEncoded items.

Here the the three values that I think will be most useful:

1. What department / dept_divsion is the request to. Given the track record, we may be able to predict if they majority of requests are late or not. 
1. The service request type. There is a record of some requests taking longer, on average, than others. This might be strong enough to help the model accurately predict our target. 
1. Date that the case was open. The majority of cases are open during the week, but are there any patterns where jobs opened on ceirtain days are less / more likely to be late? 
1. Council_district. We haven't explore this, but there might be evidence that depending on the district, ceirtain jobs may take longer / less time.

In [6]:
# We will start with the department to predict if 
# the case will be late

# How many dept_divisions are there?

df.select("dept_division").distinct().count()

# becasue there are so many, we may fall to the curse of dimentionality
# so we will only use department for now.

39

In [7]:
(
    train.select("department", "case_late")
    .show(5)

)

+--------------------+---------+
|          department|case_late|
+--------------------+---------+
|    Customer Service|    false|
|DSD/Code Enforcement|    false|
|DSD/Code Enforcement|    false|
|DSD/Code Enforcement|    false|
|DSD/Code Enforcement|    false|
+--------------------+---------+
only showing top 5 rows



In [8]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [9]:
from pyspark.ml.feature import RFormula

In [10]:
rf = RFormula(formula="case_late ~ department").fit(train)

train_input = rf.transform(train).select("features", "label")

In [11]:
test_input = rf.transform(test)

In [12]:
train_input.show(1, vertical = True)

-RECORD 0-----------------
 features | (7,[6],[1.0]) 
 label    | 0.0           
only showing top 1 row



In [13]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

### Logistic Regression

In [14]:
lr = LogisticRegression()
# lr.explainParams() # to show all of the hyperparams
lr_fit = lr.fit(train_input)
lr_fit

LogisticRegressionModel: uid = LogisticRegression_c293e43b3819, numClasses = 2, numFeatures = 7

In [15]:
training_summary = lr_fit.summary
training_summary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x1223e0e10>

In [16]:
training_summary.areaUnderROC

0.6234431279234867

In [17]:
training_summary.accuracy

0.8889255744372502

In [18]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.622068653787877

### Random Forest

In [19]:
train.show(1, vertical = True)

-RECORD 0------------------------------------
 source_id            | 100137               
 dept_division        | 311 Call Center      
 case_id              | 1014263399           
 case_opened_date     | 2018-02-21 15:36:00  
 case_closed_date     | 2018-02-21 15:38:00  
 SLA_due_date         | 2018-03-02 15:36:00  
 case_late            | false                
 num_days_late        | -8.998854167000001   
 case_closed          | true                 
 service_request_type | Compliment           
 SLA_days             | 9.0                  
 case_status          | Closed               
 request_address      | 927  donaldson av... 
 council_district     | 7                    
 num_weeks_late       | -1.2855505952857145  
 case_age             | 168                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Customer Service     
 dept_subject_to_SLA  | true                 
 source_username      | Merlene Bl

In [20]:
rfc = RandomForestClassifier(featuresCol = "features", labelCol = "label", numTrees = 3, maxDepth = 2, seed=42)
rfc_fit = rfc.fit(train_input)
rfc_fit

RandomForestClassificationModel (uid=RandomForestClassifier_d23196f8f23e) with 3 trees

In [21]:
predictions = rfc_fit.transform(train_input)
predictions.show(1)

+-------------+-----+--------------------+--------------------+----------+
|     features|label|       rawPrediction|         probability|prediction|
+-------------+-----+--------------------+--------------------+----------+
|(7,[6],[1.0])|  0.0|[0.90492982422414...|[0.30164327474138...|       1.0|
+-------------+-----+--------------------+--------------------+----------+
only showing top 1 row



In [22]:
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.5719910826943205


### Quick Evaluation

In [23]:
predictions = rfc_fit.transform(test_input)
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.5707985065354458


## Modeling 2

Now we will try to model using both `department` and `service_request_type`

In [44]:
from pyspark.ml.feature import  OneHotEncoder

In [49]:
# X_train = train.select("department", "service_request_type", "case_late")

In [51]:
# encoder = OneHotEncoder()
# encoder_fit = encoder.fit(X_train.select("department", "service_request_tyoe"))

In [78]:
rf = RFormula(formula="case_late ~ department + service_request_type").fit(train)

train_input = rf.transform(train).select("features", "label")

In [79]:
test_input = rf.transform(test).select("features", "label")

In [80]:
lr = LogisticRegression()
# lr.explainParams() # to show all of the hyperparams
lr_fit = lr.fit(train_input)
lr_fit

LogisticRegressionModel: uid = LogisticRegression_e1e094df505b, numClasses = 2, numFeatures = 340

In [81]:
training_summary = lr_fit.summary
training_summary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x122560d10>

In [82]:
training_summary.areaUnderROC

0.8162315987666309

In [83]:
training_summary.accuracy

0.9006626773567705

Potential error:

* evaluation
* input

In [87]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()

In [89]:
test_predictions = lr_fit.transform(test_input)

In [91]:
test_predictions.show(1)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(340,[6,154],[1.0...|  0.0|[3.51747782592652...|[0.97118099611191...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 1 row



In [92]:
# test_auc = evaluator.evaluate(test_predictions)

# calculate values manually

### Random Forest

In [31]:
train.show(1, vertical = True)

-RECORD 0------------------------------------
 source_id            | 100137               
 dept_division        | 311 Call Center      
 case_id              | 1014263399           
 case_opened_date     | 2018-02-21 15:36:00  
 case_closed_date     | 2018-02-21 15:38:00  
 SLA_due_date         | 2018-03-02 15:36:00  
 case_late            | false                
 num_days_late        | -8.998854167000001   
 case_closed          | true                 
 service_request_type | Compliment           
 SLA_days             | 9.0                  
 case_status          | Closed               
 request_address      | 927  donaldson av... 
 council_district     | 7                    
 num_weeks_late       | -1.2855505952857145  
 case_age             | 168                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Customer Service     
 dept_subject_to_SLA  | true                 
 source_username      | Merlene Bl

In [32]:
rfc = RandomForestClassifier(featuresCol = "features", labelCol = "label", numTrees = 1, maxDepth = 10, seed=42)
rfc_fit = rfc.fit(train_input)
rfc_fit

RandomForestClassificationModel (uid=RandomForestClassifier_efdaf48239f3) with 1 trees

In [33]:
predictions = rfc_fit.transform(train_input)
predictions.show(1)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(340,[6,154],[1.0...|  0.0|[0.91734251519362...|[0.91734251519362...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 1 row



In [34]:
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.64119186823807


### Quick Evaluation

In [35]:
predictions.show(1)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(340,[6,154],[1.0...|  0.0|[0.91734251519362...|[0.91734251519362...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 1 row



In [73]:
predictions = rfc_fit.transform(test_input)
evaluator = BinaryClassificationEvaluator()
# print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

In [75]:
predictions.show(1)

+--------------------+------------+--------------------+--------------------+----------+
|            features|       label|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|(340,[6,154],[1.0...|-10.99946759|[0.91734251519362...|[0.91734251519362...|       0.0|
+--------------------+------------+--------------------+--------------------+----------+
only showing top 1 row



## 3. Create a regression model to predict how many days late a case will be (i.e. predict `num_days_late`). Experiment with different combinations of features and different regression algorithms.

In [37]:
train.show(1, vertical=True)

-RECORD 0------------------------------------
 source_id            | 100137               
 dept_division        | 311 Call Center      
 case_id              | 1014263399           
 case_opened_date     | 2018-02-21 15:36:00  
 case_closed_date     | 2018-02-21 15:38:00  
 SLA_due_date         | 2018-03-02 15:36:00  
 case_late            | false                
 num_days_late        | -8.998854167000001   
 case_closed          | true                 
 service_request_type | Compliment           
 SLA_days             | 9.0                  
 case_status          | Closed               
 request_address      | 927  donaldson av... 
 council_district     | 7                    
 num_weeks_late       | -1.2855505952857145  
 case_age             | 168                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Customer Service     
 dept_subject_to_SLA  | true                 
 source_username      | Merlene Bl

For predicting num_days_late, we will use the same features as above

In [52]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler

varIdxer = StringIndexer(inputCol='department',outputCol='varIdx').fit(train)
df_1 = varIdxer.transform(train)

In [56]:
df_1 = OneHotEncoder(inputCol="varIdx", outputCol="varCat").transform(df_1)

In [57]:
assembler = VectorAssembler(inputCols=["varCat"],outputCol="features")
df_1 =  assembler.transform(df_1)

In [70]:
df_1 = df_1.withColumnRenamed("num_days_late", "y").select("features", "y")

In [71]:
df_1.show(1)

+-------------+------------------+
|     features|                 y|
+-------------+------------------+
|(7,[6],[1.0])|-8.998854167000001|
+-------------+------------------+
only showing top 1 row



In [99]:
df_1 = df_1.na.drop()

In [100]:
lr = LinearRegression(labelCol='y',featuresCol='features')

In [101]:
lr_fit = lr.fit(df_1)

In [102]:
training_summary = lr_fit.summary

In [104]:
training_summary.r2, training_summary.rootMeanSquaredError

(0.17058279341800175, 161.05689305874301)

In [103]:
training_summary.meanAbsoluteError, training_summary.meanSquaredError

(66.17415734155266, 25939.322801735383)

-----

## Multivaraite Regression Problem

In [110]:
train_modeling = train.withColumnRenamed("num_days_late", "y").select("y")

In [126]:
val = train.select("department")

In [134]:
train_modeling.join(val)

DataFrame[y: string]

In [108]:
features = ["department", "service_request_type"]
# string_indexer_df = pd.DataFrame()
train_modeling = train.withColumnRenamed("num_days_late", "y").select("y")

for i in features:
        varIdxer = StringIndexer(inputCol=i,outputCol=f'varIdx_{i}').fit(train)
        df_1 = varIdxer.transform(train)
        print(f"Indexed {i}")
        df_1 = OneHotEncoder(inputCol=f"varIdx_{i}", outputCol=f"varCat_{i}").transform(df_1)
        print(f"Encoded {i}")
        train_modeling.join

assembler = VectorAssembler(inputCols=[f"varCat_{features[0]}, varCat_{features[1]}"],outputCol="features")

Indexed department
Encoded department
Indexed service_request_type
Encoded service_request_type


In [109]:
df_1.show(1, vertical=True)

-RECORD 0-------------------------------------------
 source_id                   | 100137               
 dept_division               | 311 Call Center      
 case_id                     | 1014263399           
 case_opened_date            | 2018-02-21 15:36:00  
 case_closed_date            | 2018-02-21 15:38:00  
 SLA_due_date                | 2018-03-02 15:36:00  
 case_late                   | false                
 num_days_late               | -8.998854167000001   
 case_closed                 | true                 
 service_request_type        | Compliment           
 SLA_days                    | 9.0                  
 case_status                 | Closed               
 request_address             | 927  donaldson av... 
 council_district            | 7                    
 num_weeks_late              | -1.2855505952857145  
 case_age                    | 168                  
 days_to_closed              | 0                    
 case_lifetime               | 0              

In [52]:
varIdxer = StringIndexer(inputCol='department',outputCol='varIdx').fit(train)
df_1 = varIdxer.transform(train)

In [56]:
df_1 = OneHotEncoder(inputCol="varIdx", outputCol="varCat").transform(df_1)

In [57]:
assembler = VectorAssembler(inputCols=["varCat"],outputCol="features")
df_1 =  assembler.transform(df_1)

In [70]:
df_1 = df_1.withColumnRenamed("num_days_late", "y").select("features", "y")

In [71]:
df_1.show(1)

+-------------+------------------+
|     features|                 y|
+-------------+------------------+
|(7,[6],[1.0])|-8.998854167000001|
+-------------+------------------+
only showing top 1 row



In [99]:
df_1 = df_1.na.drop()

In [100]:
lr = LinearRegression(labelCol='y',featuresCol='features')

In [101]:
lr_fit = lr.fit(df_1)

In [102]:
training_summary = lr_fit.summary

In [104]:
training_summary.r2, training_summary.rootMeanSquaredError

(0.17058279341800175, 161.05689305874301)

In [103]:
training_summary.meanAbsoluteError, training_summary.meanSquaredError

(66.17415734155266, 25939.322801735383)

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler

varIdxer = StringIndexer(inputCol='department',outputCol='varIdx').fit(train)
df_1 = varIdxer.transform(train)

df_1 = OneHotEncoder(inputCol="varIdx", outputCol="varCat").transform(df_1)

assembler = VectorAssembler(inputCols=["varCat"],outputCol="features")
df_1 =  assembler.transform(df_1)

df_1 = df_1.withColumnRenamed("num_days_late", "y").select("features", "y")

df_1.show(1)

df_1 = df_1.na.drop()

lr = LinearRegression(labelCol='y',featuresCol='features')

lr_fit = lr.fit(df_1)

training_summary = lr_fit.summary

training_summary.r2, training_summary.rootMeanSquaredError

training_summary.meanAbsoluteError, training_summary.meanSquaredError