## Tree-Based Multinomial Classification Models

#### Setup

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_setup")
    .getOrCreate()
)

In [2]:
# Load the training CSV: the first row supplies the column names and Spark infers the column types.
train_data = spark.read.csv("train_data", header=True, inferSchema=True)
train_data.count()

2200579

In [3]:
train_data.columns

['line_str',
 'resident_status',
 'education',
 'education_flag',
 'death_month',
 'sex',
 'age_flag',
 'age',
 'age_sub_flag',
 'place_of_death',
 'marital_status',
 'death_day_of_week',
 'death_year',
 'work_injury',
 'death_manner',
 'disposition_method',
 'autopsy',
 'activity_code',
 'place_of_injury',
 'ucd',
 'ucd_recode_113',
 'ucd_recode_39',
 'mcd_count_R',
 'mcd_1_R',
 'mcd_2_R',
 'mcd_3_R',
 'mcd_4_R',
 'mcd_5_R',
 'mcd_6_R',
 'mcd_7_R',
 'mcd_8_R',
 'mcd_9_R',
 'mcd_10_R',
 'mcd_11_R',
 'mcd_12_R',
 'mcd_13_R',
 'mcd_14_R',
 'mcd_15_R',
 'mcd_16_R',
 'mcd_17_R',
 'mcd_18_R',
 'mcd_19_R',
 'mcd_20_R',
 'race',
 'race_bridged_flag',
 'race_imputed_flag',
 'hispanic',
 'hispanic_race_recode']

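To reduce the number of outcome classes, the underlying cause of death (`ucd`) is collapsed below to the leading letter of its code. (The 39-cause recode, `ucd_recode_39`, is also available in the data but is not used in this notebook.)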

In [4]:
# Keep the outcome column (ucd) together with the candidate predictor columns.
model_columns = ["ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode"]
training = train_data.select(*model_columns)

In [5]:
training.printSchema()

root
 |-- ucd: string (nullable = true)
 |-- education: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hispanic_race_recode: integer (nullable = true)



In [6]:
training.show(5)

+----+---------+---+---+--------------+--------------------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|
+----+---------+---+---+--------------+--------------------+
| F03|       14|  F| 94|             W|                   6|
|J449|       12|  F| 70|             M|                   6|
|I219|       16|  M| 88|             M|                   6|
|C259|       12|  M| 77|             M|                   6|
|F102|       12|  F| 46|             D|                   1|
+----+---------+---+---+--------------+--------------------+
only showing top 5 rows



In [7]:
# strip UCD codes to include first letter only
from pyspark.sql.functions import *

# Delete every run of digits from the code; for the codes shown in the sample rows
# (e.g. F03, J449) this leaves just the leading letter.
training = training.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))


In [8]:
training.show(5)

+----+---------+---+---+--------------+--------------------+---------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+----+---------+---+---+--------------+--------------------+---------+
| F03|       14|  F| 94|             W|                   6|        F|
|J449|       12|  F| 70|             M|                   6|        J|
|I219|       16|  M| 88|             M|                   6|        I|
|C259|       12|  M| 77|             M|                   6|        C|
|F102|       12|  F| 46|             D|                   1|        F|
+----+---------+---+---+--------------+--------------------+---------+
only showing top 5 rows



In [9]:
# Recode sentinel values in the age column.

def replace(column, value):
    """Return `column` with every entry equal to `value` turned into null.

    `when` is given no `otherwise` branch, so rows that fail the condition become null.
    """
    is_real_value = column != value
    return when(is_real_value, column)

# 999 is the code for an unknown age; store it as null instead.
age_without_unknown = replace(col("age"), 999)
training = training.withColumn("age", age_without_unknown)

In [10]:
training.filter(training.age == 999).count() # yes - 999 codes have been replaced 

0

In [11]:
training.groupby("ucd_short").count().orderBy('count', ascending=False).show(5) # top 5 causes of death with counts

+---------+------+
|ucd_short| count|
+---------+------+
|        I|674836|
|        C|477413|
|        J|216611|
|        G|159912|
|        F|111510|
+---------+------+
only showing top 5 rows



In [12]:
training.select([count(when(col(c).isNull(), c)).alias(c) for c in training.columns]).show() # rows with null values in each column

+---+---------+---+---+--------------+--------------------+---------+
|ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+---+---------+---+---+--------------+--------------------+---------+
|  0|    54142|  0|382|         16592|                7871|        0|
+---+---------+---+---+--------------+--------------------+---------+



In [13]:
training.count() # rows in training set

2200579

In [14]:
# drop rows with null or nans
training = training.na.drop()

In [15]:
training.count()

2136639

In [16]:
(1 - (2136639/2200579))*100 # dropped 2.9% of dataset because of null values

2.9055989355528666

#### Clean and parse data

In [17]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml import Pipeline

In [18]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# One indexer per column: the multi-column form (inputCols/outputCols) was only added in
# Spark 3.0, while this notebook uses the 2.4-era API (OneHotEncoderEstimator, 2.4.7 docs).

indexer1 = StringIndexer(inputCol="marital_status", outputCol="marital_status_index")
indexer2 = StringIndexer(inputCol="sex", outputCol="sex_index")
indexer3 = StringIndexer(inputCol='ucd_short', outputCol='ucd_short_c')

# Keep the fitted models instead of discarding them, so the exact string -> index mapping
# learned from the training data can be re-applied to other data later.
marital_status_indexer_model = indexer1.fit(training)
sex_indexer_model = indexer2.fit(training)
ucd_indexer_model = indexer3.fit(training)

# Each indexer only reads its own input column, so chaining the transforms gives the same
# result as fitting each indexer on the previous step's output.
indexed1 = marital_status_indexer_model.transform(training)
indexed2 = sex_indexer_model.transform(indexed1)
indexed = ucd_indexer_model.transform(indexed2)

In [19]:
indexed.show(10)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
| F03|       14|  F| 94|             W|                   6|        F|                 1.0|      1.0|        4.0|
|J449|       12|  F| 70|             M|                   6|        J|                 0.0|      1.0|        2.0|
|I219|       16|  M| 88|             M|                   6|        I|                 0.0|      0.0|        0.0|
|C259|       12|  M| 77|             M|                   6|        C|                 0.0|      0.0|        1.0|
|F102|       12|  F| 46|             D|                   1|        F|                 2.0|      1.0|        4.0|
|G309|       12|  M| 80|             M|                   6|        G|                 0

In [20]:
# OneHotEncoder transformer to prepare categorical features for model
# hispanic_race_recode and education are already integer codes, so they are passed to the
# encoder as-is; marital_status goes in through its StringIndexer output column.
encoder = OneHotEncoderEstimator(inputCols =["hispanic_race_recode", "education", "marital_status_index"],
                                 outputCols = ["cat_hispanic_race_recode", "cat_education", "cat_marital_status"])


# Fit the encoder on the indexed training data, then add the three one-hot vector columns.
model = encoder.fit(indexed)
encoded = model.transform(indexed)
encoded.show(2)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+---------------+------------------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|cat_hispanic_race_recode|  cat_education|cat_marital_status|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+---------------+------------------+
| F03|       14|  F| 94|             W|                   6|        F|                 1.0|      1.0|        4.0|           (8,[6],[1.0])|(17,[14],[1.0])|     (3,[1],[1.0])|
|J449|       12|  F| 70|             M|                   6|        J|                 0.0|      1.0|        2.0|           (8,[6],[1.0])|(17,[12],[1.0])|     (3,[0],[1.0])|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+-----------------

In [21]:
# VectorAssembler to package the features for model:
# the three one-hot vectors followed by the numeric sex index and age.
feature_inputs = [
    "cat_hispanic_race_recode",
    "cat_education",
    "cat_marital_status",
    "sex_index",
    "age",
]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
output = assembler.transform(encoded)

In [22]:
output.show(2, truncate=False)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+---------------+------------------+-------------------------------------------+
|ucd |education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|cat_hispanic_race_recode|cat_education  |cat_marital_status|features                                   |
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+---------------+------------------+-------------------------------------------+
|F03 |14       |F  |94 |W             |6                   |F        |1.0                 |1.0      |4.0        |(8,[6],[1.0])           |(17,[14],[1.0])|(3,[1],[1.0])     |(30,[6,22,26,28,29],[1.0,1.0,1.0,1.0,94.0])|
|J449|12       |F  |70 |M             |6                   |J        |0.0                 |1.0      |2.0        |(8,[6],[1.0])  

In [23]:
# Keep only the indexed label (ucd_short_c) and the assembled feature vector.
cleanup = output.select("ucd_short_c", "features")

In [24]:
cleanup.printSchema()

root
 |-- ucd_short_c: double (nullable = false)
 |-- features: vector (nullable = true)



In [25]:
train_set, holdout_set = cleanup.randomSplit(weights=[0.8, 0.2], seed=212) ## split into train set and validation set

In [26]:
train_set.groupby("ucd_short_c").count().sort('count').show(23)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       22.0|     2|
|       21.0|    86|
|       20.0|   683|
|       19.0|  2984|
|       18.0|  5854|
|       17.0|  6378|
|       16.0|  8518|
|       15.0|  8538|
|       14.0| 10116|
|       13.0| 16721|
|       12.0| 19937|
|       11.0| 25585|
|       10.0| 29744|
|        9.0| 33183|
|        8.0| 43350|
|        7.0| 65422|
|        6.0| 76683|
|        5.0| 78816|
|        4.0| 86255|
|        3.0|125360|
|        2.0|168294|
|        1.0|372737|
|        0.0|523331|
+-----------+------+



In [27]:
# Groups '22' (2 observations) and '21' (86 observations) are too small to model - drop both for now.
label = train_set.ucd_short_c
train_set = train_set.filter((label != 22) & (label != 21))

In [28]:
train_set.groupby("ucd_short_c").count().sort('count').show(21)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       20.0|   683|
|       19.0|  2984|
|       18.0|  5854|
|       17.0|  6378|
|       16.0|  8518|
|       15.0|  8538|
|       14.0| 10116|
|       13.0| 16721|
|       12.0| 19937|
|       11.0| 25585|
|       10.0| 29744|
|        9.0| 33183|
|        8.0| 43350|
|        7.0| 65422|
|        6.0| 76683|
|        5.0| 78816|
|        4.0| 86255|
|        3.0|125360|
|        2.0|168294|
|        1.0|372737|
|        0.0|523331|
+-----------+------+



In [29]:
train_set.select('ucd_short_c').distinct().count()

21

There are now 21 different outcomes for UCD. 

#### Balance training data with downsampling

In [30]:
# write function to do downsampling

def downSample(df, target, seed):
    """Down-sample every class of `target` to roughly the size of the rarest class.

    Parameters:
        df: DataFrame holding the data to balance.
        target: name of the column holding the class labels.
        seed: seed passed to every `sample` call so the draw is repeatable.

    Returns:
        A DataFrame made of all rows of the rarest class followed by a random sample
        (without replacement) of every other class, drawn with fraction
        smallest_count / class_count. `sample` is probabilistic, so the resulting class
        sizes are approximately, not exactly, equal. If `df` has no rows it is returned
        unchanged.
    """
    # `from pyspark.sql.functions import *` elsewhere in the notebook shadows the builtin
    # `min`, so reach for the builtin explicitly.
    import builtins

    # Bring the per-class counts to the driver once. Every row carries both the label and
    # its count, so a label can never be paired with another class's ratio.
    class_rows = df.groupby(target).count().collect()
    if not class_rows:
        return df

    smallest_class_size = builtins.min(row['count'] for row in class_rows)
    # On a tie, the first class with the minimum count is the reference class; the other
    # tied classes are sampled with ratio 1.0 below.
    smallest_class = next(
        row[target] for row in class_rows if row['count'] == smallest_class_size
    )

    # Start the result with the rarest class, kept in full.
    adjusted_df = df.filter(df[target] == smallest_class)

    # Append a down-sampled slice of every other class.
    for row in class_rows:
        outcome_class = row[target]
        if outcome_class == smallest_class:
            continue

        ratio = smallest_class_size / row['count']
        subset = df.filter(df[target] == outcome_class)
        subset_adjusted = subset.sample(False, ratio, seed=seed)

        # `union` is the non-deprecated name for `unionAll`.
        adjusted_df = adjusted_df.union(subset_adjusted)

    return adjusted_df

In [31]:
# Balance the classes by down-sampling each one to roughly the size of the rarest class,
# then list the resulting per-class row counts.
adj_train = downSample(df = train_set, target = 'ucd_short_c', seed = 4)
adj_train.groupby("ucd_short_c").count().sort('count').show(21)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|        1.0|  658|
|        0.0|  665|
|       20.0|  683|
|        2.0|  684|
|       17.0|  698|
|       12.0|  700|
|        3.0|  704|
|       18.0|  706|
|       19.0|  706|
|       13.0|  706|
|       11.0|  713|
|       14.0|  717|
|       16.0|  719|
|       15.0|  720|
|        9.0|  729|
|       10.0|  734|
|        7.0|  736|
|        8.0|  739|
|        4.0|  743|
|        6.0|  746|
|        5.0|  747|
+-----------+-----+



### Modeling

https://spark.apache.org/docs/2.4.7/ml-classification-regression.html

#### Decision Tree Model

In [32]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree on the indexed cause-of-death label; all other hyperparameters are left at their defaults.
dt = DecisionTreeClassifier(labelCol="ucd_short_c", featuresCol="features")

In [33]:
# Fit on the down-sampled training data (use train_set instead to fit on the unbalanced data).
dt_model = dt.fit(adj_train)

In [34]:
# Predictions for the same down-sampled data the tree was fit on (use train_set for the unbalanced data).
train_output = dt_model.transform(adj_train)

In [35]:
# train_output.show(10)

In [36]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the predicted class with the indexed label and report the share that match.
evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

# train_output holds predictions for the training data, so this is training accuracy.
accuracy = evaluator.evaluate(train_output)

In [37]:
# `accuracy` was evaluated on train_output (predictions for the training data), so its
# complement is the training error, not a test error.
print('Accuracy: ' + str(accuracy*100) + '%')
print('Training Error: ' + str((1.0 - accuracy)*100) + '%')

Accuracy: 19.454290109008227%
Test Error: 80.54570989099177%


In [38]:
print(dt_model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_5995b234550c) of depth 5 with 45 nodes


In [39]:
# train_output.groupby("prediction").count().show()

In [40]:
# Score the 20% validation split with the decision tree.
# NOTE(review): holdout_set was split off before classes 21 and 22 were filtered out of
# train_set, so it can still contain those two classes, which the model was never fit on.
predictions = dt_model.transform(holdout_set)
# predictions.show(10)
predictions.count()

428062

In [41]:
predictions.groupby("prediction").count().sort('count').show()

+----------+------+
|prediction| count|
+----------+------+
|       7.0|   524|
|      11.0|   773|
|      18.0|  3779|
|      17.0|  5882|
|      20.0| 18047|
|       5.0| 30901|
|      16.0| 34058|
|       1.0| 45158|
|      14.0| 52984|
|       8.0| 62618|
|       2.0| 62772|
|       4.0|110566|
+----------+------+



##### False-Positive and True-Positive Rates:

In [42]:
correct_pred = predictions.filter(predictions['ucd_short_c'] == predictions['prediction'])
incorrect_pred = predictions.filter(predictions['ucd_short_c'] != predictions['prediction'])

In [43]:
# NOTE: for this multiclass problem these are the overall share of correct and incorrect
# predictions (accuracy and misclassification rate), not per-class true-/false-positive rates.
# Count the scored rows once instead of running a separate Spark job for each ratio.
total_pred = predictions.count()
tpr = correct_pred.count() / total_pred
fpr = incorrect_pred.count() / total_pred

In [44]:
from builtins import round
print('True positive rate: ' + str(round((tpr*100), 3)) + '%')
print('False positive rate: ' + str(round((fpr*100), 3)) + '%')

True positive rate: 11.235%
False positive rate: 88.765%


#### Random Forest Model

In [45]:
from pyspark.ml.classification import RandomForestClassifier

# 500-tree random forest on the indexed cause-of-death label. The seed is fixed so the
# random draws made while growing the forest, and therefore the fitted model, are repeatable.
rf = RandomForestClassifier(labelCol='ucd_short_c', featuresCol="features", numTrees=500, seed=212)

In [46]:
# Fit on the down-sampled training data (use train_set instead to fit on the unbalanced data).
rf_model = rf.fit(adj_train)

In [47]:
# Predictions for the same down-sampled data the forest was fit on (use train_set for the unbalanced data).
train_output = rf_model.transform(adj_train)

In [48]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(train_output)

In [49]:
# `accuracy` was evaluated on train_output (predictions for the training data), so its
# complement is the training error, not a test error.
print('Accuracy: ' + str(accuracy*100) + '%')
print('Training Error: ' + str((1.0 - accuracy)*100) + '%')

Accuracy: 19.88898548786197%
Test Error: 80.11101451213804%


In [50]:
print(rf_model)

RandomForestClassificationModel (uid=RandomForestClassifier_111b87f80cfd) with 500 trees


In [51]:
# train_output.groupby("prediction").count().show()

In [52]:
# Score the 20% validation split with the random forest.
# NOTE(review): holdout_set was split off before classes 21 and 22 were filtered out of
# train_set, so it can still contain those two classes, which the model was never fit on.
predictions = rf_model.transform(holdout_set)
# predictions.show(10)
predictions.count()

428062

In [53]:
predictions.groupby("prediction").count().sort('count').show()

+----------+------+
|prediction| count|
+----------+------+
|      12.0|   309|
|       0.0|   524|
|       9.0|  1426|
|      19.0|  1564|
|      13.0|  1682|
|       3.0|  3543|
|       6.0|  3804|
|      11.0|  4759|
|       2.0|  6933|
|      18.0|  7103|
|      17.0|  9618|
|       8.0| 10965|
|       7.0| 17138|
|      10.0| 18286|
|      20.0| 28775|
|      16.0| 38783|
|      14.0| 40857|
|       5.0| 46991|
|       1.0| 56673|
|       4.0|128329|
+----------+------+



##### False-Positive and True-Positive Rates:

In [54]:
correct_pred = predictions.filter(predictions['ucd_short_c'] == predictions['prediction'])
incorrect_pred = predictions.filter(predictions['ucd_short_c'] != predictions['prediction'])

In [55]:
# NOTE: for this multiclass problem these are the overall share of correct and incorrect
# predictions (accuracy and misclassification rate), not per-class true-/false-positive rates.
# Count the scored rows once instead of running a separate Spark job for each ratio.
total_pred = predictions.count()
tpr = correct_pred.count() / total_pred
fpr = incorrect_pred.count() / total_pred

In [56]:
print('True positive rate: ' + str(round((tpr*100), 3)) + '%')
print('False positive rate: ' + str(round((fpr*100), 3)) + '%')

True positive rate: 10.594%
False positive rate: 89.406%


## Hold-out Testing Data 

In [57]:
# Read in test data
test_data = spark.read.option("header",True).option("inferSchema",True).csv("test_data")
test_data.count()

550650

In [58]:
# Keep the outcome column (ucd) together with the candidate predictor columns.
test_columns = ["ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode"]
testing = test_data.select(*test_columns)

In [59]:
testing.printSchema()

root
 |-- ucd: string (nullable = true)
 |-- education: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hispanic_race_recode: integer (nullable = true)



In [60]:
# strip UCD codes to include first letter only
from pyspark.sql.functions import *

# Delete every run of digits from the code; for the codes shown in the sample rows
# (e.g. C61, E142) this leaves just the leading letter.
testing = testing.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))

In [61]:
testing.show(5)

+----+---------+---+---+--------------+--------------------+---------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+----+---------+---+---+--------------+--------------------+---------+
| C61|        1|  M| 82|             M|                   7|        C|
|E142|        1|  F| 74|             M|                   1|        E|
|I429|        2|  M| 70|             S|                   8|        I|
|C349|        3|  M| 89|             W|                   6|        C|
|I119|        3|  M| 67|             S|                   7|        I|
+----+---------+---+---+--------------+--------------------+---------+
only showing top 5 rows



In [62]:
# replace age coding values:
# `replace` is the helper defined in the training section earlier in this notebook; it is
# reused here rather than defined a second time.

testing = testing.withColumn("age", replace(col("age"), 999)) # change age value coded as unknown (999) to null

In [63]:
testing.filter(testing.age == 999).count() # yes - 999 codes have been replaced 

0

In [64]:
testing.groupby("ucd_short").count().orderBy('count', ascending=False).show(5) # top 5 causes of death with counts

+---------+------+
|ucd_short| count|
+---------+------+
|        I|168944|
|        C|119737|
|        J| 54616|
|        G| 39785|
|        F| 27609|
+---------+------+
only showing top 5 rows



In [65]:
testing.select([count(when(col(c).isNull(), c)).alias(c) for c in testing.columns]).show() # rows with null values in each column

+---+---------+---+---+--------------+--------------------+---------+
|ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+---+---------+---+---+--------------+--------------------+---------+
|  0|    13521|  0|102|          4215|                1942|        0|
+---+---------+---+---+--------------+--------------------+---------+



In [66]:
testing.count() # rows in testing set

550650

In [67]:
# drop rows with null or nans
testing = testing.na.drop()

In [68]:
testing.count()

534741

In [69]:
(1 - (534741/550650))*100 # dropped 2.9% of testing dataset because of null values

2.889131026968128

#### Clean and parse data

In [70]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# The indexers are fit on the *training* data and only applied to the test data.
# StringIndexer assigns indices by label frequency in the data it is fit on, so re-fitting
# on the test data could give a category (or a cause-of-death letter) a different index
# from the one the models were trained with.

indexer1 = StringIndexer(inputCol="marital_status", outputCol="marital_status_index")
indexer2 = StringIndexer(inputCol="sex", outputCol="sex_index")
indexer3 = StringIndexer(inputCol='ucd_short', outputCol='ucd_short_c')

# `training` is the cleaned training DataFrame built earlier; it has the same string columns.
# With the default handleInvalid="error", a value never seen in training raises an error
# instead of being silently mis-encoded.
indexed1 = indexer1.fit(training).transform(testing)
indexed2 = indexer2.fit(training).transform(indexed1)
indexed = indexer3.fit(training).transform(indexed2)

In [71]:
indexed.show(10)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
| C61|        1|  M| 82|             M|                   7|        C|                 0.0|      0.0|        1.0|
|E142|        1|  F| 74|             M|                   1|        E|                 0.0|      1.0|        6.0|
|I429|        2|  M| 70|             S|                   8|        I|                 3.0|      0.0|        0.0|
|C349|        3|  M| 89|             W|                   6|        C|                 1.0|      0.0|        1.0|
|I119|        3|  M| 67|             S|                   7|        I|                 3.0|      0.0|        0.0|
|J841|        3|  M| 87|             M|                   6|        J|                 0

In [72]:
# OneHotEncoder transformer to prepare categorical features for model.
# The encoder is fit on the training data and only applied to the test data, so the one-hot
# vector sizes (and hence the feature positions) come from the data the models were fit on.
encoder = OneHotEncoderEstimator(inputCols =["hispanic_race_recode", "education", "marital_status_index"],
                                 outputCols = ["cat_hispanic_race_recode", "cat_education", "cat_marital_status"])

# `training` is the cleaned training DataFrame built earlier; add marital_status_index to it
# so the encoder can be fit on all three input columns.
training_indexed = StringIndexer(inputCol="marital_status",
                                 outputCol="marital_status_index").fit(training).transform(training)

model = encoder.fit(training_indexed)
encoded = model.transform(indexed)
encoded.show(2)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+--------------+------------------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|cat_hispanic_race_recode| cat_education|cat_marital_status|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+--------------+------------------+
| C61|        1|  M| 82|             M|                   7|        C|                 0.0|      0.0|        1.0|           (8,[7],[1.0])|(17,[1],[1.0])|     (3,[0],[1.0])|
|E142|        1|  F| 74|             M|                   1|        E|                 0.0|      1.0|        6.0|           (8,[1],[1.0])|(17,[1],[1.0])|     (3,[0],[1.0])|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+----------------------

In [73]:
# VectorAssembler to package the features for model:
# the three one-hot vectors followed by the numeric sex index and age.
feature_inputs = [
    "cat_hispanic_race_recode",
    "cat_education",
    "cat_marital_status",
    "sex_index",
    "age",
]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
output = assembler.transform(encoded)

In [74]:
output.show(2, truncate=False)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+--------------+------------------+------------------------------------------+
|ucd |education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|cat_hispanic_race_recode|cat_education |cat_marital_status|features                                  |
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+------------------------+--------------+------------------+------------------------------------------+
|C61 |1        |M  |82 |M             |7                   |C        |0.0                 |0.0      |1.0        |(8,[7],[1.0])           |(17,[1],[1.0])|(3,[0],[1.0])     |(30,[7,9,25,29],[1.0,1.0,1.0,82.0])       |
|E142|1        |F  |74 |M             |1                   |E        |0.0                 |1.0      |6.0        |(8,[1],[1.0])          

In [75]:
# Keep only the indexed label (ucd_short_c) and the assembled feature vector.
cleanup = output.select("ucd_short_c", "features")

In [76]:
cleanup.printSchema()

root
 |-- ucd_short_c: double (nullable = false)
 |-- features: vector (nullable = true)



In [77]:
cleanup.groupby("ucd_short_c").count().sort('count').show(23)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       21.0|    21|
|       20.0|   209|
|       19.0|   951|
|       18.0|  1745|
|       17.0|  1990|
|       16.0|  2649|
|       15.0|  2729|
|       14.0|  3139|
|       13.0|  5207|
|       12.0|  6281|
|       11.0|  8049|
|       10.0|  9225|
|        9.0| 10509|
|        8.0| 13731|
|        7.0| 20263|
|        6.0| 23831|
|        5.0| 24710|
|        4.0| 26735|
|        3.0| 39016|
|        2.0| 53003|
|        1.0|116913|
|        0.0|163835|
+-----------+------+



#### If using downsampling function:

In [78]:
# filter out groups '22' and '21' (the same class indices that were removed from train_set)

cleanup = cleanup.filter(cleanup.ucd_short_c != 22)
cleanup = cleanup.filter(cleanup.ucd_short_c != 21)

# per-class row counts that remain after the filter
cleanup.groupby("ucd_short_c").count().sort('count').show(21)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       20.0|   209|
|       19.0|   951|
|       18.0|  1745|
|       17.0|  1990|
|       16.0|  2649|
|       15.0|  2729|
|       14.0|  3139|
|       13.0|  5207|
|       12.0|  6281|
|       11.0|  8049|
|       10.0|  9225|
|        9.0| 10509|
|        8.0| 13731|
|        7.0| 20263|
|        6.0| 23831|
|        5.0| 24710|
|        4.0| 26735|
|        3.0| 39016|
|        2.0| 53003|
|        1.0|116913|
|        0.0|163835|
+-----------+------+



### Modeling - Holdout Set

https://spark.apache.org/docs/2.4.7/ml-classification-regression.html

#### Decision Tree Model

In [79]:
# Score the prepared test data (cleanup) with the decision tree fit earlier.
holdout_preds = dt_model.transform(cleanup)

In [80]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(holdout_preds)

In [81]:
print('Accuracy: ' + str(accuracy*100) + '%')
print('Test Error: ' + str((1.0 - accuracy)*100) + '%')

Accuracy: 11.165095751047277%
Test Error: 88.83490424895272%


In [82]:
holdout_preds.count()

534720

In [83]:
holdout_preds.groupby("prediction").count().sort('count').show()

+----------+------+
|prediction| count|
+----------+------+
|       7.0|   605|
|      11.0|   900|
|      18.0|  4630|
|      17.0|  7279|
|      20.0| 22719|
|       5.0| 38853|
|      16.0| 42686|
|       1.0| 56597|
|      14.0| 66556|
|       2.0| 77740|
|       8.0| 78291|
|       4.0|137864|
+----------+------+



##### False-Positive and True-Positive Rates:

In [84]:
correct_pred = holdout_preds.filter(holdout_preds['ucd_short_c'] == holdout_preds['prediction'])
incorrect_pred = holdout_preds.filter(holdout_preds['ucd_short_c'] != holdout_preds['prediction'])

In [85]:
# NOTE: for this multiclass problem these are the overall share of correct and incorrect
# predictions (accuracy and misclassification rate), not per-class true-/false-positive rates.
# Count the scored rows once instead of running a separate Spark job for each ratio.
total_pred = holdout_preds.count()
tpr = correct_pred.count() / total_pred
fpr = incorrect_pred.count() / total_pred

In [86]:
from builtins import round
print('True positive rate: ' + str(round((tpr*100), 3)) + '%')
print('False positive rate: ' + str(round((fpr*100), 3)) + '%')

True positive rate: 11.165%
False positive rate: 88.835%


#### Random Forest Model

In [87]:
# Score the prepared test data (cleanup) with the random forest fit earlier.
holdout_preds = rf_model.transform(cleanup)

In [88]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(holdout_preds)

In [89]:
print('Accuracy: ' + str(accuracy*100) + '%')
print('Test Error: ' + str((1.0 - accuracy)*100) + '%')

Accuracy: 10.367295032914422%
Test Error: 89.63270496708557%


In [90]:
holdout_preds.count()

534720

In [91]:
holdout_preds.groupby("prediction").count().sort('count').show()

+----------+------+
|prediction| count|
+----------+------+
|      12.0|   367|
|       0.0|   619|
|       9.0|  1820|
|      19.0|  1944|
|      13.0|  2133|
|       3.0|  4411|
|       6.0|  4696|
|      11.0|  5867|
|       2.0|  8706|
|      18.0|  8916|
|      17.0| 11866|
|       8.0| 13724|
|       7.0| 21310|
|      10.0| 23033|
|      20.0| 36138|
|      16.0| 48455|
|      14.0| 52033|
|       5.0| 59483|
|       1.0| 69884|
|       4.0|159315|
+----------+------+



##### False-Positive and True-Positive Rates:

In [92]:
correct_pred = holdout_preds.filter(holdout_preds['ucd_short_c'] == holdout_preds['prediction'])
incorrect_pred = holdout_preds.filter(holdout_preds['ucd_short_c'] != holdout_preds['prediction'])

In [93]:
# NOTE: for this multiclass problem these are the overall share of correct and incorrect
# predictions (accuracy and misclassification rate), not per-class true-/false-positive rates.
# Count the scored rows once instead of running a separate Spark job for each ratio.
total_pred = holdout_preds.count()
tpr = correct_pred.count() / total_pred
fpr = incorrect_pred.count() / total_pred

In [94]:
print('True positive rate: ' + str(round((tpr*100), 3)) + '%')
print('False positive rate: ' + str(round((fpr*100), 3)) + '%')

True positive rate: 10.367%
False positive rate: 89.633%
