### DS5559 Final Project

#### Predicting Primary Cause of Death from Demographic Features

No downsampling*

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.master("local[*]") \
.appName("spark_setup") \
.getOrCreate()

In [3]:
# Read in train data
train_data = spark.read.option("header",True).option("inferSchema",True).csv("train_data")
train_data.count()

2200579

In [4]:
# subset with potential features
training = train_data.select("ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode")

In [5]:
# strip UCD codes to include first letter only
from pyspark.sql.functions import *

training = training.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))


In [6]:
# replace age coding values: 
def replace(column, value):
    return when(column != value, column)

training = training.withColumn("age", replace(col("age"), 999)) # change age value coded as unknown (999) to null

In [7]:
training.filter(training.age == 999).count() # yes - 999 codes have been replaced 

0

In [8]:
training.groupby("ucd_short").count().orderBy('count', ascending=False).show(5) # top 5 causes of death with counts

+---------+------+
|ucd_short| count|
+---------+------+
|        I|674836|
|        C|477413|
|        J|216611|
|        G|159912|
|        F|111510|
+---------+------+
only showing top 5 rows



In [9]:
training.select([count(when(col(c).isNull(), c)).alias(c) for c in training.columns]).show() # rows with null values in each column

+---+---------+---+---+--------------+--------------------+---------+
|ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+---+---------+---+---+--------------+--------------------+---------+
|  0|    54142|  0|382|         16592|                7871|        0|
+---+---------+---+---+--------------+--------------------+---------+



In [10]:
training.count() # rows in training set

2200579

In [11]:
# drop rows with null or nans
training = training.na.drop()

In [12]:
training.count()

2136639

In [13]:
(1 - (2136639/2200579))*100 # dropped 2.9% of dataset because of null values

2.9055989355528666

#### Clean and parse data

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml import Pipeline

In [15]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# Can't make this work with multiple columns but docs say it's possible.  spark 3.0 difference?

indexer1 = StringIndexer(inputCol="marital_status", outputCol="marital_status_index")
indexer2 = StringIndexer(inputCol="sex", outputCol="sex_index")
indexer3 = StringIndexer(inputCol='ucd_short', outputCol='ucd_short_c')

indexed1 = indexer1.fit(training).transform(training)
indexed2 = indexer2.fit(indexed1).transform(indexed1)
indexed = indexer3.fit(indexed2).transform(indexed2)

In [16]:
# indexed.show(5)

In [17]:
# save UCD mappings 
ucd_xref = indexed.select('ucd_short_c', 'ucd_short').distinct()
ucd_xref.show(25)

+-----------+---------+
|ucd_short_c|ucd_short|
+-----------+---------+
|        0.0|        I|
|        6.0|        E|
|        1.0|        C|
|       21.0|        H|
|        3.0|        G|
|       20.0|        O|
|        5.0|        X|
|        9.0|        A|
|       14.0|        B|
|        2.0|        J|
|       13.0|        D|
|        4.0|        F|
|       19.0|        L|
|       11.0|        V|
|       10.0|        W|
|       18.0|        Q|
|       22.0|        U|
|       12.0|        R|
|        7.0|        K|
|       15.0|        Y|
|        8.0|        N|
|       17.0|        P|
|       16.0|        M|
+-----------+---------+



In [18]:
# OneHotEncoder transformer to prepare categorical features for model
encoder = OneHotEncoderEstimator(inputCols =["hispanic_race_recode", "education", "marital_status_index"],
                                 outputCols = ["cat_hispanic_race_recode", "cat_education", "cat_marital_status"])


model = encoder.fit(indexed)
encoded = model.transform(indexed)

In [19]:
# VectorAssembler to package the features for model
assembler = VectorAssembler(inputCols=["cat_hispanic_race_recode", "cat_education", "cat_marital_status", "sex_index", "age"],
                            outputCol="features")
output = assembler.transform(encoded)

In [20]:
cleanup = output.select("ucd_short_c", "features")

In [21]:
train_set, holdout_set = cleanup.randomSplit(weights=[0.8, 0.2], seed=212) ## split into train set and validation set

In [22]:
train_set.groupby("ucd_short_c").count().sort('count').show(5)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|       22.0|    2|
|       21.0|   91|
|       20.0|  698|
|       19.0| 2992|
|       18.0| 5881|
+-----------+-----+
only showing top 5 rows



In [23]:
# there are only 2 observations for group '22' and 91 for group '21' - filter out for now 

train_set = train_set.filter(train_set.ucd_short_c != 22)
train_set = train_set.filter(train_set.ucd_short_c != 21)

In [24]:
train_set.groupby("ucd_short_c").count().sort('count').show(5)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|       20.0|  698|
|       19.0| 2992|
|       18.0| 5881|
|       17.0| 6413|
|       16.0| 8471|
+-----------+-----+
only showing top 5 rows



In [25]:
train_set.select('ucd_short_c').distinct().count()

21

There are now 21 different outcomes for UCD. 

### Modeling

https://spark.apache.org/docs/2.4.7/ml-classification-regression.html

#### Decision Tree Model

In [25]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="ucd_short_c", featuresCol="features")

In [26]:
dt_model = dt.fit(train_set)

In [27]:
dt_train_output = dt_model.transform(train_set)

In [28]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(dt_train_output)

In [29]:
print('DT Accuracy: ' + str(accuracy*100) + '%')
# print('DT Test Error: ' + str((1.0 - accuracy)*100) + '%')

DT Accuracy: 34.7911775872387%


In [30]:
print(dt_model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_52c2d7a5328c) of depth 5 with 39 nodes


In [31]:
# visualize

# # save model
# dt_model.save('home/ds5559/final_project/models/dt_model')
# modeldf = spark.read.parquet('home/ds5559/final_project/models/dt_model/'+"data/*")

In [32]:
# # visualize decision tree
# noderows = modeldf.select("id","prediction","leftChild","rightChild","split").collect()
# df = pd.Dataframe([[rw['id'],rw['gain],rw['impurity'],rw['gini']] for rw in noderows if rw['leftChild'] < 0 and rw['rightChild'] < 0])
# df.show()

In [33]:
dt_predictions = dt_model.transform(holdout_set)

In [34]:
# dt_predictions.groupby("prediction").count().sort('count').show()

In [35]:
dt_pred_accuracy = evaluator.evaluate(dt_predictions)

In [36]:
print('DT Validation Hold-out Set Prediction Accuracy: ' + str(dt_pred_accuracy*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

DT Validation Hold-out Set Prediction Accuracy: 34.82968765730099%


#### Random Forest Model

In [37]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='ucd_short_c', featuresCol="features", numTrees=100)

In [38]:
rf_model = rf.fit(train_set)

In [39]:
rf_train_output = rf_model.transform(train_set)

In [40]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

rf_accuracy = evaluator.evaluate(rf_train_output)

In [41]:
print('RF Accuracy: ' + str(rf_accuracy*100) + '%')
# print('RF Test Error: ' + str((1.0 - rf_accuracy)*100) + '%')

RF Accuracy: 34.287673180185784%


In [42]:
print(rf_model)

RandomForestClassificationModel (uid=RandomForestClassifier_3f787b98fc9e) with 100 trees


In [43]:
rf_predictions = rf_model.transform(holdout_set)

In [44]:
# rf_predictions.groupby("prediction").count().sort('count').show()

In [45]:
rf_pred_accuracy = evaluator.evaluate(rf_predictions)

In [46]:
print('RF Validation Hold-out Set Prediction Accuracy: ' + str(rf_pred_accuracy*100) + '%')
# print('RF Hold-out Set Test Error: ' + str((1.0 - rf_pred_accuracy)*100) + '%')

RF Validation Hold-out Set Prediction Accuracy: 34.27950000819428%


#### Naive Bayes Model

In [47]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(labelCol='ucd_short_c', featuresCol="features", smoothing=1.0, modelType="multinomial")

In [48]:
nb_model = nb.fit(train_set)

In [49]:
nb_train_output = nb_model.transform(train_set)

In [50]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

nb_accuracy = evaluator.evaluate(nb_train_output)

In [51]:
print('NB Accuracy: ' + str(nb_accuracy*100) + '%')
# print('RF Test Error: ' + str((1.0 - rf_accuracy)*100) + '%')

NB Accuracy: 31.846024877458362%


In [52]:
print(nb_model)

NaiveBayes_4ba42fab39b4


In [53]:
nb_predictions = nb_model.transform(holdout_set)

In [54]:
# rf_predictions.groupby("prediction").count().sort('count').show()

In [55]:
nb_pred_accuracy = evaluator.evaluate(nb_predictions)

In [56]:
print('NB Validation Hold-out Set Prediction Accuracy: ' + str(nb_pred_accuracy*100) + '%')
# print('RF Hold-out Set Test Error: ' + str((1.0 - rf_pred_accuracy)*100) + '%')

NB Validation Hold-out Set Prediction Accuracy: 31.85071419039302%


#### Logistic Regression Model

In [57]:
from pyspark.ml.classification import LogisticRegression

mlr = LogisticRegression(labelCol='ucd_short_c', featuresCol="features", family = "multinomial")

In [58]:
mlr_model = mlr.fit(train_set)

In [59]:
# print(str(mlr_model.coefficientMatrix))

In [60]:
mlr_train_output = mlr_model.transform(train_set)

In [61]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

mlr_accuracy = evaluator.evaluate(mlr_train_output)

In [62]:
print('MLR Accuracy: ' + str(mlr_accuracy*100) + '%')
# print('MLR Test Error: ' + str((1.0 - mlr_accuracy)*100) + '%')

MLR Accuracy: 34.532376205014685%


In [63]:
print(mlr_model)

LogisticRegressionModel: uid = LogisticRegression_a07e431f7742, numClasses = 23, numFeatures = 30


In [64]:
mlr_predictions = mlr_model.transform(holdout_set)

In [65]:
# mlr_predictions.groupby("prediction").count().sort('count').show()

In [66]:
mlr_pred_accuracy = evaluator.evaluate(mlr_predictions)

In [67]:
print('MLR Validation Hold-out Set Prediction Accuracy: ' + str(mlr_pred_accuracy*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

MLR Validation Hold-out Set Prediction Accuracy: 34.540312366111245%


### Final Hold-Out Test Set 

In [68]:
# Read in test data
test_data = spark.read.option("header",True).option("inferSchema",True).csv("test_data")
test_data.count()

550650

In [69]:
# subset with potential features
testing = test_data.select("ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode")

In [70]:
# strip UCD codes to include first letter only
testing = testing.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))

In [71]:
testing.show(5)

+----+---------+---+---+--------------+--------------------+---------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+----+---------+---+---+--------------+--------------------+---------+
| C61|        1|  M| 82|             M|                   7|        C|
|E142|        1|  F| 74|             M|                   1|        E|
|I429|        2|  M| 70|             S|                   8|        I|
|C349|        3|  M| 89|             W|                   6|        C|
|I119|        3|  M| 67|             S|                   7|        I|
+----+---------+---+---+--------------+--------------------+---------+
only showing top 5 rows



In [72]:
# replace age coding values: 
testing = testing.withColumn("age", replace(col("age"), 999)) # change age value coded as unknown (999) to null

In [73]:
# drop rows with null or nans
testing = testing.na.drop()

In [74]:
testing.count()

534741

#### Clean and parse data

In [75]:
# join with ucd_short_c cross reference - want to ensure that string indexing matches between train and test data!
testing = testing.join(ucd_xref, on = 'ucd_short')

In [76]:
testing.show(5)

+---------+----+---------+---+---+--------------+--------------------+-----------+
|ucd_short| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short_c|
+---------+----+---------+---+---+--------------+--------------------+-----------+
|        K|K922|        3|  F| 94|             W|                   6|        7.0|
|        K|K746|        2|  M| 72|             M|                   6|        7.0|
|        K|K650|        3|  M| 86|             W|                   6|        7.0|
|        K|K922|        3|  F| 81|             W|                   6|        7.0|
|        K|K746|        3|  F| 72|             M|                   6|        7.0|
+---------+----+---------+---+---+--------------+--------------------+-----------+
only showing top 5 rows



In [77]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# leaving off the ucd indexer here - already joined with ucd_xref

indexed1 = indexer1.fit(testing).transform(testing)
indexed = indexer2.fit(indexed1).transform(indexed1)

In [78]:
indexed.show(2)

+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
|ucd_short| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short_c|marital_status_index|sex_index|
+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
|        K|K922|        3|  F| 94|             W|                   6|        7.0|                 1.0|      1.0|
|        K|K746|        2|  M| 72|             M|                   6|        7.0|                 0.0|      0.0|
+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
only showing top 2 rows



In [79]:
# OneHotEncoder transformer to prepare categorical features for model
model = encoder.fit(indexed)
encoded = model.transform(indexed)

In [80]:
# VectorAssembler to package the features for model
output = assembler.transform(encoded)

In [81]:
cleanup = output.select("ucd_short_c", "features")

In [82]:
cleanup.groupby("ucd_short_c").count().sort('count').show(5)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|       21.0|   21|
|       20.0|  209|
|       19.0|  951|
|       18.0| 1745|
|       17.0| 1990|
+-----------+-----+
only showing top 5 rows



In [83]:
# filter out groups '20' and '21' (consistent with training data pre-processing)

cleanup = cleanup.filter(cleanup.ucd_short_c != 22)
cleanup = cleanup.filter(cleanup.ucd_short_c != 21)

### Modeling - Holdout Set

#### Decision Tree

In [84]:
dt_predictions_f = dt_model.transform(cleanup)

In [85]:
dt_pred_accuracy_f = evaluator.evaluate(dt_predictions_f)

In [86]:
print('DT Final Hold-out Set Prediction Accuracy: ' + str(dt_pred_accuracy_f*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

DT Final Hold-out Set Prediction Accuracy: 34.783438061041295%


#### Random Forest

In [87]:
rf_predictions_f = rf_model.transform(cleanup)

In [88]:
rf_pred_accuracy_f = evaluator.evaluate(rf_predictions_f)

In [89]:
print('RF Final Hold-out Set Prediction Accuracy: ' + str(rf_pred_accuracy_f*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

RF Final Hold-out Set Prediction Accuracy: 34.31291143028127%


#### Naive Bayes

In [90]:
nb_predictions_f = nb_model.transform(cleanup)

In [91]:
nb_pred_accuracy_f = evaluator.evaluate(nb_predictions_f)

In [92]:
print('NB Validation Hold-out Set Prediction Accuracy: ' + str(nb_pred_accuracy_f*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

NB Validation Hold-out Set Prediction Accuracy: 31.813098444045483%


#### Multinomial Logistic Regression

In [93]:
mlr_predictions_f = mlr_model.transform(cleanup)

In [94]:
mlr_pred_accuracy_f = evaluator.evaluate(mlr_predictions_f)

In [95]:
print('MLR Validation Hold-out Set Prediction Accuracy: ' + str(mlr_pred_accuracy_f*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

MLR Validation Hold-out Set Prediction Accuracy: 34.5384500299222%


### Additional Statistics

Multinomial Linear Regression 

In [96]:
mlr_summary = mlr_model.summary

In [97]:
accuracy = mlr_summary.accuracy
falsePositiveRate = mlr_summary.weightedFalsePositiveRate
truePositiveRate = mlr_summary.weightedTruePositiveRate
fMeasure = mlr_summary.weightedFMeasure()
precision = mlr_summary.weightedPrecision
recall = mlr_summary.weightedRecall

In [98]:
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.34532376205014687
FPR: 0.251448908622885
TPR: 0.3453237620501469
F-measure: 0.2436907510505344
Precision: 0.22353801080339267
Recall: 0.3453237620501469


In [26]:
# Save notebook as PDF document
!jupyter nbconvert --to pdf `pwd`/*.ipynb

[NbConvertApp] Converting notebook /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models.ipynb to pdf
[NbConvertApp] Writing 85997 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 75070 bytes to /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models.pdf
[NbConvertApp] Converting notebook /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models_DS.ipynb to pdf
[NbConvertApp] Writing 98157 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 83886 bytes to /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models_DS.pdf
[NbConvertApp]