### DS5559 Final Project

#### Predicting Primary Cause of Death from Demographic Features

Downsampled data to balance primary cause of death class distribution

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_setup")
    .getOrCreate()
)

In [2]:
# Read in train data
train_data = spark.read.option("header",True).option("inferSchema",True).csv("train_data")
train_data.count()

2200579

In [3]:
# subset with potential features
training = train_data.select("ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode")

In [4]:
# strip UCD codes to include first letter only
from pyspark.sql.functions import *

training = training.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))


In [5]:
# replace age coding values: 
def replace(column, value):
    """Build a column expression that nulls out one sentinel value.

    Args:
        column: Spark column expression to clean.
        value: sentinel that should be treated as "unknown".

    Returns:
        A `when` expression with no `otherwise` branch: rows where `column`
        differs from `value` keep their value, the rest get no match (null).
    """
    differs_from_sentinel = column != value
    return when(differs_from_sentinel, column)

training = training.withColumn("age", replace(col("age"), 999)) # change age value coded as unknown (999) to null

In [6]:
training.filter(training.age == 999).count() # yes - 999 codes have been replaced 

0

In [7]:
training.groupby("ucd_short").count().orderBy('count', ascending=False).show(5) # top 5 causes of death with counts

+---------+------+
|ucd_short| count|
+---------+------+
|        I|674836|
|        C|477413|
|        J|216611|
|        G|159912|
|        F|111510|
+---------+------+
only showing top 5 rows



In [8]:
training.select([count(when(col(c).isNull(), c)).alias(c) for c in training.columns]).show() # rows with null values in each column

+---+---------+---+---+--------------+--------------------+---------+
|ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+---+---------+---+---+--------------+--------------------+---------+
|  0|    54142|  0|382|         16592|                7871|        0|
+---+---------+---+---+--------------+--------------------+---------+



In [9]:
training.count() # rows in training set

2200579

In [10]:
# drop rows with null or nans
training = training.na.drop()

In [11]:
training.count()

2136639

In [12]:
# Percentage of rows removed by na.drop(). Computed from the live DataFrames
# (raw `train_data` vs. cleaned `training`) instead of hand-copied counts, so
# the figure cannot go stale if the input data changes. About 2.9% on this run.
(1 - (training.count() / train_data.count())) * 100

2.9055989355528666

#### Clean and parse data

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml import Pipeline

In [14]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# One indexer per column, because the multi-column form could not be made to work here.
# NOTE(review): multi-column StringIndexer (inputCols/outputCols) appears to be a Spark 3.0
# addition, while this notebook uses the 2.4-era API (see the 2.4.7 docs link in the
# Modeling section) -- confirm the Spark version before switching.

indexer1 = StringIndexer(inputCol="marital_status", outputCol="marital_status_index")
indexer2 = StringIndexer(inputCol="sex", outputCol="sex_index")
# ucd_short_c is the numeric label column the classifiers are trained on
indexer3 = StringIndexer(inputCol='ucd_short', outputCol='ucd_short_c')

# fit each indexer on the training rows and apply them one after another;
# every step adds one indexed column to the frame produced by the previous step
indexed1 = indexer1.fit(training).transform(training)
indexed2 = indexer2.fit(indexed1).transform(indexed1)
indexed = indexer3.fit(indexed2).transform(indexed2)

In [15]:
indexed.show(5)

+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|marital_status_index|sex_index|ucd_short_c|
+----+---------+---+---+--------------+--------------------+---------+--------------------+---------+-----------+
|K746|        6|  M| 85|             M|                   6|        K|                 0.0|      0.0|        7.0|
|I249|        6|  M|100|             D|                   6|        I|                 2.0|      0.0|        0.0|
|N321|        3|  F| 66|             M|                   6|        N|                 0.0|      1.0|        8.0|
| W19|        1|  F| 94|             W|                   6|        W|                 1.0|      1.0|       10.0|
| W18|        1|  F| 92|             S|                   4|        W|                 3.0|      1.0|       10.0|
+----+---------+---+---+--------------+--------------------+---------+------------------

In [16]:
# save UCD mappings 
ucd_xref = indexed.select('ucd_short_c', 'ucd_short').distinct()
ucd_xref.sort('ucd_short_c').show(25)

+-----------+---------+
|ucd_short_c|ucd_short|
+-----------+---------+
|        0.0|        I|
|        1.0|        C|
|        2.0|        J|
|        3.0|        G|
|        4.0|        F|
|        5.0|        X|
|        6.0|        E|
|        7.0|        K|
|        8.0|        N|
|        9.0|        A|
|       10.0|        W|
|       11.0|        V|
|       12.0|        R|
|       13.0|        D|
|       14.0|        B|
|       15.0|        Y|
|       16.0|        M|
|       17.0|        P|
|       18.0|        Q|
|       19.0|        L|
|       20.0|        O|
|       21.0|        H|
|       22.0|        U|
+-----------+---------+



In [17]:
# OneHotEncoder transformer to prepare categorical features for model
encoder = OneHotEncoderEstimator(inputCols =["hispanic_race_recode", "education", "marital_status_index"],
                                 outputCols = ["cat_hispanic_race_recode", "cat_education", "cat_marital_status"])


model = encoder.fit(indexed)
encoded = model.transform(indexed)

In [18]:
# VectorAssembler to package the features for model
assembler = VectorAssembler(inputCols=["cat_hispanic_race_recode", "cat_education", "cat_marital_status", "sex_index", "age"],
                            outputCol="features")
output = assembler.transform(encoded)

In [19]:
cleanup = output.select("ucd_short_c", "features")

In [20]:
train_set, holdout_set = cleanup.randomSplit(weights=[0.8, 0.2], seed=212) ## split into train set and validation set

In [21]:
train_set.groupby("ucd_short_c").count().sort('count').show(23)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       22.0|     2|
|       21.0|    91|
|       20.0|   698|
|       19.0|  2992|
|       18.0|  5881|
|       17.0|  6413|
|       16.0|  8471|
|       15.0|  8516|
|       14.0| 10063|
|       13.0| 16737|
|       12.0| 20135|
|       11.0| 25557|
|       10.0| 30051|
|        9.0| 33139|
|        8.0| 43393|
|        7.0| 65607|
|        6.0| 76747|
|        5.0| 78917|
|        4.0| 86335|
|        3.0|125333|
|        2.0|167798|
|        1.0|372852|
|        0.0|523784|
+-----------+------+



In [22]:
# classes 22 and 21 have only 2 and 91 training observations respectively,
# so both are left out for now (single predicate instead of two passes)
train_set = train_set.filter(
    (train_set.ucd_short_c != 22) & (train_set.ucd_short_c != 21)
)

In [23]:
train_set.count()

1709419

In [24]:
train_set.groupby("ucd_short_c").count().sort('count').show(21)

+-----------+------+
|ucd_short_c| count|
+-----------+------+
|       20.0|   698|
|       19.0|  2992|
|       18.0|  5881|
|       17.0|  6413|
|       16.0|  8471|
|       15.0|  8516|
|       14.0| 10063|
|       13.0| 16737|
|       12.0| 20135|
|       11.0| 25557|
|       10.0| 30051|
|        9.0| 33139|
|        8.0| 43393|
|        7.0| 65607|
|        6.0| 76747|
|        5.0| 78917|
|        4.0| 86335|
|        3.0|125333|
|        2.0|167798|
|        1.0|372852|
|        0.0|523784|
+-----------+------+



In [25]:
train_set.select('ucd_short_c').distinct().count()

21

There are now 21 different outcomes for UCD. 

#### Balance training data with downsampling

In [26]:
# percentage of observations that are circulatory
class_i = train_set.filter(train_set.ucd_short_c == '0.0').count()
class_i_pct = class_i / (train_set.count())

In [27]:
print('Initial Percent Circulatory: ' + str(class_i_pct*100) + '%')

Initial Percent Circulatory: 30.64105406573813%


In [28]:
# write function to do downsampling

def downSample(df, target, seed):
    """Down-sample every class in `df` to roughly the size of the smallest class.

    Args:
        df: DataFrame holding the rows to balance.
        target: name of the column that identifies the outcome class.
        seed: seed passed to every per-class `sample` call.

    Returns:
        A DataFrame with all rows of the smallest class plus a random sample
        (without replacement) of each other class, drawn with fraction
        smallest_count / class_count. Sampling is by fraction, so the
        resulting class sizes are approximately, not exactly, equal.
        An input with no rows is returned unchanged.
    """
    # Collect the per-class counts exactly once. The table has one row per
    # class, so it is tiny, and reusing the collected list avoids re-running
    # the groupBy job on every loop iteration.
    class_rows = df.groupby(target).count().collect()

    # nothing to balance when the input has no rows at all
    if not class_rows:
        return df

    # smallest class and its size (first one wins when several classes tie)
    smallest_row = min(class_rows, key=lambda row: row['count'])
    smallest_class = smallest_row[target]
    smallest_class_size = smallest_row['count']

    # start from the smallest class, which is kept in full
    adjusted_df = df.filter(df[target] == smallest_class)

    # sample every other class down towards the size of the smallest class
    for row in class_rows:
        outcome_class = row[target]
        if outcome_class == smallest_class:
            continue

        ratio = float(smallest_class_size) / row['count']
        subset = df.filter(df[target] == outcome_class)
        subset_adjusted = subset.sample(False, ratio, seed=seed)

        adjusted_df = adjusted_df.union(subset_adjusted)

    return adjusted_df

In [29]:
adj_train = downSample(df = train_set, target = 'ucd_short_c', seed = 4)
adj_train.groupby("ucd_short_c").count().sort('count').show(21)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|        0.0|  661|
|       14.0|  679|
|        1.0|  682|
|       18.0|  686|
|       16.0|  697|
|        5.0|  697|
|       20.0|  698|
|        8.0|  700|
|       15.0|  701|
|       19.0|  704|
|       17.0|  705|
|        9.0|  708|
|       12.0|  712|
|        4.0|  713|
|        7.0|  717|
|        3.0|  718|
|       10.0|  720|
|        6.0|  722|
|       13.0|  723|
|        2.0|  730|
|       11.0|  740|
+-----------+-----+



In [30]:
# adjusted percentage of observations that are circulatory
class_i = adj_train.filter(adj_train.ucd_short_c == '0.0').count()
class_i_pct = class_i / (adj_train.count())

In [31]:
print('Adjusted Percent Circulatory: ' + str(class_i_pct*100) + '%')

Adjusted Percent Circulatory: 4.462296631337339%


### Modeling

https://spark.apache.org/docs/2.4.7/ml-classification-regression.html

#### Decision Tree Model

In [32]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="ucd_short_c", featuresCol="features")

In [33]:
dt_model = dt.fit(adj_train)

In [34]:
dt_train_output = dt_model.transform(adj_train)

In [35]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(dt_train_output)

In [36]:
# dt_train_output.groupby('ucd_short_c').count().show(25)

In [37]:
# dt_train_output.groupby('prediction').count().show(25)

In [38]:
print('DT Accuracy: ' + str(accuracy*100) + '%')
# print('DT Test Error: ' + str((1.0 - accuracy)*100) + '%')

DT Accuracy: 19.570647404307028%


In [39]:
print(dt_model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_e1e941a52b26) of depth 5 with 39 nodes


In [40]:
# visualize

# # save model
# dt_model.save('home/ds5559/final_project/models/dt_model')
# modeldf = spark.read.parquet('home/ds5559/final_project/models/dt_model/'+"data/*")

In [41]:
# # visualize decision tree
# noderows = modeldf.select("id","prediction","leftChild","rightChild","split").collect()
# df = pd.Dataframe([[rw['id'],rw['gain],rw['impurity'],rw['gini']] for rw in noderows if rw['leftChild'] < 0 and rw['rightChild'] < 0])
# df.show()

In [42]:
dt_predictions = dt_model.transform(holdout_set)

In [43]:
# dt_predictions.groupby("prediction").count().sort('count').show()

In [44]:
dt_pred_accuracy = evaluator.evaluate(dt_predictions)

In [45]:
print('DT Hold-out Set Prediction Accuracy: ' + str(dt_pred_accuracy*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

DT Hold-out Set Prediction Accuracy: 12.832951323611011%


#### Random Forest Model

In [46]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='ucd_short_c', featuresCol="features", numTrees=100)

In [47]:
rf_model = rf.fit(adj_train)

In [48]:
rf_train_output = rf_model.transform(adj_train)

In [49]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

rf_accuracy = evaluator.evaluate(rf_train_output)

In [50]:
print('RF Accuracy: ' + str(rf_accuracy*100) + '%')
# print('RF Test Error: ' + str((1.0 - rf_accuracy)*100) + '%')

RF Accuracy: 20.31998919867684%


In [51]:
print(rf_model)

RandomForestClassificationModel (uid=RandomForestClassifier_69cf05bd1001) with 100 trees


In [52]:
rf_predictions = rf_model.transform(holdout_set)

In [53]:
# rf_predictions.groupby("prediction").count().sort('count').show()

In [54]:
rf_pred_accuracy = evaluator.evaluate(rf_predictions)

In [55]:
print('RF Hold-out Set Prediction Accuracy: ' + str(rf_pred_accuracy*100) + '%')
# print('RF Hold-out Set Test Error: ' + str((1.0 - rf_pred_accuracy)*100) + '%')

RF Hold-out Set Prediction Accuracy: 13.406551213105237%


#### Naive Bayes Model

In [56]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(labelCol='ucd_short_c', featuresCol="features", smoothing=1.0, modelType="multinomial")

In [57]:
nb_model = nb.fit(adj_train)

In [58]:
nb_train_output = nb_model.transform(adj_train)

In [59]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

nb_accuracy = evaluator.evaluate(nb_train_output)

In [60]:
print('NB Accuracy: ' + str(nb_accuracy*100) + '%')
# print('RF Test Error: ' + str((1.0 - rf_accuracy)*100) + '%')

NB Accuracy: 16.809559170998448%


In [61]:
print(nb_model)

NaiveBayes_d1461db29d2d


In [62]:
nb_predictions = nb_model.transform(holdout_set)

In [63]:
# rf_predictions.groupby("prediction").count().sort('count').show()

In [64]:
nb_pred_accuracy = evaluator.evaluate(nb_predictions)

In [65]:
print('NB Validation Hold-out Set Prediction Accuracy: ' + str(nb_pred_accuracy*100) + '%')
# print('RF Hold-out Set Test Error: ' + str((1.0 - rf_pred_accuracy)*100) + '%')

NB Validation Hold-out Set Prediction Accuracy: 11.735853739051851%


#### Logistic Regression Model

In [66]:
from pyspark.ml.classification import LogisticRegression

mlr = LogisticRegression(labelCol='ucd_short_c', featuresCol="features", family = "multinomial")

In [67]:
mlr_model = mlr.fit(adj_train)

In [68]:
# print(str(mlr_model.coefficientMatrix))

In [69]:
mlr_train_output = mlr_model.transform(adj_train)

In [70]:
# Model summary

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ucd_short_c", predictionCol="prediction", metricName="accuracy")

mlr_accuracy = evaluator.evaluate(mlr_train_output)

In [71]:
print('MLR Accuracy: ' + str(mlr_accuracy*100) + '%')
# print('MLR Test Error: ' + str((1.0 - mlr_accuracy)*100) + '%')

MLR Accuracy: 19.860932964288125%


In [72]:
print(mlr_model)

LogisticRegressionModel: uid = LogisticRegression_c7cbb7f25e66, numClasses = 23, numFeatures = 30


In [73]:
mlr_predictions = mlr_model.transform(holdout_set)

In [74]:
# mlr_predictions.groupby("prediction").count().sort('count').show()

In [75]:
mlr_pred_accuracy = evaluator.evaluate(mlr_predictions)

In [76]:
print('MLR Hold-out Set Prediction Accuracy: ' + str(mlr_pred_accuracy*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

MLR Hold-out Set Prediction Accuracy: 11.235534161970563%


### Final Hold-Out Test Set 

In [77]:
# Read in test data
test_data = spark.read.option("header",True).option("inferSchema",True).csv("test_data")
test_data.count()

550650

In [78]:
# subset with potential features
testing = test_data.select("ucd", "education", "sex", "age", "marital_status", "hispanic_race_recode")

In [79]:
# strip UCD codes to include first letter only
testing = testing.withColumn("ucd_short", regexp_replace('ucd', '\\d+', ''))

In [80]:
testing.show(5)

+----+---------+---+---+--------------+--------------------+---------+
| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short|
+----+---------+---+---+--------------+--------------------+---------+
| C61|        1|  M| 82|             M|                   7|        C|
|E142|        1|  F| 74|             M|                   1|        E|
|I429|        2|  M| 70|             S|                   8|        I|
|C349|        3|  M| 89|             W|                   6|        C|
|I119|        3|  M| 67|             S|                   7|        I|
+----+---------+---+---+--------------+--------------------+---------+
only showing top 5 rows



In [81]:
# replace age coding values: 
testing = testing.withColumn("age", replace(col("age"), 999)) # change age value coded as unknown (999) to null

In [82]:
# drop rows with null or nans
testing = testing.na.drop()

In [83]:
testing.count()

534741

#### Clean and parse data

In [84]:
# join with ucd_short_c cross reference - want to ensure that string indexing matches between train and test data!
testing = testing.join(ucd_xref, on = 'ucd_short')

In [85]:
testing.show(5)

+---------+----+---------+---+---+--------------+--------------------+-----------+
|ucd_short| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short_c|
+---------+----+---------+---+---+--------------+--------------------+-----------+
|        K|K922|        3|  F| 94|             W|                   6|        7.0|
|        K|K746|        2|  M| 72|             M|                   6|        7.0|
|        K|K650|        3|  M| 86|             W|                   6|        7.0|
|        K|K922|        3|  F| 81|             W|                   6|        7.0|
|        K|K746|        3|  F| 72|             M|                   6|        7.0|
+---------+----+---------+---+---+--------------+--------------------+-----------+
only showing top 5 rows



In [86]:
# StringIndexer transformers to encode categorical features as numerical for vector assembly.
# leaving off the ucd indexer here - already joined with ucd_xref
# The indexers are fitted on `training`, not on `testing`: StringIndexer assigns indices by
# category frequency in the data it is fitted on, so fitting on the test rows could give the
# same marital_status / sex value a different index than the one the models were trained with.
# NOTE(review): with the default handleInvalid setting, a category that appears only in the
# test data is expected to raise at transform time rather than be silently mis-encoded.
indexed1 = indexer1.fit(training).transform(testing)
indexed = indexer2.fit(training).transform(indexed1)

In [87]:
indexed.show(2)

+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
|ucd_short| ucd|education|sex|age|marital_status|hispanic_race_recode|ucd_short_c|marital_status_index|sex_index|
+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
|        K|K922|        3|  F| 94|             W|                   6|        7.0|                 1.0|      1.0|
|        K|K746|        2|  M| 72|             M|                   6|        7.0|                 0.0|      0.0|
+---------+----+---------+---+---+--------------+--------------------+-----------+--------------------+---------+
only showing top 2 rows



In [88]:
# OneHotEncoder transformer to prepare categorical features for model
# Reuse `model`, the encoder already fitted on the training rows, instead of re-fitting on the
# test rows. A re-fit sizes each one-hot vector from the test data's own category values, which
# can change the feature vector layout relative to what the classifiers were trained on.
encoded = model.transform(indexed)

In [89]:
# VectorAssembler to package the features for model
assembler = VectorAssembler(inputCols=["cat_hispanic_race_recode", "cat_education", "cat_marital_status", "sex_index", "age"],
                            outputCol="features")
output = assembler.transform(encoded)

In [90]:
cleanup = output.select("ucd_short_c", "features")

In [91]:
cleanup.groupby("ucd_short_c").count().sort('count').show(5)

+-----------+-----+
|ucd_short_c|count|
+-----------+-----+
|       21.0|   21|
|       20.0|  209|
|       19.0|  951|
|       18.0| 1745|
|       17.0| 1990|
+-----------+-----+
only showing top 5 rows



In [92]:
# filter out groups '21' and '22' (consistent with training data pre-processing)
cleanup = cleanup.filter(
    (cleanup.ucd_short_c != 22) & (cleanup.ucd_short_c != 21)
)

### Modeling - Holdout Set

#### Decision Tree

In [93]:
dt_predictions_f = dt_model.transform(cleanup)

In [94]:
dt_pred_accuracy_f = evaluator.evaluate(dt_predictions_f)

In [95]:
print('DT Final Hold-out Set Prediction Accuracy: ' + str(dt_pred_accuracy_f*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

DT Final Hold-out Set Prediction Accuracy: 12.8038973668462%


#### Random Forest

In [96]:
rf_predictions_f = rf_model.transform(cleanup)

In [97]:
rf_pred_accuracy_f = evaluator.evaluate(rf_predictions_f)

In [98]:
print('RF Final Hold-out Set Prediction Accuracy: ' + str(rf_pred_accuracy_f*100) + '%')
# print('DT Hold-out Set Test Error: ' + str((1.0 - dt_pred_accuracy)*100) + '%')

RF Final Hold-out Set Prediction Accuracy: 13.530632854578098%


#### Naive Bayes

In [99]:
nb_predictions_f = nb_model.transform(cleanup)

In [100]:
nb_pred_accuracy_f = evaluator.evaluate(nb_predictions_f)

In [101]:
print('NB Validation Hold-out Set Prediction Accuracy: ' + str(nb_pred_accuracy_f*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

NB Validation Hold-out Set Prediction Accuracy: 11.839654398563734%


#### Multinomial Logistic Regression

In [102]:
mlr_predictions_f = mlr_model.transform(cleanup)

In [103]:
mlr_pred_accuracy_f = evaluator.evaluate(mlr_predictions_f)

In [104]:
print('MLR Validation Hold-out Set Prediction Accuracy: ' + str(mlr_pred_accuracy_f*100) + '%')
# print('MLR Hold-out Set Test Error: ' + str((1.0 - mlr_pred_accuracy)*100) + '%')

MLR Validation Hold-out Set Prediction Accuracy: 11.355662776780372%


### Comparison

Multinomial Logistic Regression (metrics from the model's training summary)

In [105]:
mlr_summary = mlr_model.summary

In [106]:
accuracy = mlr_summary.accuracy
falsePositiveRate = mlr_summary.weightedFalsePositiveRate
truePositiveRate = mlr_summary.weightedTruePositiveRate
fMeasure = mlr_summary.weightedFMeasure()
precision = mlr_summary.weightedPrecision
recall = mlr_summary.weightedRecall

In [107]:
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.19860932964288125
FPR: 0.040155544409270445
TPR: 0.19860932964288122
F-measure: 0.15711806130007133
Precision: 0.15779374904089258
Recall: 0.19860932964288122


In [108]:
# Save notebook as PDF document
!jupyter nbconvert --to pdf `pwd`/*.ipynb

[NbConvertApp] Converting notebook /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models.ipynb to pdf
[NbConvertApp] Writing 85847 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 74703 bytes to /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models.pdf
[NbConvertApp] Converting notebook /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models_DS.ipynb to pdf
[NbConvertApp] Writing 98157 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 83894 bytes to /sfs/qumulo/qhome/thd6tp/ds5559/final_project/Classification_Models_DS.pdf
[NbConvertApp]