## Supervised Machine Learning on DonorChoose.org Data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

spark = SparkSession.builder.appName('ReadData').getOrCreate()
sc = spark.sparkContext
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)

In [3]:
df = hive_context.table("donor.final_projects_joined")

In [4]:
df.show(2)

+--------------------+--------------------+--------------------+-----------+------------+------------------+----------------+------------+-----------------+----------------+--------------+----------------------+-------------------+-------------------+--------------------+--------------------+-----------------+-----------------+----------------------------+------------+----------+-----------+-------------+--------------------+-------------------+-------------------+--------------------------+--------------------+------------------------+--------------+---------------------------------+-------------+------------------+-----------------+---------------------+
|          project_id|           school_id|          teacher_id|project_seq|project_type|project_grad_level|project_resource|project_cost|project_post_date|project_exp_date|project_status|project_full_fund_date|       project_cat1|       project_cat2|     project_subcat1|     project_subcat2|      school_name|school_metro_type|school

In [4]:
df = df.drop('project_seq','project_id','school_id','teacher_id','project_post_date','project_exp_date',
                                       'project_full_fund_date','school_name','school_city','school_district',
                                       'school_county','school_zip','donation_amount_sum','number_of_donations',
                                       'teacher_first_project_posted_date','project_cat2','project_subcat2')

In [5]:
df = df.withColumn('month_of_post_date',df.month_of_post_date.cast('string'))
df = df.withColumn('year_of_post_date',df.year_of_post_date.cast('string'))

In [6]:
df.dtypes

[('project_type', 'string'),
 ('project_grad_level', 'string'),
 ('project_resource', 'string'),
 ('project_cost', 'float'),
 ('project_status', 'string'),
 ('project_cat1', 'string'),
 ('project_subcat1', 'string'),
 ('school_metro_type', 'string'),
 ('school_percentage_free_lunch', 'float'),
 ('school_state', 'string'),
 ('number_of_resources_needed', 'bigint'),
 ('resources_amount_sum', 'double'),
 ('resources_total_quantity', 'double'),
 ('teacher_prefix', 'string'),
 ('days_diff_exp', 'int'),
 ('month_of_post_date', 'string'),
 ('year_of_post_date', 'string'),
 ('teacher_date_diff_exp', 'int')]

In [7]:
null_count = df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).toPandas()
null_count.T

Unnamed: 0,0
project_type,0
project_grad_level,0
project_resource,0
project_cost,0
project_status,0
project_cat1,0
project_subcat1,0
school_metro_type,0
school_percentage_free_lunch,0
school_state,0


#### Categorical Features

In [8]:
cat_features = [t[0] for t in df.dtypes if t[1] == 'string']

In [9]:
cat_features = set(cat_features) - set(['project_status'])
cat_features = list(cat_features)

In [10]:
cat_features

['teacher_prefix',
 'project_resource',
 'project_cat1',
 'school_state',
 'project_grad_level',
 'year_of_post_date',
 'project_type',
 'school_metro_type',
 'project_subcat1',
 'month_of_post_date']

In [11]:
for i in cat_features:
    print(df.groupBy(i).count().show())

+--------------+------+
|teacher_prefix| count|
+--------------+------+
|           Ms.|383778|
|       Teacher| 14160|
|           Mr.|115610|
|          Mrs.|544075|
|           Dr.|   270|
|           Mx.|    59|
+--------------+------+

None
+--------------------+------+
|    project_resource| count|
+--------------------+------+
|Reading Nooks Des...| 10448|
|Sports & Exercise...|  5558|
| Computers & Tablets| 22892|
|Food Clothing & H...|  4817|
|            Supplies|382818|
|    Classroom Basics|  9386|
|               Other| 66515|
|        Art Supplies|  7013|
|               Books|177502|
|Educational Kits ...| 17431|
|Instructional Tec...| 14936|
|       Lab Equipment|  5833|
|               Trips| 19136|
|            Visitors|  2756|
|          Technology|296747|
|    Flexible Seating| 10839|
| Musical Instruments|  3325|
+--------------------+------+

None
+-------------------+------+
|       project_cat1| count|
+-------------------+------+
|   History & Civics| 49846|
| 

In [12]:
#replace unknown in df
replace_str = udf(lambda x: x.replace('unknown','Grades PreK-2'))
df = df.withColumn('project_grad_level',replace_str('project_grad_level'))

In [13]:
#replace unknown in df
replace_str = udf(lambda x: x.replace('unknown','suburban'))
df = df.withColumn('school_metro_type',replace_str('school_metro_type'))

In [14]:
df.groupBy('school_metro_type').count().show()

+-----------------+------+
|school_metro_type| count|
+-----------------+------+
|         suburban|408813|
|             town| 48942|
|            rural|106846|
|            urban|493351|
+-----------------+------+



In [36]:
df.groupBy('project_grad_level').count().show()

+------------------+------+
|project_grad_level| count|
+------------------+------+
|        Grades 6-8|173343|
|        Grades 3-5|347275|
|       Grades 9-12|126982|
|     Grades PreK-2|410352|
+------------------+------+



#### Numeric Columns

In [15]:
numeric_features = set(df.columns) - set(cat_features) - set(['project_status'])
numeric_features = list(numeric_features)
numeric_features

['resources_total_quantity',
 'resources_amount_sum',
 'days_diff_exp',
 'number_of_resources_needed',
 'teacher_date_diff_exp',
 'project_cost',
 'school_percentage_free_lunch']

In [16]:
numeric_features

['resources_total_quantity',
 'resources_amount_sum',
 'days_diff_exp',
 'number_of_resources_needed',
 'teacher_date_diff_exp',
 'project_cost',
 'school_percentage_free_lunch']

#### One Hot Encoding Variables

In [17]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = cat_features
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol = 'project_status', outputCol = 'label')
stages += [label_stringIdx]
numericCols = numeric_features
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [18]:
cols = cat_features + numeric_features + ['project_status']

In [19]:
cols

['teacher_prefix',
 'project_resource',
 'project_cat1',
 'school_state',
 'project_grad_level',
 'year_of_post_date',
 'project_type',
 'school_metro_type',
 'project_subcat1',
 'month_of_post_date',
 'resources_total_quantity',
 'resources_amount_sum',
 'days_diff_exp',
 'number_of_resources_needed',
 'teacher_date_diff_exp',
 'project_cost',
 'school_percentage_free_lunch',
 'project_status']

In [20]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(df)
ml_df = pipelineModel.transform(df)
selectedCols = ['label', 'features'] + cols
ml_df = ml_df.select(selectedCols)
ml_df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- teacher_prefix: string (nullable = true)
 |-- project_resource: string (nullable = true)
 |-- project_cat1: string (nullable = true)
 |-- school_state: string (nullable = true)
 |-- project_grad_level: string (nullable = true)
 |-- year_of_post_date: string (nullable = true)
 |-- project_type: string (nullable = true)
 |-- school_metro_type: string (nullable = true)
 |-- project_subcat1: string (nullable = true)
 |-- month_of_post_date: string (nullable = true)
 |-- resources_total_quantity: double (nullable = true)
 |-- resources_amount_sum: double (nullable = true)
 |-- days_diff_exp: integer (nullable = true)
 |-- number_of_resources_needed: long (nullable = true)
 |-- teacher_date_diff_exp: integer (nullable = true)
 |-- project_cost: float (nullable = true)
 |-- school_percentage_free_lunch: float (nullable = true)
 |-- project_status: string (nullable = true)



In [21]:
ml_df.show(20)

+-----+--------------------+--------------+--------------------+-------------------+------------+------------------+-----------------+------------+-----------------+--------------------+------------------+------------------------+--------------------+-------------+--------------------------+---------------------+------------+----------------------------+--------------+
|label|            features|teacher_prefix|    project_resource|       project_cat1|school_state|project_grad_level|year_of_post_date|project_type|school_metro_type|     project_subcat1|month_of_post_date|resources_total_quantity|resources_amount_sum|days_diff_exp|number_of_resources_needed|teacher_date_diff_exp|project_cost|school_percentage_free_lunch|project_status|
+-----+--------------------+--------------+--------------------+-------------------+------------+------------------+-----------------+------------+-----------------+--------------------+------------------+------------------------+--------------------+-----

In [22]:
ml_df.select('features').take(5)

[Row(features=SparseVector(136, {0: 1.0, 7: 1.0, 21: 1.0, 29: 1.0, 80: 1.0, 85: 1.0, 89: 1.0, 91: 1.0, 123: 1.0, 129: 72.0, 130: 202.47, 131: 120.0, 132: 25.0, 133: 436.0, 134: 698.15, 135: 80.0})),
 Row(features=SparseVector(136, {0: 1.0, 7: 1.0, 21: 1.0, 29: 1.0, 80: 1.0, 85: 1.0, 89: 1.0, 91: 1.0, 122: 1.0, 129: 30.0, 130: 43.49, 131: 129.0, 132: 6.0, 133: 129.0, 134: 294.95, 135: 80.0})),
 Row(features=SparseVector(136, {0: 1.0, 14: 1.0, 21: 1.0, 29: 1.0, 85: 1.0, 89: 1.0, 91: 1.0, 121: 1.0, 129: 2.0, 130: 314.99, 131: 120.0, 132: 2.0, 133: 532.0, 134: 411.42, 135: 80.0})),
 Row(features=SparseVector(136, {0: 1.0, 7: 1.0, 21: 1.0, 29: 1.0, 80: 1.0, 85: 1.0, 89: 1.0, 91: 1.0, 126: 1.0, 129: 50.0, 130: 70.3, 131: 122.0, 132: 7.0, 133: 317.0, 134: 424.45, 135: 80.0})),
 Row(features=SparseVector(136, {2: 1.0, 6: 1.0, 25: 1.0, 30: 1.0, 79: 1.0, 84: 1.0, 85: 1.0, 87: 1.0, 105: 1.0, 125: 1.0, 129: 24.0, 130: 91.7, 131: 116.0, 132: 4.0, 133: 116.0, 134: 744.53, 135: 94.0}))]

In [23]:
train, test = ml_df.randomSplit([0.7, 0.3],seed=2019)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 740806
Test Dataset Count: 317146


In [27]:
train = train['features','label']

### Logistic Regression

In [24]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)
lr_predictions = lrModel.transform(test)

In [25]:
lr_predictions.show(5)

+-----+--------------------+--------------+----------------+-------------------+------------+------------------+-----------------+------------+-----------------+--------------------+------------------+------------------------+--------------------+-------------+--------------------------+---------------------+------------+----------------------------+--------------+--------------------+--------------------+----------+
|label|            features|teacher_prefix|project_resource|       project_cat1|school_state|project_grad_level|year_of_post_date|project_type|school_metro_type|     project_subcat1|month_of_post_date|resources_total_quantity|resources_amount_sum|days_diff_exp|number_of_resources_needed|teacher_date_diff_exp|project_cost|school_percentage_free_lunch|project_status|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------+----------------+-------------------+------------+------------------+-----------------+------------+---------------

In [26]:
#print evaluation metrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(lr_predictions, {evaluator.metricName: "areaUnderROC"})))

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

print('Accuracy:', evaluator.evaluate(lr_predictions, {evaluator.metricName: "accuracy"}))
print('F1 Score:', evaluator.evaluate(lr_predictions, {evaluator.metricName: "f1"}))

Test Area Under ROC: 0.7177818868445346
Accuracy: 0.7772949997792815
F1 Score: 0.7031607668747024


In [None]:
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

In [24]:
import matplotlib.pyplot as plt
import numpy as np

trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

<Figure size 640x480 with 1 Axes>

Training set areaUnderROC: 0.7198936355732545


### Random Forest

In [27]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label',numTrees=100)
rfModel = rf.fit(train)
rf_predictions = rfModel.transform(test)
rf_predictions.select('label', 'rawPrediction', 'prediction', 'probability').show(10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|  0.0|[77.3865740329326...|       0.0|[0.77386574032932...|
|  0.0|[78.9130997586067...|       0.0|[0.78913099758606...|
|  0.0|[79.3125358978453...|       0.0|[0.79312535897845...|
|  0.0|[78.9784264062311...|       0.0|[0.78978426406231...|
|  0.0|[77.9364753205222...|       0.0|[0.77936475320522...|
|  0.0|[79.2898321952912...|       0.0|[0.79289832195291...|
|  0.0|[76.0745443922988...|       0.0|[0.76074544392298...|
|  0.0|[79.1025776378987...|       0.0|[0.79102577637898...|
|  0.0|[75.0837610474281...|       0.0|[0.75083761047428...|
|  0.0|[76.1427046978135...|       0.0|[0.76142704697813...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [28]:
#print evaluation metrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})))

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

print('Accuracy:', evaluator.evaluate(rf_predictions, {evaluator.metricName: "accuracy"}))
print('F1 Score:', evaluator.evaluate(rf_predictions, {evaluator.metricName: "f1"}))

Test Area Under ROC: 0.7001633551820505
Accuracy: 0.7736310721245104
F1 Score: 0.6748923664712432


In [29]:
rfModel.featureImportances

SparseVector(136, {0: 0.0008, 1: 0.0124, 2: 0.0, 3: 0.0001, 5: 0.0023, 6: 0.1096, 7: 0.0481, 8: 0.0, 9: 0.0449, 10: 0.0001, 11: 0.0366, 12: 0.028, 13: 0.0063, 14: 0.033, 15: 0.0226, 16: 0.0158, 18: 0.0001, 19: 0.0032, 21: 0.0002, 22: 0.0001, 23: 0.0, 24: 0.0001, 25: 0.0001, 26: 0.0, 30: 0.0013, 31: 0.0001, 32: 0.0, 33: 0.0003, 34: 0.0008, 35: 0.0044, 36: 0.0001, 37: 0.0, 38: 0.0031, 39: 0.0002, 40: 0.0002, 41: 0.0001, 42: 0.0009, 43: 0.0, 45: 0.0, 47: 0.009, 48: 0.0, 50: 0.0003, 51: 0.0003, 52: 0.0, 53: 0.0, 55: 0.0006, 56: 0.0, 58: 0.0003, 59: 0.0002, 60: 0.0, 61: 0.0, 62: 0.0, 63: 0.0, 65: 0.0, 68: 0.0, 71: 0.0, 72: 0.0, 74: 0.0001, 77: 0.0011, 78: 0.0001, 79: 0.0, 80: 0.0061, 81: 0.0401, 82: 0.0011, 83: 0.0017, 84: 0.0126, 85: 0.0009, 86: 0.0007, 87: 0.0436, 88: 0.0001, 89: 0.0042, 90: 0.0003, 91: 0.0004, 92: 0.0, 94: 0.0001, 96: 0.0, 97: 0.0001, 98: 0.0, 99: 0.0, 101: 0.0001, 102: 0.0, 103: 0.0, 104: 0.0014, 107: 0.0292, 108: 0.0002, 109: 0.0001, 110: 0.0001, 111: 0.0001, 112: 0.00

In [30]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [31]:
ExtractFeatureImp(rfModel.featureImportances, rf_predictions, "features")[0:20]


Unnamed: 0,idx,name,score
5,134,project_cost,0.199699
1,130,resources_amount_sum,0.120264
13,6,project_resourceclassVec_Technology,0.109606
14,7,project_resourceclassVec_Books,0.048106
16,9,project_resourceclassVec_Computers & Tablets,0.044905
94,87,school_metro_typeclassVec_urban,0.043625
3,132,number_of_resources_needed,0.042336
88,81,year_of_post_dateclassVec_2016,0.040117
18,11,project_resourceclassVec_Educational Kits & Games,0.036629
21,14,project_resourceclassVec_Reading Nooks Desks &...,0.033014


### Gradient Boosting

In [32]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
gb_predictions = gbtModel.transform(test)
gb_predictions.select('label', 'rawPrediction', 'prediction', 'probability').show(10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|  0.0|[0.45690849040772...|       0.0|[0.71378059963248...|
|  0.0|[0.45690849040772...|       0.0|[0.71378059963248...|
|  0.0|[0.71370483691703...|       0.0|[0.80649739206479...|
|  0.0|[0.75298273397177...|       0.0|[0.81846251955810...|
|  0.0|[0.53829383018210...|       0.0|[0.74584768675451...|
|  0.0|[0.92995506010174...|       0.0|[0.86528647148212...|
|  0.0|[0.35235271263871...|       0.0|[0.66923019863204...|
|  0.0|[1.16911796354030...|       0.0|[0.91199460275360...|
|  0.0|[0.31731554744659...|       0.0|[0.65353880598453...|
|  0.0|[0.07787865713602...|       0.0|[0.53886079563713...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [33]:
#print evaluation metrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(gb_predictions, {evaluator.metricName: "areaUnderROC"})))

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

print('Accuracy:', evaluator.evaluate(gb_predictions, {evaluator.metricName: "accuracy"}))
print('F1 Score:', evaluator.evaluate(gb_predictions, {evaluator.metricName: "f1"}))

Test Area Under ROC: 0.730820596656474
Accuracy: 0.7814791925485423
F1 Score: 0.7254190399704524


In [34]:
ExtractFeatureImp(gbtModel.featureImportances, gb_predictions, "features")[0:20]


Unnamed: 0,idx,name,score
5,134,project_cost,0.330226
87,80,year_of_post_dateclassVec_2017,0.060403
28,21,project_cat1classVec_Literacy & Language,0.059584
94,87,school_metro_typeclassVec_urban,0.059013
88,81,year_of_post_dateclassVec_2016,0.054854
131,124,month_of_post_dateclassVec_2,0.03736
91,84,year_of_post_dateclassVec_2013,0.035636
132,125,month_of_post_dateclassVec_12,0.033945
129,122,month_of_post_dateclassVec_1,0.032958
7,0,teacher_prefixclassVec_Mrs.,0.028112


### Cross Validation with Gradient Boosting

In [35]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxIter, [10])
             .build())

cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train)
cv_predictions= cvModel.transform(test)
evaluator.evaluate(cv_predictions)

0.7254190399704524