In [15]:
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from matplotlib import pyplot as plt

In [16]:
## Load the raw COVID dataset
# The CSV lives in Azure storage (abfss URI). The first row is used as the header;
# no schema and no inferSchema option are passed to the reader.
covid_csv_uri = 'abfss://fixme@fixmeadlsstorage.dfs.core.windows.net/covid_data.csv'
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(covid_csv_uri)
)

In [17]:
# Summary statistics are computed by Spark, then collected as a pandas
# frame so the notebook renders them as a table.
summary_stats = df.describe()
summary_stats.toPandas()

In [18]:
# Encode every categorical text column as a numeric index column named
# "<column>_indexed"; the original columns are kept alongside the new ones.
categorical_cols = ['location', 'country', 'gender'] + [f'symptom{i}' for i in range(1, 7)]
indexed_cols = [f'{name}_indexed' for name in categorical_cols]

string_indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)
indexer_model = string_indexer.fit(df)
df = indexer_model.transform(df)
df.columns

In [19]:
# Elapsed time between symptom onset and hospital visit, as the difference of
# the two dates' Unix timestamps (both parsed with the same M/d/yyyy pattern).
date_pattern = "M/d/yyyy"
hosp_visit_ts = unix_timestamp(col("hosp_vis"), date_pattern)
symptom_onset_ts = unix_timestamp(col("sym_on"), date_pattern)
diff_sym_hos = hosp_visit_ts - symptom_onset_ts
df = df.withColumn('diff_sym_hos', diff_sym_hos)

In [20]:
# Inspect column names and types after adding the indexed columns and diff_sym_hos.
df.printSchema()

In [21]:
# Cast the numeric fields (age, the two Wuhan flags and the death label)
# to integers, one column at a time and in place.
for int_col_name in ("age", "vis_wuhan", "from_wuhan", "death"):
    df = df.withColumn(int_col_name, col(int_col_name).cast("Integer"))

# Missing ages are replaced with a fixed default of 50.
df = df.fillna(value=50, subset=['age'])
df.printSchema()

In [22]:
# Replace remaining nulls using NaN as the fill value.
# NOTE(review): a float fill value presumably only touches numeric columns, and
# how NaN is coerced for integer/long columns should be confirmed against the
# printed schema before relying on it.
missing_marker = np.nan
df = df.na.fill(missing_marker)
df.limit(10)

In [23]:
# Remove the identifier, the raw categorical/text columns (their *_indexed
# counterparts are kept) and the raw date strings.
columns_to_drop = [
    'id',
    'location', 'gender', 'country',
    'sym_on', 'hosp_vis',
    'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6',
]
df = df.drop(*columns_to_drop)

In [24]:
# Feature columns fed to the classifier, in a fixed order: demographics and
# travel flags first, then the six indexed symptoms, then the onset-to-visit gap.
input_features = (
    ['location_indexed', 'country_indexed', 'gender_indexed', 'age', 'vis_wuhan', 'from_wuhan']
    + [f'symptom{n}_indexed' for n in range(1, 7)]
    + ['diff_sym_hos']
)


In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import metrics

In [25]:
# Collect the prepared Spark data to the driver as pandas for scikit-learn.
# Guarded because this cell rebinds `df` from a Spark DataFrame to a pandas one:
# without the check, re-running the cell fails with AttributeError, since a
# pandas DataFrame has no `toPandas` method.
if hasattr(df, "toPandas"):
    df = df.toPandas()

# Features are the columns listed in `input_features`; the target is `death`.
input_cols = df[input_features]
label = df.death

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(input_cols, label, test_size=0.2, random_state=10)

# Shallow tree (depth 2). random_state is fixed as well: scikit-learn permutes
# features at each split, so an unseeded tree can differ between runs when
# several splits are equally good.
dt = DecisionTreeClassifier(max_depth=2, random_state=10).fit(X_train, y_train)


In [13]:
# Draw the fitted tree on the current axes, with nodes colour-filled.
tree_ax = plt.gca()
plot_tree(dt, filled=True, ax=tree_ax)
tree_ax.set_title("Decision tree trained on all the Covid Data")
plt.show()
# TODO: to persist the figure, call plt.savefig(<path>) with a real output path.

In [27]:
# Score the tree on the held-out split with scikit-learn's default settings
# for each metric, then print one "Name: value" line per metric.
y_pred = dt.predict(X_test)
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
)
for metric_caption, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1:", f1),
):
    print(metric_caption, metric_value)

In [29]:
# One bar per evaluation metric, with a black line overlaid through the same values.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]
bar_colors = ['red', 'green', 'blue', 'yellow']

plt.bar(metric_labels, metric_values, color=bar_colors)
plt.plot(metric_values, color='black')
plt.title('Evaluation Metrics for Decision Tree')

In [14]:
# Class-weighted evaluation of the fitted scikit-learn tree `dt` on the held-out split.
#
# This cell used to score a Spark ML model through `dt_model`, `test_data` and
# `MulticlassClassificationEvaluator`. None of those names is defined or imported
# anywhere in the visible notebook, so it raised NameError on a fresh kernel.
# The same four metrics (accuracy, weighted precision, weighted recall, weighted
# F1) are computed here with scikit-learn on the model that is actually trained.
#
# Note: this rebinds accuracy / precision / recall / f1, as the original cell did.
dt_preds = dt.predict(X_test)
accuracy = metrics.accuracy_score(y_test, dt_preds)
# average='weighted' averages per-class scores weighted by class support.
precision = metrics.precision_score(y_test, dt_preds, average='weighted')
recall = metrics.recall_score(y_test, dt_preds, average='weighted')
f1 = metrics.f1_score(y_test, dt_preds, average='weighted')
print('accuracy:')
print(accuracy)
print('precision:')
print(precision)
print('recall:')
print(recall)
print('f1:')
print(f1)