In [1]:
import pandas as pd
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df_symptoms = pd.read_csv('test_data/results_symptoms.csv')

In [4]:
len(df_symptoms)

93995

In [5]:
df_symptoms.columns

Index(['covid19_test_results', 'temperature', 'high_risk_exposure_occupation',
       'high_risk_interactions', 'labored_respiration', 'rhonchi', 'cough',
       'cough_severity', 'fever', 'sob', 'sob_severity', 'diarrhea', 'fatigue',
       'headache', 'loss_of_smell', 'loss_of_taste', 'runny_nose',
       'muscle_sore', 'sore_throat', 'wheezes'],
      dtype='object')

In [6]:
df_symptoms.describe()

Unnamed: 0,covid19_test_results,temperature,high_risk_exposure_occupation,high_risk_interactions,labored_respiration,rhonchi,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,wheezes
count,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0,93995.0
mean,0.013969,18.610529,0.07377,0.151231,0.004128,0.075398,0.062184,-0.919421,0.021533,0.030236,-0.955381,0.020182,0.065025,0.057673,0.007054,0.007192,0.0368,0.036066,0.062854,0.101442
std,0.117362,18.397395,0.261397,0.358276,0.064116,0.264033,0.241491,0.340937,0.145154,0.171236,0.272407,0.140623,0.246571,0.233126,0.083689,0.0845,0.188271,0.186455,0.242702,0.301914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,36.1,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,36.8,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,39.6,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score

In [8]:
df_symptoms.head(5)

Unnamed: 0,covid19_test_results,temperature,high_risk_exposure_occupation,high_risk_interactions,labored_respiration,rhonchi,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,wheezes
0,0,37.0,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
1,0,36.75,0,0,0,1,0,-1,0,0,-1,0,0,0,0,0,0,0,0,1
2,0,36.95,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
3,0,36.85,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
4,0,37.0,0,1,0,0,0,-1,0,0,-1,0,1,1,0,0,0,0,0,0


In [9]:
# it can be useful to set a numerical id for each row as a column
df_symptoms["id"] = df_symptoms.index

In [10]:
df_symptoms.head()

Unnamed: 0,covid19_test_results,temperature,high_risk_exposure_occupation,high_risk_interactions,labored_respiration,rhonchi,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,wheezes,id
0,0,37.0,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0,0
1,0,36.75,0,0,0,1,0,-1,0,0,-1,0,0,0,0,0,0,0,0,1,1
2,0,36.95,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0,2
3,0,36.85,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0,3
4,0,37.0,0,1,0,0,0,-1,0,0,-1,0,1,1,0,0,0,0,0,0,4


In [11]:
# we don't want to train on our results - or the id! so remove them...
df_symptoms.iloc[:,1:-1].head()

Unnamed: 0,temperature,high_risk_exposure_occupation,high_risk_interactions,labored_respiration,rhonchi,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,wheezes
0,37.0,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
1,36.75,0,0,0,1,0,-1,0,0,-1,0,0,0,0,0,0,0,0,1
2,36.95,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
3,36.85,0,0,0,0,0,-1,0,0,-1,0,0,0,0,0,0,0,0,0
4,37.0,0,1,0,0,0,-1,0,0,-1,0,1,1,0,0,0,0,0,0


In [12]:
len(df_symptoms)

93995

In [13]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# for random forest, we don't need a scaler
symptom_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
    ])

In [14]:
# don't include the ID as a feature
symptom_clf.fit(df_symptoms.iloc[:,2:-1], df_symptoms['covid19_test_results'])

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', RandomForestClassifier())])

In [15]:
#uncomment if you are using a scalar and would like to view the normalized or scaled variables
symptom_clf['scaler'].transform(df_symptoms.iloc[:,2:-1])

array([[-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887, -0.33599654],
       [-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887,  2.97622109],
       [-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887, -0.33599654],
       ...,
       [-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887, -0.33599654],
       [-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887, -0.33599654],
       [-0.282215  , -0.42211075, -0.06438159, ..., -0.19342997,
        -0.25897887,  2.97622109]])

In [16]:
symptom_clf['clf'].feature_importances_

array([0.06165644, 0.0687793 , 0.01208403, 0.02641018, 0.04357549,
       0.10251428, 0.06936544, 0.04293717, 0.05692173, 0.05019703,
       0.06024449, 0.05496137, 0.08864099, 0.05889061, 0.0567874 ,
       0.05210237, 0.06461555, 0.02931612])

In [17]:
feature_df = pd.DataFrame({'feature':df_symptoms.columns[2:-1], 
                           'importance': symptom_clf['clf'].feature_importances_})

In [18]:
pysqldf("SELECT * FROM feature_df ORDER BY importance DESC")

Unnamed: 0,feature,importance
0,cough_severity,0.102514
1,loss_of_smell,0.088641
2,fever,0.069365
3,high_risk_interactions,0.068779
4,sore_throat,0.064616
5,high_risk_exposure_occupation,0.061656
6,fatigue,0.060244
7,loss_of_taste,0.058891
8,sob_severity,0.056922
9,runny_nose,0.056787


In [19]:
symptom_scores = cross_val_score(symptom_clf, df_symptoms.iloc[:,2:-1], df_symptoms['covid19_test_results'], cv=5)

In [20]:
print("Accuracy: %0.2f, StDev: %0.2f)" % (symptom_scores.mean(), symptom_scores.std()))

Accuracy: 0.98, StDev: 0.00)


In [21]:
test_predicted_proba = symptom_clf.predict_proba(df_symptoms.iloc[:, 2:-1])

In [22]:
test_predicted = symptom_clf.predict(df_symptoms.iloc[:, 2:-1])

In [23]:
df_pred = pd.DataFrame({"p_cat":test_predicted, "p_prob":[item[1] for item in test_predicted_proba], "p_id":df_symptoms.id })

In [24]:
pysqldf("SELECT * FROM df_pred ORDER BY p_prob DESC LIMIT 5")

Unnamed: 0,p_cat,p_prob,p_id
0,1,0.928333,2418
1,1,0.928333,44806
2,1,0.925,69984
3,1,0.925,79124
4,1,0.911,3051


In [25]:
symptom_clf.predict_proba([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])

array([[0.83008408, 0.16991592]])

In [26]:
symptom_clf.predict_proba([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])

array([[0.67666667, 0.32333333]])

In [27]:
symptom_clf.predict_proba([list(df_symptoms.iloc[123][2:-1])])

array([[9.991e-01, 9.000e-04]])

In [28]:
symptom_clf.predict_proba([[1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])

array([[0.98519173, 0.01480827]])

In [29]:
y_pred = symptom_clf.predict(df_symptoms.iloc[:, 2:-1])

In [30]:
len(y_pred), y_pred.sum()

(93995, 262)

### Accuracy: 

How often did the predicted value match the actual value

### Precision: 

When you predicted a positive result, how often was the actual value positive?

### Recall: 

When the actual value was positive, how often did you predict a positive value?

### Visual:

https://en.wikipedia.org/wiki/Precision_and_recall

In [31]:
# predict categories
y_pred = symptom_clf.predict(df_symptoms.iloc[:, 2:-1])

In [32]:
df_pred = pd.DataFrame({'pred': y_pred, 'actual': df_symptoms['covid19_test_results']})

In [33]:
pd.set_option('display.max_rows', None)

In [34]:
df_pred.sort_values(['actual'], ascending=[0]).head(10)

Unnamed: 0,pred,actual
57466,0,1
21429,0,1
27907,0,1
49910,0,1
39152,1,1
39151,0,1
74288,0,1
33080,0,1
65389,0,1
78198,0,1


In [35]:
precision, recall, fscore, train_support = score(df_symptoms['covid19_test_results'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==df_symptoms['covid19_test_results']).sum()/len(y_pred), 3)))

Precision: 0.935 / Recall: 0.187 / Accuracy: 0.988


In [36]:
# could we find a better trade-off using probabilities?

In [37]:
y_pred_proba = symptom_clf.predict_proba(df_symptoms.iloc[:, 2:-1])

In [38]:
# find a few with positive scores
df_pred = pd.DataFrame({'pred': [i[1] for i in y_pred_proba], 'actual': df_symptoms['covid19_test_results']})

In [39]:
# look at the current threshold of .50, should we move it?
# look at our strongest predictions
df_pred.sort_values(['pred'], ascending=[0])[0:10]

Unnamed: 0,pred,actual
44806,0.928333,1
2418,0.928333,1
69984,0.925,1
79124,0.925,1
67273,0.911,1
3051,0.911,1
37527,0.902305,1
16415,0.902305,1
43800,0.9,1
56857,0.8975,1


In [40]:
# look at the current threshold of .50, should we move it?
df_pred.sort_values(['pred'], ascending=[0])[0:10]

Unnamed: 0,pred,actual
44806,0.928333,1
2418,0.928333,1
69984,0.925,1
79124,0.925,1
67273,0.911,1
3051,0.911,1
37527,0.902305,1
16415,0.902305,1
43800,0.9,1
56857,0.8975,1


In [41]:
# most values are not predicted to be positive
df_pred.sort_values(['pred'], ascending=[0])[-10:]

Unnamed: 0,pred,actual
4508,0.0,0
62916,0.0,0
33682,0.0,0
23324,0.0,0
45017,0.0,0
74216,0.0,0
8161,0.0,0
66673,0.0,0
86843,0.0,0
5545,0.0,0


In [42]:
# a few in the middle
df_pred.sort_values(['pred'], ascending=[0])[250:260]

Unnamed: 0,pred,actual
697,0.519061,0
79955,0.518823,1
7604,0.518823,0
20535,0.515333,0
70224,0.515333,1
53875,0.513693,1
72055,0.513693,0
38865,0.513693,0
39235,0.513693,0
43255,0.513693,1


In [43]:
# a few more in the middle
df_pred.sort_values(['pred'], ascending=[0])[300:310]

Unnamed: 0,pred,actual
20809,0.449214,1
32354,0.449167,0
19767,0.449167,1
59604,0.447977,0
36359,0.447977,1
44410,0.447577,1
66019,0.447577,0
80238,0.447333,1
74946,0.447333,0
65067,0.445567,1
