In [None]:
import pandas as pd
import glob

In [None]:
# Read every CSV under data/ and stack them into one frame.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# make the row order of `df` the same on every machine and every run.
all_files = sorted(glob.glob('data/*.csv'))
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own 0..k index.
df = pd.concat((pd.read_csv(f) for f in all_files), sort=False, ignore_index=True)

In [None]:
# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

In [None]:
from pandasql import sqldf
def pysqldf(q):
    """Run SQL query ``q`` through pandasql's ``sqldf``.

    The notebook's global namespace is passed as the environment, so
    module-level DataFrames can be referenced by name in the query.
    Returns whatever ``sqldf`` returns.
    """
    return sqldf(q, globals())

In [None]:
# Collapse the raw rows into one row per distinct (label, feature values)
# combination, most frequent combinations first.
#
# The label expression is part of GROUP BY on purpose: SQLite fills a selected
# column that is neither grouped nor aggregated from an arbitrary row of the
# group, so grouping on the features alone attaches an arbitrary label to any
# feature combination that occurs with both outcomes.
#
# HAVING COUNT(*) >= 1 is always true; raise the threshold to drop rare
# combinations.
df_symptoms = pysqldf("""
SELECT
    covid19_test_results = 'Positive' as R,
    temperature,
    high_risk_exposure_occupation,
    high_risk_interactions,
    labored_respiration,
    rhonchi,
    cough,
    fever,
    sob,
    diarrhea,
    fatigue,
    headache,
    loss_of_smell,
    loss_of_taste,
    runny_nose,
    sore_throat
FROM
    df
GROUP BY
    covid19_test_results = 'Positive',
    temperature,
    high_risk_exposure_occupation,
    high_risk_interactions,
    labored_respiration,
    rhonchi,
    cough,
    fever,
    sob,
    diarrhea,
    fatigue,
    headache,
    loss_of_smell,
    loss_of_taste,
    runny_nose,
    sore_throat
HAVING COUNT(*) >= 1
ORDER BY COUNT(*) DESC
""")

In [None]:
# Preview the first five rows of the query result.
df_symptoms.head(n=5)

In [None]:
# Replace missing values with the mean of their column.
# numeric_only=True: since pandas 2.0, DataFrame.mean() raises TypeError when
# any column is non-numeric; restricting to numeric columns keeps the older
# behaviour (non-numeric columns are simply left unfilled).
# NOTE(review): this also fills the label column R if it contains NaN, and it
# imputes 0/1 indicator columns with fractional means — confirm that is intended.
df_symptoms = df_symptoms.fillna(df_symptoms.mean(numeric_only=True))

In [None]:
# Preview the first five rows after imputation.
df_symptoms.head(n=5)

In [None]:
# Summary statistics per column (quartiles spelled out; these are the defaults).
df_symptoms.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
#df_symptoms.dropna(inplace=True) # leaves 1946

In [None]:
# Min-max scale the temperature column: (t - min) / (max - min).
df_symptoms['temperature'] = df_symptoms['temperature'].pipe(
    lambda t: (t - t.min()) / (t.max() - t.min())
)

In [None]:
# Preview the first five rows after scaling the temperature column.
df_symptoms.head()

In [None]:
#df_symptoms[df_symptoms['R'] == 1]
# Copy the row index into an "id" column (it is appended as the last column
# the first time this runs).
df_symptoms = df_symptoms.assign(id=df_symptoms.index)

In [None]:
# Number of rows in the training frame.
df_symptoms.shape[0]

In [None]:
#df_symptoms.head()

In [None]:
# Single-step pipeline wrapping a random forest classifier.
# random_state is pinned so the fitted forest — and with it the feature
# importances, cross-validation scores and predicted probabilities below —
# is the same on every Restart & Run All.
symptom_clf = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
    ])

In [None]:
# Train on every column except the first (R, the label) and the last (id).
symptom_clf.fit(df_symptoms[df_symptoms.columns[1:-1]], df_symptoms.R)

In [None]:
# Importance of each feature according to the fitted forest.
symptom_clf.named_steps['clf'].feature_importances_

In [None]:
# Pair each feature column name with its importance from the fitted forest.
feature_df = pd.DataFrame(
    dict(
        feature=df_symptoms.columns[1:-1],
        importance=symptom_clf.named_steps['clf'].feature_importances_,
    )
)

In [None]:
# Show the feature importances, largest first.
pysqldf("SELECT * FROM feature_df ORDER BY importance DESC")

In [None]:
# First five rows of the feature columns only (label and id dropped).
df_symptoms.head().iloc[:, 1:-1]

In [None]:
# 5-fold cross-validation on the same feature columns used for fit/predict:
# everything except the first column (R, the label) and the last (id).
# The slice must stop at -1: the id column is just the row index, and using it
# as a feature leaks row order into the model and makes the scores
# incomparable with the fitted classifier.
symptom_scores = cross_val_score(symptom_clf, df_symptoms.iloc[:, 1:-1], df_symptoms['R'], cv=5)

In [None]:
# Report the mean cross-validation score and its standard deviation across folds.
# (The original format string had an unbalanced closing parenthesis.)
print("Accuracy: %0.2f (StDev: %0.2f)" % (symptom_scores.mean(), symptom_scores.std()))

In [None]:
# Class probabilities for every row. Despite the "test_" name, these are the
# same rows the classifier was fitted on.
test_predicted_proba = symptom_clf.predict_proba(df_symptoms[df_symptoms.columns[1:-1]])

In [None]:
# Predicted class for every row. Despite the "test_" name, these are the same
# rows the classifier was fitted on.
test_predicted = symptom_clf.predict(df_symptoms[df_symptoms.columns[1:-1]])

In [None]:
# One row per example: predicted class, probability from column 1 of
# predict_proba, and the example's id.
df_pred = pd.DataFrame({
    "p_cat": test_predicted,
    "p_prob": test_predicted_proba[:, 1],
    "p_id": df_symptoms.id,
})

In [None]:
# Show the predictions ordered by p_prob, highest first.
pysqldf("SELECT * FROM df_pred ORDER BY p_prob DESC")

In [None]:
# Spot check: all 15 features set to 0.
symptom_clf.predict_proba([[0] * 15])

In [None]:
# Spot check: all 15 features set to 1.
symptom_clf.predict_proba([[1] * 15])

In [None]:
# Spot check: feature values of the row at position 123 (label and id dropped).
symptom_clf.predict_proba([df_symptoms.iloc[123].iloc[1:-1].tolist()])

In [None]:
# Spot check: only the first two features (temperature,
# high_risk_exposure_occupation) set to 1, the remaining 13 set to 0.
symptom_clf.predict_proba([[1, 1] + [0] * 13])