In [1]:
# Real World Example using Covid Data
#
# Objectives: 
#    Import a csv file containing covid testing results and data
#    Analyse the data using SQL, Python, and Pandas
#    Plot the results

In [2]:
# Data
#
# we will be using csv files from https://covidclinicaldata.org/
#

In [3]:
import pandas as pd
import glob
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string ``q`` through ``sqldf`` and return its result.

    The notebook's global namespace is passed as the lookup environment, so
    a query can refer to DataFrames defined in other cells (e.g. ``df``)
    by their variable names.
    """
    # A named function instead of a lambda bound to a name (PEP 8): same call
    # signature, but with a docstring and a meaningful name in tracebacks.
    return sqldf(q, globals())

In [4]:
# Read the 05-26 Carbon Health / Braid Health CSV into the working frame.
df = pd.read_csv(filepath_or_buffer='data/05-26_carbonhealth_and_braidhealth.csv')

In [5]:
# Lift the column cap so wide frames are displayed without elided columns.
pd.options.display.max_columns = None

In [6]:
# Peek at the first five records of the raw data.
df.head(n=5)

Unnamed: 0,batch_date,test_name,swab_type,covid19_test_results,age,high_risk_exposure_occupation,high_risk_interactions,diabetes,chd,htn,cancer,asthma,copd,autoimmune_dis,smoker,temperature,pulse,sys,dia,rr,sats,rapid_flu_results,rapid_strep_results,ctab,labored_respiration,rhonchi,wheezes,days_since_symptom_onset,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,cxr_findings,cxr_impression,cxr_label,cxr_link,er_referral
0,2020-05-26,SARS COV2 NAAT,Nasopharyngeal,Negative,31,False,,False,False,False,False,False,False,False,False,37.1,63.0,108.0,72.0,17.0,98.0,,,,False,,,,False,,,False,,False,False,False,False,False,False,False,False,,,,,False
1,2020-05-26,SARS COV2 NAAT,Nasopharyngeal,Negative,54,False,True,False,False,True,False,False,False,False,False,36.75,71.0,121.0,74.0,16.0,100.0,,,False,False,False,False,,False,,False,False,,False,False,False,False,False,False,False,False,,,,,False
2,2020-05-26,SARS COV2 NAAT,Nasopharyngeal,Negative,32,False,False,False,False,False,False,False,False,False,False,36.45,62.0,111.0,74.0,16.0,98.0,,,,False,,,,False,,False,False,,False,False,False,False,False,False,False,False,,,,,False
3,2020-05-26,SARS COV2 NAAT,Nasopharyngeal,Negative,25,False,False,False,False,False,False,False,False,False,False,36.85,95.0,119.0,75.0,16.0,98.0,,,False,,,False,3.0,True,,True,True,,False,False,True,False,False,False,False,False,,,,,False
4,2020-05-26,"SARS-CoV-2, NAA",Nasopharyngeal,Negative,33,False,False,False,False,False,False,False,False,False,False,36.75,70.0,122.0,82.0,12.0,100.0,,,False,False,False,False,,False,,,False,,False,False,False,False,False,False,False,False,,,,,False


In [7]:
# List the column labels available in the raw frame.
df.columns

Index(['batch_date', 'test_name', 'swab_type', 'covid19_test_results', 'age',
       'high_risk_exposure_occupation', 'high_risk_interactions', 'diabetes',
       'chd', 'htn', 'cancer', 'asthma', 'copd', 'autoimmune_dis', 'smoker',
       'temperature', 'pulse', 'sys', 'dia', 'rr', 'sats', 'rapid_flu_results',
       'rapid_strep_results', 'ctab', 'labored_respiration', 'rhonchi',
       'wheezes', 'days_since_symptom_onset', 'cough', 'cough_severity',
       'fever', 'sob', 'sob_severity', 'diarrhea', 'fatigue', 'headache',
       'loss_of_smell', 'loss_of_taste', 'runny_nose', 'muscle_sore',
       'sore_throat', 'cxr_findings', 'cxr_impression', 'cxr_label',
       'cxr_link', 'er_referral'],
      dtype='object')

In [8]:
# get symptoms

In [9]:
# Project the raw frame down to the test result plus the symptom columns.
symptoms_sql = """
SELECT
    covid19_test_results, labored_respiration, rhonchi,
    wheezes, cough, cough_severity,
    fever, sob, sob_severity, diarrhea, fatigue, headache,
    loss_of_smell, loss_of_taste, runny_nose, muscle_sore,
    sore_throat
FROM
    df
"""
df_symptoms = pysqldf(symptoms_sql)

In [10]:
# Preview the first five rows of the symptom subset.
df_symptoms.head(n=5)

Unnamed: 0,covid19_test_results,labored_respiration,rhonchi,wheezes,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
0,Negative,0.0,,,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Negative,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Negative,0.0,,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Negative,,,0.0,1.0,,1.0,1.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Negative,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# note that we have different data types, mixing text and numerical data.
# to describe all data types, set include='all' in the describe method
# by default, describe() will only summarize numerical data

In [12]:
# include='all' adds count/unique/top/freq for the text columns next to the
# numeric summaries of the 0/1 symptom columns.
df_symptoms.describe(include='all')

Unnamed: 0,covid19_test_results,labored_respiration,rhonchi,wheezes,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
count,750,428.0,295.0,314.0,746.0,55,522.0,741.0,25,741.0,741.0,741.0,741.0,741.0,741.0,741.0,741.0
unique,2,,,,,3,,,2,,,,,,,,
top,Negative,,,,,Mild,,,Mild,,,,,,,,
freq,716,,,,,40,,,18,,,,,,,,
mean,,0.002336,0.138983,0.178344,0.092493,,0.047893,0.036437,,0.020243,0.059379,0.071525,0.014845,0.006748,0.053981,0.051282,0.059379
std,,0.048337,0.346517,0.383413,0.289915,,0.213744,0.187502,,0.140925,0.236493,0.257874,0.121013,0.081922,0.226133,0.220721,0.236493
min,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Exercise - what are the distinct values that appear in covid19_test_results, cough_severity, and sob_severity?
# how could you answer this with SQL?
# how could you answer this with pandas?

In [14]:
# Analysis
#
# let's investigate the effect of fatigue on Covid Test Results

In [15]:
# Keep only the records in which fatigue was flagged (stored as 1 in df_symptoms).
sql = """
SELECT
    *
FROM
    df_symptoms
WHERE
    fatigue = 1
"""

# Run the filter; `r` holds the fatigue-only subset inspected in the following cells.
r = pysqldf(sql)

In [16]:
# Preview the first five fatigue-positive records.
r.head(n=5)

Unnamed: 0,covid19_test_results,labored_respiration,rhonchi,wheezes,cough,cough_severity,fever,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
0,Negative,0.0,0.0,0.0,1.0,Moderate,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,Negative,,,,1.0,,0.0,1.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,Negative,0.0,1.0,1.0,1.0,Mild,0.0,1.0,Moderate,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,Positive,,,,1.0,,,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Negative,0.0,0.0,0.0,1.0,Mild,,1.0,Mild,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0


In [17]:
# we can now summarize only the test records where the patient reported fatigue

In [18]:
# Without include='all', describe() summarizes only the numeric columns
# of the fatigue-only subset.
r.describe()

Unnamed: 0,labored_respiration,rhonchi,wheezes,cough,fever,sob,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
count,37.0,32.0,32.0,44.0,36.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,0.0,0.0625,0.0625,0.5,0.416667,0.295455,0.204545,1.0,0.659091,0.113636,0.090909,0.363636,0.568182,0.409091
std,0.0,0.245935,0.245935,0.505781,0.5,0.461522,0.408032,0.0,0.479495,0.321038,0.290803,0.486607,0.501056,0.49735
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
max,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# exercise - generate summary statistics for other possible predictors.
# note - you may want to go back and modify the original query
# for example, you might want to consider high risk occupations, contacts, or ER referrals

In [20]:
# Using GROUP BY queries
#
# we can use a SQL query to investigate the relationship between covid test results and one or more symptoms

In [21]:
# single variable case
pysqldf("""
SELECT 
    covid19_test_results, COUNT(covid19_test_results)
FROM
    df_symptoms
GROUP BY 
    covid19_test_results
""")

Unnamed: 0,covid19_test_results,COUNT(covid19_test_results)
0,Negative,716
1,Positive,34


In [22]:
# grouping on multiple variables
pysqldf("""
SELECT 
    covid19_test_results, fatigue, COUNT(covid19_test_results)
FROM
    df_symptoms
GROUP BY 
    covid19_test_results, fatigue
""")

Unnamed: 0,covid19_test_results,fatigue,COUNT(covid19_test_results)
0,Negative,,9
1,Negative,0.0,666
2,Negative,1.0,41
3,Positive,0.0,31
4,Positive,1.0,3


In [23]:
# Exercise - Does this data set show a higher probability for a positive test result 
# for patients reporting fatigue?

In [24]:
# Exercise - write queries to find symptoms or other predictors (or combinations of them) that are associated with a positive test result

In [25]:
# Question (more to consider than to answer): what are the odds of a positive result for someone reporting
# all measured symptoms? Can you answer this with SQL?