In [181]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from fuzzywuzzy import fuzz

In [182]:
raw_data = pd.read_csv("covid19.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [183]:
c_data = pd.read_csv("covid19_cleaned.zip")

  interactivity=interactivity, compiler=compiler, result=result)


In [184]:
# Nice to have columns on hand
c_data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'ID', 'age', 'sex',
       'city', 'province', 'country', 'latitude', 'longitude',
       'geo_resolution', 'date_onset_symptoms', 'date_admission_hospital',
       'date_confirmation', 'symptoms', 'lives_in_Wuhan',
       'travel_history_dates', 'travel_history_location',
       'reported_market_exposure', 'additional_information',
       'chronic_disease_binary', 'chronic_disease', 'source',
       'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials',
       'travel_history_binary', 'hospitalized', 'deceased', 'fever', 'cough',
       'fatigue', 'headache', 'dizziness', 'sore throat', 'pneumonia',
       'respiratory', 'nausea', 'diarrhea'],
      dtype='object')

In [185]:
# Cleaning and extracting information from the "outcome" column.
# "Outcome" is a very good place to begin extracting insights.
# Lots of case outcomes to look at.
raw_data["outcome"].value_counts()

Hospitalized                                                                202475
Recovered                                                                    93656
Deceased                                                                      5011
Under treatment                                                                370
Receiving Treatment                                                            257
Alive                                                                          189
discharge                                                                      175
stable                                                                         156
died                                                                           153
stable condition                                                               130
https://www.mspbs.gov.py/covid-19.php                                          126
discharged                                                                     117
reco

In [186]:
# These functions extract keywords from the "outcome" column.
# "Outcome" gives us information we can categorize into two features:
# "deaths" and "hospitalizations".

# Look for hospital-related keywords.
# This function just wraps the stringChecker function
# and passes it a hardcoded list

#might be better to do this with regular expressions
def hospitalization(s):
    
    # Filter this value out...
    if s == "not hospitalized":
        return 0
    
    # A regular expression may be more 
    # algorithmically efficient
    hospitalStrings = [ 
       "hospital", 
       "intensive", 
       "released",
       "care", 
       "discharged",
       "intubated", 
       "migrated",
       "stable"
       "death",
       "deceased",
       "died",
   ]
    return stringChecker(s, hospitalStrings)

# Issue: we are assuming a death is a hospitalization
# Verify it is sound by checking the statistics of 
# only deaths with hospitalization dates

In [187]:
# Same as above function but for deaths.
def death(s):
    deathStrings = [
        "deceased",
        "died",
        "death",
        "dead"
    ]
    return stringChecker(s, deathStrings)

In [188]:
# Check if s contains a string in the argument list
def stringChecker(s, l):
    s = str(s).lower()
    for word in l:
      if s.find(word) != -1:
        return 1         
    return 0

In [189]:
# Create a new column for hospitalizations
# Use our above function to extract out booleans.
c_data["hospitalized"] = raw_data["outcome"].apply(hospitalization)
c_data["hospitalized"].value_counts()

0    2102322
1     207789
Name: hospitalized, dtype: int64

In [190]:
# Same as above for deceased. Extract all deaths.
c_data["deceased"] = raw_data["outcome"].apply(death)
c_data["deceased"].value_counts()

0    2304875
1       5236
Name: deceased, dtype: int64

In [191]:
# Cleaning up dates:
# Clean up date values
# remove hyphens, return just the first date
def cleanDate(s):
    s = str(s)
    split = s.split("-")
    return split[0].strip()

In [192]:
# Apply the above function to our three date columns.
# Thankfully our dates are already formatted nicely.
c_data["date_admission_hospital"] = raw_data["date_admission_hospital"].apply(cleanDate)
c_data["date_onset_symptoms"] = raw_data["date_onset_symptoms"].apply(cleanDate)
c_data["date_confirmation"] = raw_data["date_confirmation"].apply(cleanDate)

In [193]:
# Visually inspect date values:
c_data["date_admission_hospital"].unique(), c_data["date_onset_symptoms"].unique(), c_data["date_confirmation"].unique()

(array(['nan', '04.02.2020', '30.01.2020', '14.02.2020', '17.02.2020',
        '13.02.2020', '10.02.2020', '12.02.2020', '15.02.2020',
        '08.02.2020', '05.02.2020', '07.02.2020', '09.02.2020',
        '26.01.2020', '26.02.2020', '03.02.2020', '28.01.2020',
        '06.02.2020', '11.02.2020', '16.02.2020', '19.02.2020',
        '22.01.2020', '18.01.2020', '23.01.2020', '20.02.2020',
        '18.02.2020', '21.02.2020', '24.01.2020', '22.02.2020',
        '29.01.2020', '27.01.2020', '23.02.2020', '24.02.2020',
        '25.02.2020', '25.01.2020', '27.02.2020', '08.01.2020',
        '15.01.2020', '17.01.2020', '28.02.2020', '20.01.2020',
        '01.03.2020', '10.01.2020', '29.03.2020', '29.02.2020',
        '13.01.2020', '03.01.2020', '04.03.2020', '03.03.2020',
        '19.01.2020', '21.01.2020', '05.03.2020', '02.03.2020',
        '03.04.2020', '06.03.2020', '31.01.2020', '08.03.2020',
        '09.03.2020', '10.03.2020', '07.03.2020', '01.02.2020',
        '02.02.2020', '14.01.2020

In [194]:
c_data["symptoms"].value_counts()

fever                                                    320
Mild to moderate                                         257
Mild:moderate                                            128
cough, fever                                             126
cough                                                     52
                                                        ... 
acute respiratory distress syndrome:septic shock           1
acute respiratory distress syndrome:pneumonia:hypoxia      1
chills, conjunctivitis, cough, fever                       1
mild, runny nose                                           1
fever, cough, sore throat                                  1
Name: symptoms, Length: 448, dtype: int64

In [195]:
len(c_data["symptoms"])

2310111

In [196]:
# Data extraction from symptoms
c_data["symptoms"].unique()

array([nan, 'fever, severe pneumonia', 'fever',
       'cough, fever, sore throat', 'cough, fever', 'dyspnea, fever',
       'cough, difficulty breathing, fever', 'fever (38 ° C)',
       '37.1 ° C, mild coughing', 'cough',
       'abdominal pain, pulmonary inflammation', 'Sore throat',
       'feeling ill, coughing',
       'abdominal pain, diarrhea, fever, general malaise',
       'poor physical condition', 'cough, fatigue, fever', 'yes',
       'fever 38.3', 'fever (39.5 ℃)', 'fever 37.7℃', 'fever 37.5℃',
       'anorexia', 'nausea, cough, pneumonia, fever',
       'fever 38.2 ℃, cough', 'fever 37 C', 'malaise, headache',
       'fever (37 ℃), breathing difficulty',
       'fever (37.5 ℃, then 38 ℃), cough, malaise',
       'sore throat, Cough, fever (37 ℃)',
       'fever, shortness of breath, pneumonia', 'severe dyspnea',
       'fever, cough', 'nausea', 'fever; fatigue',
       'chest tightness; fatigue', 'cough; fever',
       'fever; nasal congestion; runny nose; sore throat; c

In [197]:
noNullSymptoms["symptoms"].sample(10)

391550                                                fever
143131                                                fever
139932                                                fever
3223                                         fatigue, fever
223644    cardiogenic shock:acute coronary syndrome:hear...
34738                                          cough, fever
214020                                                fever
43552                          fever, joint pain, pneumonia
585153     Symptoms associated with a respiratory condition
5326                                                  cough
Name: symptoms, dtype: object

In [198]:
fuzz.ratio("chills", "pneumonitis")

24

In [199]:
def extractFuzzy(rowVal, testStrings):
    for s in testStrings:
        if (fuzz.ratio(s, rowVal) > 50):
            return 1
    return 0

In [200]:
# This is a boolean array that will select
# Only the columns in our data which
# contain symptom values.
indexSymptoms = np.logical_not(np.array(c_data["symptoms"].isna()))

In [201]:
c_data.loc[indexSymptoms,"symptoms"]

476         fever, severe pneumonia
504                           fever
756       cough, fever, sore throat
765                    cough, fever
767                  dyspnea, fever
                    ...            
629940                        cough
642814                         mild
643762         cough, fever, nausea
643772                        cough
645994                         mild
Name: symptoms, Length: 2053, dtype: object

In [234]:
c_data.loc[indexSymptoms,"fever"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=(["fever"]))
c_data.loc[indexSymptoms,"cough"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["cough", "phlegm"]]))
c_data.loc[indexSymptoms,"fatigue"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["fatigue", "malaise", "weakness", "chest"]]))
c_data.loc[indexSymptoms,"headache"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["headache"]]))
c_data.loc[indexSymptoms,"dizziness"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["dizzy", "vertigo", "dizziness", "dizz"]]))
c_data.loc[indexSymptoms,"sore throat"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["sore throat", "throat"]]))
c_data.loc[indexSymptoms,"pneumonia"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["pneumonia"]]))
c_data.loc[indexSymptoms,"respiratory"] = c_data.loc[indexSymptoms,"symptoms"].apply(extractFuzzy, args=([["mild respiratory"]]))
c_data.loc[indexSymptoms,"severe_r"] = c_data.loc[indexSymptoms,"symptoms"].apply(extractFuzzy, args=([["acute respiratory", "severe respiratory"]]))
c_data.loc[indexSymptoms,"nausea"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["nausea", "vomiting"]]))
c_data.loc[indexSymptoms,"diarrhea"] = c_data.loc[indexSymptoms,"symptoms"].apply(stringChecker, args=([["diarrhea"]]))
c_data[indexSymptoms].head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,ID,age,sex,city,province,country,latitude,...,cough,fatigue,headache,dizziness,sore throat,pneumonia,respiratory,nausea,diarrhea,severe_r
476,476,476,476,000-1-10426,60,male,Yokohama,Kanagawa Prefecture,Japan,35.45085,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
504,504,504,504,000-1-10451,40-49,male,,Saitama,Japan,35.99736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,756,756,756,000-1-10679,20-29,male,Haneda Airport,Tokyo,Japan,35.553333,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
765,765,765,765,000-1-10687,40-49,male,,Tokyo,Japan,35.71145,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
767,767,767,767,000-1-10689,80-89,male,,Tokyo,Japan,35.71145,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [232]:
c_data.loc[indexSymptoms,"fever"].value_counts()

1.0    1884
0.0     169
Name: fever, dtype: int64

In [231]:
c_data.loc[indexSymptoms, "pneumonia"].value_counts()

0.0    1943
1.0     110
Name: pneumonia, dtype: int64

In [230]:
c_data.loc[indexSymptoms, "respiratory"].value_counts()

0.0    2002
1.0      51
Name: respiratory, dtype: int64

In [229]:
c_data.loc[indexSymptoms, "severe_r"].value_counts()

0.0    1951
1.0     102
Name: severe_r, dtype: int64

In [228]:
c_data.loc[indexSymptoms, "cough"].value_counts()

0.0    1451
1.0     602
Name: cough, dtype: int64

In [235]:
c_data.loc[indexSymptoms, "diarrhea"].value_counts()

0.0    2031
1.0      22
Name: diarrhea, dtype: int64

In [236]:
c_data.loc[indexSymptoms, "sore throat"].value_counts()

0.0    1931
1.0     122
Name: sore throat, dtype: int64

In [233]:
c_data.to_csv("covid19_cleaned.zip", compression="zip")