In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from fuzzywuzzy import fuzz



In [31]:
raw_data = pd.read_csv("covid19.csv")

In [5]:
c_data = pd.read_csv("covid19_cleaned.zip")

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# Nice to have columns on hand
c_data.columns

Index(['Unnamed: 0', 'ID', 'age', 'sex', 'city', 'province', 'country',
       'latitude', 'longitude', 'geo_resolution', 'date_onset_symptoms',
       'date_admission_hospital', 'date_confirmation', 'symptoms',
       'lives_in_Wuhan', 'travel_history_dates', 'travel_history_location',
       'reported_market_exposure', 'additional_information',
       'chronic_disease_binary', 'chronic_disease', 'source',
       'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials',
       'travel_history_binary', 'hospitalized', 'deceased'],
      dtype='object')

In [14]:
# Cleaning and extracting information from the "outcome" column.
# "Outcome" is a very good place to begin extracting insights.
# Lots of case outcomes to look at.
raw_data["outcome"].value_counts()

Hospitalized                                                                202475
Recovered                                                                    93656
Deceased                                                                      5011
Under treatment                                                                370
Receiving Treatment                                                            257
Alive                                                                          189
discharge                                                                      175
stable                                                                         156
died                                                                           153
stable condition                                                               130
https://www.mspbs.gov.py/covid-19.php                                          126
discharged                                                                     117
reco

In [17]:
# These functions extract keywords from the "outcome" column.
# "Outcome" gives us information we can categorize into two features:
# "deaths" and "hospitalizations".

# Look for hospital-related keywords.
# This function just wraps the stringChecker function
# and passes it a hardcoded list

#might be better to do this with regular expressions
def hospitalization(s):
    
    # Filter this value out...
    if s == "not hospitalized":
        return 0
    
    # A regular expression may be more 
    # algorithmically efficient
    hospitalStrings = [ 
       "hospital", 
       "intensive", 
       "released",
       "care", 
       "discharged",
       "intubated", 
       "migrated",
       "stable"
       "death",
       "deceased",
       "died",
   ]
    return stringChecker(s, hospitalStrings)

# Issue: we are assuming a death is a hospitalization
# Verify it is sound by checking the statistics of 
# only deaths with hospitalization dates

In [18]:
# Same as above function but for deaths.
def death(s):
    deathStrings = [
        "deceased",
        "died",
        "death",
        "dead"
    ]
    return stringChecker(s, deathStrings)

In [19]:
# Check if s contains a string in the argument list
def stringChecker(s, l):
    s = str(s).lower()
    for word in l:
      if s.find(word) != -1:
        return 1         
    return 0

In [20]:
# Create a new column for hospitalizations
# Use our above function to extract out booleans.
c_data["hospitalized"] = raw_data["outcome"].apply(hospitalization)
c_data["hospitalized"].value_counts()

0    2102322
1     207789
Name: hospitalized, dtype: int64

In [21]:
# Same as above for deceased. Extract all deaths.
c_data["deceased"] = raw_data["outcome"].apply(death)
c_data["deceased"].value_counts()

0    2304875
1       5236
Name: deceased, dtype: int64

In [22]:
# Cleaning up dates:
# Clean up date values
# remove hyphens, return just the first date
def cleanDate(s):
    s = str(s)
    split = s.split("-")
    return split[0].strip()

In [23]:
# Apply the above function to our three date columns.
# Thankfully our dates are already formatted nicely.
c_data["date_admission_hospital"] = raw_data["date_admission_hospital"].apply(cleanDate)
c_data["date_onset_symptoms"] = raw_data["date_onset_symptoms"].apply(cleanDate)
c_data["date_confirmation"] = raw_data["date_confirmation"].apply(cleanDate)

In [24]:
# Visually inspect date values:
c_data["date_admission_hospital"].unique(), c_data["date_onset_symptoms"].unique(), c_data["date_confirmation"].unique()

(array(['nan', '04.02.2020', '30.01.2020', '14.02.2020', '17.02.2020',
        '13.02.2020', '10.02.2020', '12.02.2020', '15.02.2020',
        '08.02.2020', '05.02.2020', '07.02.2020', '09.02.2020',
        '26.01.2020', '26.02.2020', '03.02.2020', '28.01.2020',
        '06.02.2020', '11.02.2020', '16.02.2020', '19.02.2020',
        '22.01.2020', '18.01.2020', '23.01.2020', '20.02.2020',
        '18.02.2020', '21.02.2020', '24.01.2020', '22.02.2020',
        '29.01.2020', '27.01.2020', '23.02.2020', '24.02.2020',
        '25.02.2020', '25.01.2020', '27.02.2020', '08.01.2020',
        '15.01.2020', '17.01.2020', '28.02.2020', '20.01.2020',
        '01.03.2020', '10.01.2020', '29.03.2020', '29.02.2020',
        '13.01.2020', '03.01.2020', '04.03.2020', '03.03.2020',
        '19.01.2020', '21.01.2020', '05.03.2020', '02.03.2020',
        '03.04.2020', '06.03.2020', '31.01.2020', '08.03.2020',
        '09.03.2020', '10.03.2020', '07.03.2020', '01.02.2020',
        '02.02.2020', '14.01.2020

In [39]:
c_data["symptoms"].value_counts()

fever                                              320
Mild to moderate                                   257
Mild:moderate                                      128
cough, fever                                       126
cough                                               52
                                                  ... 
fever:headache:malaise:sore throat:cough             1
fever:cough:acute respiratory distress syndrome      1
cough, fever, grasp                                  1
acute myocardial infarction                          1
cough:cold:fever:aches:headache                      1
Name: symptoms, Length: 448, dtype: int64

In [65]:
len(c_data["symptoms"])

2310111

In [38]:
# Data extraction from symptoms
c_data["symptoms"].unique()

array([nan, 'fever, severe pneumonia', 'fever',
       'cough, fever, sore throat', 'cough, fever', 'dyspnea, fever',
       'cough, difficulty breathing, fever', 'fever (38 ° C)',
       '37.1 ° C, mild coughing', 'cough',
       'abdominal pain, pulmonary inflammation', 'Sore throat',
       'feeling ill, coughing',
       'abdominal pain, diarrhea, fever, general malaise',
       'poor physical condition', 'cough, fatigue, fever', 'yes',
       'fever 38.3', 'fever (39.5 ℃)', 'fever 37.7℃', 'fever 37.5℃',
       'anorexia', 'nausea, cough, pneumonia, fever',
       'fever 38.2 ℃, cough', 'fever 37 C', 'malaise, headache',
       'fever (37 ℃), breathing difficulty',
       'fever (37.5 ℃, then 38 ℃), cough, malaise',
       'sore throat, Cough, fever (37 ℃)',
       'fever, shortness of breath, pneumonia', 'severe dyspnea',
       'fever, cough', 'nausea', 'fever; fatigue',
       'chest tightness; fatigue', 'cough; fever',
       'fever; nasal congestion; runny nose; sore throat; c

In [37]:
noNullSymptoms["symptoms"].sample(10)

253576                    pneumonitis
290019    fever, respiratory symptoms
127521                          fever
108603               Mild to moderate
381573                   cough, fever
108580               Mild to moderate
130133                     fever 38.3
426924      cough, fever, sore throat
12578                    fever, cough
108555               Mild to moderate
Name: symptoms, dtype: object

In [39]:
fuzz.ratio("chills", "pneumonitis")

24

In [61]:
def extractFuzzy(rowVal, testStrings):
    for s in testStrings:
        if (fuzz.ratio(s, rowVal) > 90):
            return 1
    return 0

In [92]:
# This is a boolean array that will select
# Only the columns in our data which
# contain symptom values.
indexSymptoms = np.logical_not(np.array(c_data["symptoms"].isna()))

In [84]:
c_data.loc[indexSymptoms,"symptoms"]

476         fever, severe pneumonia
504                           fever
756       cough, fever, sore throat
765                    cough, fever
767                  dyspnea, fever
                    ...            
629940                        cough
642814                         mild
643762         cough, fever, nausea
643772                        cough
645994                         mild
Name: symptoms, Length: 2053, dtype: object

In [89]:
c_data.loc[indexSymptoms,"fever"] = c_data.loc[indexSymptoms,"symptoms"].apply(extractFuzzy, args=([["fever", "chills"]]))
c_data[indexSymptoms].head()

NameError: name 'npSymptomIndieces' is not defined

In [64]:
noNullSymptoms["fever"].value_counts()

0    1733
1     320
Name: fever, dtype: int64

In [34]:
c_data.to_csv("covid19_cleaned.zip", compression="zip")