In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [11]:
raw_data = pd.read_csv("covid19.csv")
c_data = raw_data

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
# Nice to have columns on hand
c_data.columns

Index(['ID', 'age', 'sex', 'city', 'province', 'country', 'latitude',
       'longitude', 'geo_resolution', 'date_onset_symptoms',
       'date_admission_hospital', 'date_confirmation', 'symptoms',
       'lives_in_Wuhan', 'travel_history_dates', 'travel_history_location',
       'reported_market_exposure', 'additional_information',
       'chronic_disease_binary', 'chronic_disease', 'source',
       'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials',
       'travel_history_binary', 'hospitalized', 'deceased'],
      dtype='object')

In [27]:
# Cleaning and extracting information from the "outcome" column.
# "Outcome" is a very good place to begin extracting insights.
# Lots of case outcomes to look at.
raw_data["outcome"].value_counts()

Hospitalized                                                                202475
Recovered                                                                    93656
Deceased                                                                      5011
Under treatment                                                                370
Receiving Treatment                                                            257
Alive                                                                          189
discharge                                                                      175
stable                                                                         156
died                                                                           153
stable condition                                                               130
https://www.mspbs.gov.py/covid-19.php                                          126
discharged                                                                     117
reco

In [28]:
# These functions extract keywords from the "outcome" column.
# "Outcome" gives us information we can categorize into two features:
# "deaths" and "hospitalizations".

# Look for hospital-related keywords.
# This function just wraps the stringChecker function
# and passes it a hardcoded list

#might be better to do this with regular expressions
def hospitalization(s):
    
    # Filter this value out...
    if s == "not hospitalized":
        return 0
    
    # A regular expression may be more 
    # algorithmically efficient
    hospitalStrings = [ 
       "hospital", 
       "intensive", 
       "released",
       "care", 
       "discharged",
       "intubated", 
       "migrated",
       "stable"
       "death",
       "deceased",
       "died",
   ]
    return stringChecker(s, hospitalStrings)

# Issue: we are assuming a death is a hospitalization
# Verify it is sound by checking the statistics of 
# only deaths with hospitalization dates

In [29]:
# Same as above function but for deaths.
def death(s):
    deathStrings = [
        "deceased",
        "died",
        "death",
        "dead"
    ]
    return stringChecker(s, deathStrings)

In [30]:
# Check if s contains a string in the argument list
def stringChecker(s, l):
    s = str(s).lower()
    for word in l:
      if s.find(word) != -1:
        return 1         
    return 0

In [31]:
# Create a new column for hospitalizations
# Use our above function to extract out booleans.
c_data["hospitalized"] = c_data["outcome"].apply(hospitalization)
c_data["hospitalized"].value_counts()

0    2102322
1     207789
Name: hospitalized, dtype: int64

In [32]:
# Same as above for deceased. Extract all deaths.
c_data["deceased"] = c_data["outcome"].apply(death)
c_data["deceased"].value_counts()

0    2304875
1       5236
Name: deceased, dtype: int64

In [33]:
# Cleaning up dates:
# Clean up date values
# remove hyphens, return just the first date
def cleanDate(s):
    s = str(s)
    split = s.split("-")
    return split[0].strip()

In [34]:
# Apply the above function to our three date columns.
# Thankfully our dates are already formatted nicely.
c_data["date_admission_hospital"] = c_data["date_admission_hospital"].apply(cleanDate)
c_data["date_onset_symptoms"] = c_data["date_onset_symptoms"].apply(cleanDate)
c_data["date_confirmation"] = c_data["date_confirmation"].apply(cleanDate)

In [35]:
# Visually inspect date values:
c_data["date_admission_hospital"].unique(), c_data["date_onset_symptoms"].unique(), c_data["date_confirmation"].unique()

(array(['nan', '04.02.2020', '30.01.2020', '14.02.2020', '17.02.2020',
        '13.02.2020', '10.02.2020', '12.02.2020', '15.02.2020',
        '08.02.2020', '05.02.2020', '07.02.2020', '09.02.2020',
        '26.01.2020', '26.02.2020', '03.02.2020', '28.01.2020',
        '06.02.2020', '11.02.2020', '16.02.2020', '19.02.2020',
        '22.01.2020', '18.01.2020', '23.01.2020', '20.02.2020',
        '18.02.2020', '21.02.2020', '24.01.2020', '22.02.2020',
        '29.01.2020', '27.01.2020', '23.02.2020', '24.02.2020',
        '25.02.2020', '25.01.2020', '27.02.2020', '08.01.2020',
        '15.01.2020', '17.01.2020', '28.02.2020', '20.01.2020',
        '01.03.2020', '10.01.2020', '29.03.2020', '29.02.2020',
        '13.01.2020', '03.01.2020', '04.03.2020', '03.03.2020',
        '19.01.2020', '21.01.2020', '05.03.2020', '02.03.2020',
        '03.04.2020', '06.03.2020', '31.01.2020', '08.03.2020',
        '09.03.2020', '10.03.2020', '07.03.2020', '01.02.2020',
        '02.02.2020', '14.01.2020

In [39]:
c_data.head()

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary,hospitalized,deceased
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,Shek Lei,,,Hong Kong,China,8051.0,,,1,0
1,000-1-10,78.0,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,Vo' Euganeo,,,Veneto,Italy,8978.0,,,0,1
2,000-1-100,61.0,female,,,Singapore,1.35346,103.8151,admin0,,...,,,,,Singapore,201.0,,,0,0
3,000-1-1000,,,Zhengzhou City,Henan,China,34.62931,113.468,admin2,,...,,,Zhengzhou City,Henan,China,10115.0,,,0,0
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.51356,113.9029,admin2,,...,,,Pingxiang City,Jiangxi,China,7079.0,,,0,0


In [41]:
c_data.to_csv("covid19_cleaned.csv")