In [1]:
import pandas as pd
print("Pandas version: " + pd.__version__)

Pandas version: 1.0.5


In [2]:
# Read in the dataframe:
covid_df = pd.read_csv("latestdata.csv", low_memory=False)

In [3]:
# Check out the head of the data and see if there are any obvious candidates to drop:
covid_df.head()

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,,,Shek Lei,,,Hong Kong,China,8029.0,,
1,000-1-10,78.0,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,22.02.2020,,Vo' Euganeo,,,Veneto,Italy,8954.0,,
2,000-1-100,61.0,female,,,Singapore,1.35346,103.8151,admin0,,...,17.02.2020,,,,,,Singapore,200.0,,
3,000-1-1000,,,Zhengzhou City,Henan,China,34.62931,113.468,admin2,,...,,,,,Zhengzhou City,Henan,China,10091.0,,
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.51356,113.9029,admin2,,...,,,,,Pingxiang City,Jiangxi,China,7060.0,,


In [4]:
# See the data types we are dealing with and how many rows/columns overall. Plus the names of the columns.
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 33 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ID                        object 
 1   age                       object 
 2   sex                       object 
 3   city                      object 
 4   province                  object 
 5   country                   object 
 6   latitude                  float64
 7   longitude                 float64
 8   geo_resolution            object 
 9   date_onset_symptoms       object 
 10  date_admission_hospital   object 
 11  date_confirmation         object 
 12  symptoms                  object 
 13  lives_in_Wuhan            object 
 14  travel_history_dates      object 
 15  travel_history_location   object 
 16  reported_market_exposure  object 
 17  additional_information    object 
 18  chronic_disease_binary    bool   
 19  chronic_disease           object 
 20  source                  

<h3>There are a ton of rows (over 2 and a half million) here and a lot of columns (33). Juding by the names of some columns and the head of the dataframe, a lot of these are not going to be useful or are full of NaN values. I'll start by going through the columns and seeing what is useful and what can be dropped.

In [5]:
# Let's see how many of the values are NaN values. This will help us decide which columns to drop:
covid_df.isna().sum()

ID                                0
age                         2098293
sex                         2096154
city                         977681
province                     452664
country                         115
latitude                         61
longitude                        61
geo_resolution                   61
date_onset_symptoms         2414712
date_admission_hospital     2560100
date_confirmation            108489
symptoms                    2674259
lives_in_Wuhan              2671973
travel_history_dates        2673700
travel_history_location     2667089
reported_market_exposure    2675242
additional_information      2630456
chronic_disease_binary            0
chronic_disease             2676096
source                       566964
sequence_available          2676299
outcome                     2368929
date_death_or_discharge     2673163
notes_for_discussion        2675671
location                    2662935
admin3                      2595877
admin2                      

<h3>There are a tremendous amnount of NaN values which will present a lot of issues. We will likely have to drop most rows in order to craft a useful dataset for the models.

In [6]:
# I think we can immediately get rid of the extra geospatial information and the admin/data mod columns:
covid_df_clean = covid_df.drop(["latitude","longitude","geo_resolution","admin3","admin2","admin1",
                                "admin_id","data_moderator_initials"], axis=1)

In [7]:
# Let's see what information is in the notes column. It probably will not be useful for this project
# since I am not doing text analysis:
covid_df_notes = covid_df["notes_for_discussion"]
covid_df_notes.dropna(inplace=True)
covid_df_notes.values

array(['https://www.google.com/maps/place/Zhenlai+County,+Baicheng,+Jilin,+China/@45.9549881,122.8772653,9z/data=!3m1!4b1!4m5!3m4!1s0x5e6a4c6a8ea66823:0xfb96317ca7cfd392!8m2!3d45.847435!4d123.1998901',
       'https://new.qq.com/omn/20200215/20200215A0LD4100.html',
       'https://new.qq.com/omn/20200215/20200215A0LD4100.html',
       'https://www.google.com/maps/search/%E4%BD%8F%E5%9D%80%E4%B8%BA%E9%95%BF%E6%98%A5%E5%B8%82%E9%AB%98%E6%96%B0%E5%8C%BA%E8%9E%8D%E5%88%9B%E4%B8%8A%E5%9F%8E/@43.8296097,125.2592395,16z/data=!3m1!4b1?hl=en',
       'google.com/maps/place/Luzhou,+Sichuan,+China/@28.8805475,105.3853199,12z/data=!3m1!4b1!4m5!3m4!1s0x36ebaab481693e8d:0x57ee20f6402d239e!8m2!3d28.871569!4d105.44174',
       'https://www.zjwjw.gov.cn/art/2020/2/3/art_1202101_41869217.html',
       'I believe is Cremona, not Codogno',
       'https://www.cbc.ca/news/canada/british-columbia/bc-coronavirus-flight-montreal-vancouver-1.5473283',
       'possible double count',
       'Possible double cou

<h3>Lots and lots of text! Not useful at the moment for the scope of this project. This column can be dropped.

In [8]:
# Dropping the notes column:
covid_df_clean = covid_df_clean.drop(["notes_for_discussion"], axis=1)

In [9]:
# Review the remaining columns to see what else we can get rid of:
covid_df_clean.columns

Index(['ID', 'age', 'sex', 'city', 'province', 'country',
       'date_onset_symptoms', 'date_admission_hospital', 'date_confirmation',
       'symptoms', 'lives_in_Wuhan', 'travel_history_dates',
       'travel_history_location', 'reported_market_exposure',
       'additional_information', 'chronic_disease_binary', 'chronic_disease',
       'source', 'sequence_available', 'outcome', 'date_death_or_discharge',
       'location', 'country_new', 'travel_history_binary'],
      dtype='object')

<h3>The fact that we have a "country" column and a "country_new" column is strange. I will explore what makes them different and which one should be droppped:

In [10]:
# Let's see what information between the country and country_new columns is different:
# Start by getting both of the columns by themselves from the original dataframe into a new dataframe "country_df" and
# dropping any NaN values:
country_df = covid_df[["country","country_new"]].dropna()
#Return a dataframe that shows only the rows where the two columns differ:
country_df[country_df.country != country_df.country_new]

Unnamed: 0,country,country_new
87313,Algeria,Algiers
87324,Algeria,Algiers
87335,Algeria,Algiers
87346,Algeria,Algiers
87358,Algeria,Algiers
...,...,...
101825,Algeria,Algiers
574547,Ecuador,Ecuadar
575702,Ecuador,Ecuadar
576279,Ecuador,Ecuadar


<h3>At first glance, it seems like the only difference is that country_new has errors in spelling "Ecuador" and that it says "Algiers" (not a country!) instead of "Alegeria." I think we can get rid of country_new, then.

In [11]:
# Drop the "country_new" column:
covid_df_clean = covid_df_clean.drop("country_new", axis=1)

In [12]:
covid_df_clean.columns

Index(['ID', 'age', 'sex', 'city', 'province', 'country',
       'date_onset_symptoms', 'date_admission_hospital', 'date_confirmation',
       'symptoms', 'lives_in_Wuhan', 'travel_history_dates',
       'travel_history_location', 'reported_market_exposure',
       'additional_information', 'chronic_disease_binary', 'chronic_disease',
       'source', 'sequence_available', 'outcome', 'date_death_or_discharge',
       'location', 'travel_history_binary'],
      dtype='object')

In [13]:
symptoms = covid_df.symptoms
symptoms.dropna(inplace=True)
symptoms.values

array(['fever, severe pneumonia', 'fever', 'cough, fever, sore throat',
       ..., 'cough, fever, nausea', 'cough', 'mild'], dtype=object)

In [14]:
country_df[country_df.country != country_df.country_new].values

array([['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       ['Algeria', 'Algiers'],
       [