In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, os
import glob

In [2]:
# Take all 217 CSV files and merge them into one big dataframe.
# Eventually we want to split the dataframe into three CSVs:
# Canada, USA, and North America (both combined).

In [3]:
# Adapted from: https://stackoverflow.com/questions/58274401/importing-multiple-csv-files-into-pandas-and-merge-them-into-one-dataframe
# Load every input CSV in the working directory, normalise the column names,
# and stack everything into one DataFrame called `frame`.
path = os.getcwd() # Find path of current working directory

# CSVs that this notebook itself writes into the same directory. They must never
# be read back as input, otherwise a re-run would ingest its own output.
generated_files = {'casesNA.csv', 'casesUS.csv', 'casesCanada.csv'}

# glob returns matches in arbitrary order; sorting makes the row order of the
# merged frame identical on every run.
all_files = sorted(
    csv_path
    for csv_path in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(csv_path) not in generated_files
)
if not all_files:
    # pd.concat raises a ValueError on an empty list anyway; fail earlier with
    # a message that says what is actually missing.
    raise ValueError("No input CSV files found in {}".format(path))

# Fix columns with different names but same content
# i.e. Country_Region -> Country/Region
column_renames = {
    'Province_State': 'Province/State',
    'Country_Region': 'Country/Region',
    'Last_Update': 'Last Update',
    'Lat': 'Latitude',
    'Long_': 'Longitude',
}

# Add all data files into one list and merge into one big df
dfs = list()
for csv_path in all_files:
    df = pd.read_csv(csv_path)

    # FIPS / Admin2 are dropped only when a file carries both of them
    if {'FIPS', 'Admin2'}.issubset(df.columns):
        df = df.drop(['FIPS', 'Admin2'], axis = 1)

    # rename() skips mapping keys that are not present in this file, so one
    # call covers every header variant without per-column checks.
    df = df.rename(columns = column_renames)
    dfs.append(df)

# sort = True is passed explicitly: it keeps the alphabetical column order that
# older pandas produced by default when the files' columns do not line up, and
# it silences the "sort by default" warning about the changing default.
frame = pd.concat(dfs, ignore_index = True, axis = 0, sort = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [4]:
# Unfiltered df: the merged frame of all input CSVs, before any country filtering
frame

Unnamed: 0.1,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered,Unnamed: 0
0,5092.0,3.706933,Afghanistan,40141,Afghanistan,1488,103.115106,2020-10-18 04:24:25,33.939110,67.709953,,33561,
1,6325.0,2.670800,Albania,16774,Albania,448,582.875808,2020-10-18 04:24:25,41.153300,20.168300,,10001,
2,14386.0,3.405716,Algeria,54203,Algeria,1846,123.607094,2020-10-18 04:24:25,28.033900,1.659600,,37971,
3,1261.0,1.747113,Andorra,3377,Andorra,59,4370.672361,2020-10-18 04:24:25,42.506300,1.521800,,2057,
4,4199.0,3.229697,Angola,7462,Angola,241,22.704129,2020-10-18 04:24:25,-11.202700,17.873900,,3022,
5,15.0,2.521008,Antigua and Barbuda,119,Antigua and Barbuda,3,121.517850,2020-10-18 04:24:25,17.060800,-61.796400,,101,
6,161838.0,2.666377,Argentina,979119,Argentina,26107,2166.394882,2020-10-18 04:24:25,-38.416100,-63.616700,,791174,
7,14008.0,1.693651,Armenia,63000,Armenia,1067,2126.055519,2020-10-18 04:24:25,40.069100,45.038200,,47925,
8,0.0,2.654867,"Australian Capital Territory, Australia",113,Australia,3,26.395702,2020-10-18 04:24:25,-35.473500,149.012400,Australian Capital Territory,110,
9,1143.0,1.221761,"New South Wales, Australia",4338,Australia,53,53.436807,2020-10-18 04:24:25,-33.868800,151.209300,New South Wales,3142,


In [5]:
# North America cases: keep rows whose Country/Region is US or Canada, and
# leave out rows whose Province/State is the Diamond Princess cruise ship.
# TODO: decide whether the Grand Princess cruise rows should be omitted too.
country = frame["Country/Region"]
in_north_america = (country == "US") | (country == "Canada")
on_diamond_princess = frame["Province/State"] == "Diamond Princess"

# Single selection with the combined mask, then renumber the rows from 0
frame_filtered = frame.loc[in_north_america & ~on_diamond_princess].reset_index(drop = True)

In [6]:
# Preview the US/Canada rows kept by the filter (Diamond Princess rows removed)
frame_filtered

Unnamed: 0.1,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered,Unnamed: 0
0,2836.0,1.322618,"Alberta, Canada",21775,Canada,288,493.412183,2020-10-18 04:24:25,53.933300,-116.576500,Alberta,18651,
1,1513.0,2.243275,"British Columbia, Canada",11189,Canada,251,218.923532,2020-10-18 04:24:25,53.726700,-127.647600,British Columbia,9425,
2,0.0,0.000000,"Grand Princess, Canada",13,Canada,0,,2020-10-18 04:24:25,,,Grand Princess,13,
3,1582.0,1.197605,"Manitoba, Canada",3173,Canada,38,230.341985,2020-10-18 04:24:25,53.760900,-98.813900,Manitoba,1553,
4,92.0,0.673401,"New Brunswick, Canada",297,Canada,2,38.077265,2020-10-18 04:24:25,46.565300,-66.461900,New Brunswick,203,
5,12.0,1.393728,"Newfoundland and Labrador, Canada",287,Canada,4,55.047807,2020-10-18 04:24:25,53.135500,-57.660400,Newfoundland and Labrador,271,
6,0.0,0.000000,"Northwest Territories,Canada",5,Canada,0,11.134865,2020-10-18 04:24:25,64.825500,-124.845700,Northwest Territories,5,
7,4.0,5.946935,"Nova Scotia, Canada",1093,Canada,65,111.820776,2020-10-18 04:24:25,44.682000,-63.744300,Nova Scotia,1024,
8,6013.0,4.691129,"Ontario, Canada",65869,Canada,3090,447.728212,2020-10-18 04:24:25,51.253800,-85.323200,Ontario,56766,
9,3.0,0.000000,"Prince Edward Island, Canada",63,Canada,0,39.833584,2020-10-18 04:24:25,46.510700,-63.416800,Prince Edward Island,60,


In [7]:
# Canada cases: the Canada-only rows of the filtered frame, renumbered from 0
is_canada = frame_filtered["Country/Region"] == "Canada"
casesCanada = frame_filtered[is_canada].reset_index(drop = True)

In [8]:
# Preview the Canada-only subset
casesCanada

Unnamed: 0.1,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered,Unnamed: 0
0,2836.0,1.322618,"Alberta, Canada",21775,Canada,288,493.412183,2020-10-18 04:24:25,53.9333,-116.5765,Alberta,18651,
1,1513.0,2.243275,"British Columbia, Canada",11189,Canada,251,218.923532,2020-10-18 04:24:25,53.7267,-127.6476,British Columbia,9425,
2,0.0,0.000000,"Grand Princess, Canada",13,Canada,0,,2020-10-18 04:24:25,,,Grand Princess,13,
3,1582.0,1.197605,"Manitoba, Canada",3173,Canada,38,230.341985,2020-10-18 04:24:25,53.7609,-98.8139,Manitoba,1553,
4,92.0,0.673401,"New Brunswick, Canada",297,Canada,2,38.077265,2020-10-18 04:24:25,46.5653,-66.4619,New Brunswick,203,
5,12.0,1.393728,"Newfoundland and Labrador, Canada",287,Canada,4,55.047807,2020-10-18 04:24:25,53.1355,-57.6604,Newfoundland and Labrador,271,
6,0.0,0.000000,"Northwest Territories,Canada",5,Canada,0,11.134865,2020-10-18 04:24:25,64.8255,-124.8457,Northwest Territories,5,
7,4.0,5.946935,"Nova Scotia, Canada",1093,Canada,65,111.820776,2020-10-18 04:24:25,44.6820,-63.7443,Nova Scotia,1024,
8,6013.0,4.691129,"Ontario, Canada",65869,Canada,3090,447.728212,2020-10-18 04:24:25,51.2538,-85.3232,Ontario,56766,
9,3.0,0.000000,"Prince Edward Island, Canada",63,Canada,0,39.833584,2020-10-18 04:24:25,46.5107,-63.4168,Prince Edward Island,60,


In [None]:
# USA cases: the US-only rows of the filtered frame, renumbered from 0
is_us = frame_filtered["Country/Region"] == "US"
casesUS = frame_filtered[is_us].reset_index(drop = True)

In [None]:
# Preview the US-only subset
casesUS

Unnamed: 0.1,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered,Unnamed: 0
0,1955.0,1.412002,"Autauga, Alabama, US",1983,US,28,3549.374429,2020-10-18 04:24:25,32.539527,-86.644082,Alabama,0,
1,6283.0,1.055118,"Baldwin, Alabama, US",6350,US,67,2844.548769,2020-10-18 04:24:25,30.727750,-87.722071,Alabama,0,
2,968.0,0.921187,"Barbour, Alabama, US",977,US,9,3957.708823,2020-10-18 04:24:25,31.868263,-85.387129,Alabama,0,
3,762.0,1.677419,"Bibb, Alabama, US",775,US,13,3460.748415,2020-10-18 04:24:25,32.996421,-87.125115,Alabama,0,
4,1784.0,1.272828,"Blount, Alabama, US",1807,US,23,3124.891917,2020-10-18 04:24:25,33.982109,-86.567906,Alabama,0,
5,617.0,2.681388,"Bullock, Alabama, US",634,US,17,6276.606277,2020-10-18 04:24:25,32.100305,-85.712655,Alabama,0,
6,950.0,4.040404,"Butler, Alabama, US",990,US,40,5090.497738,2020-10-18 04:24:25,31.753001,-86.680575,Alabama,0,
7,4007.0,1.451058,"Calhoun, Alabama, US",4066,US,59,3579.067823,2020-10-18 04:24:25,33.774837,-85.826304,Alabama,0,
8,1223.0,3.396524,"Chambers, Alabama, US",1266,US,43,3807.060805,2020-10-18 04:24:25,32.913601,-85.390727,Alabama,0,
9,699.0,1.963534,"Cherokee, Alabama, US",713,US,14,2721.789586,2020-10-18 04:24:25,34.178060,-85.606390,Alabama,0,


In [None]:
# North America cases (US + Canada). This binds a second name to the same
# DataFrame object as frame_filtered; no copy is made.
casesNA = frame_filtered

In [None]:
# Write each subset to disk as a gzip-compressed CSV, without the index column.
# The files get a .csv.gz suffix so the name matches the gzip content: a plain
# .csv name hides the compression from readers that infer it from the
# extension, and would also be matched by the "*.csv" glob of the loading cell.
exports = [
    ('casesNA', casesNA),
    ('casesUS', casesUS),
    ('casesCanada', casesCanada),
]
for out_name, out_frame in exports:
    out_frame.to_csv(out_name + '.csv.gz', index = False, compression = 'gzip')