In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, os
import glob

In [2]:
# READ ME:
# Before running, make sure casesNA.csv, casesCanada.csv and casesUSA.csv are not in the directory.
# If they are, delete then proceed to run the notebook.

In [3]:
# Take all 217 csv files and merge it into one big dataframe
# Eventually we would want to split the dataframe into three csvs - 
# canada, usa, and north america

In [4]:
# Retrieved from: https://stackoverflow.com/questions/58274401/importing-multiple-csv-files-into-pandas-and-merge-them-into-one-dataframe
path = os.getcwd() # Find path of current working directory
all_files = glob.glob(path + "/*.csv")
dfs = list()

# Add all data files into one list and merge into one big df
for file in all_files:
    df = pd.read_csv(file)
    
    # Fix columns with different names but same content
    # i.e. Country_Region -> Country/Region
    if set(['FIPS']).issubset(df.columns):
        df = df.drop(['FIPS'], axis = 1)
    if 'Admin2' in df:
        df = df.rename(columns = {'Admin2': 'County'}) 
    if 'Province_State' in df:
        df = df.rename(columns = {'Province_State': 'Province/State'})
    if 'Country_Region' in df:
        df = df.rename(columns = {'Country_Region': 'Country/Region'})
    if 'Last_Update' in df:
        df = df.rename(columns = {'Last_Update': 'Last Update'})
    if 'Lat' in df:
        df = df.rename(columns = {'Lat': 'Latitude'})
    if 'Long_' in df:
        df = df.rename(columns = {'Long_': 'Longitude'})
        
    # Add Date column
    date = file.strip(path)
    date = date.strip('.')
    df['Date'] = date
    dfs.append(df)

frame = pd.concat(dfs, ignore_index = True, axis = 0)
frame["Last Update"] = pd.to_datetime(frame["Last Update"])
frame.sort_values(by = "Last Update")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,County,Date,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered
30618,0.0,,"Qinghai, China",18,China,,03-30-2020,0,,2020-02-23 11:19:00,35.745200,95.995600,Qinghai,18
66529,0.0,,"Tibet, China",1,China,,04-12-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1
66510,0.0,,"Qinghai, China",18,China,,04-12-2020,0,,2020-02-23 11:19:00,35.745200,95.995600,Qinghai,18
43402,0.0,,"Qinghai, China",18,China,,04-04-2020,0,,2020-02-23 11:19:00,35.745200,95.995600,Qinghai,18
43420,0.0,,"Tibet, China",1,China,,04-04-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1
30636,0.0,,"Tibet, China",1,China,,03-30-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1
38118,0.0,,"Tibet, China",1,China,,04-02-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1
48988,0.0,,"Tibet, China",1,China,,04-06-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1
48969,0.0,,"Qinghai, China",18,China,,04-06-2020,0,,2020-02-23 11:19:00,35.745200,95.995600,Qinghai,18
27198,0.0,,"Tibet, China",1,China,,03-29-2020,0,,2020-02-23 11:19:00,31.692700,88.092400,Tibet,1


In [5]:
# Unfiltered df
frame

Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,County,Date,Deaths,Incidence_Rate,Last Update,Latitude,Longitude,Province/State,Recovered
0,0.0,,"New York City, New York, US",9654,US,New York City,03-22-2020,63,,2020-03-22 23:45:00,40.767273,-73.971526,New York,0
1,0.0,,"Nassau, New York, US",1900,US,Nassau,03-22-2020,4,,2020-03-22 23:45:00,40.740665,-73.589419,New York,0
2,0.0,,"Westchester, New York, US",1873,US,Westchester,03-22-2020,0,,2020-03-22 23:45:00,41.162784,-73.757417,New York,0
3,0.0,,"Suffolk, New York, US",1034,US,Suffolk,03-22-2020,9,,2020-03-22 23:45:00,40.883201,-72.801217,New York,0
4,0.0,,"Rockland, New York, US",455,US,Rockland,03-22-2020,1,,2020-03-22 23:45:00,41.150279,-74.025605,New York,0
5,0.0,,"Orange, New York, US",247,US,Orange,03-22-2020,0,,2020-03-22 23:45:00,41.403375,-74.302408,New York,0
6,0.0,,"Albany, New York, US",123,US,Albany,03-22-2020,0,,2020-03-22 23:45:00,42.600603,-73.977239,New York,0
7,0.0,,"Dutchess, New York, US",82,US,Dutchess,03-22-2020,0,,2020-03-22 23:45:00,41.764861,-73.743567,New York,0
8,0.0,,"Monroe, New York, US",68,US,Monroe,03-22-2020,1,,2020-03-22 23:45:00,43.146389,-77.693229,New York,0
9,0.0,,"Erie, New York, US",56,US,Erie,03-22-2020,0,,2020-03-22 23:45:00,42.762490,-78.730637,New York,0


In [6]:
# North America Cases - US/NA, not counting the Diamond Princess Cruise cases
# Omit Grand Princess cruise cases?
frame_filtered = frame.loc[(frame["Country/Region"] == "US") | (frame["Country/Region"] == "Canada")]
frame_filtered = frame_filtered.loc[(frame_filtered["Province/State"] != "Diamond Princess") &
                                    (frame_filtered["Province/State"] != "Grand Princess") & 
                                    (frame_filtered["Province/State"] != "Puerto Rico")].reset_index(drop = True)

# Drop unnessesary columns
frame_filtered = frame_filtered.drop(columns = ["Last Update"])

In [7]:
frame_filtered

Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,County,Date,Deaths,Incidence_Rate,Latitude,Longitude,Province/State,Recovered
0,0.0,,"New York City, New York, US",9654,US,New York City,03-22-2020,63,,40.767273,-73.971526,New York,0
1,0.0,,"Nassau, New York, US",1900,US,Nassau,03-22-2020,4,,40.740665,-73.589419,New York,0
2,0.0,,"Westchester, New York, US",1873,US,Westchester,03-22-2020,0,,41.162784,-73.757417,New York,0
3,0.0,,"Suffolk, New York, US",1034,US,Suffolk,03-22-2020,9,,40.883201,-72.801217,New York,0
4,0.0,,"Rockland, New York, US",455,US,Rockland,03-22-2020,1,,41.150279,-74.025605,New York,0
5,0.0,,"Orange, New York, US",247,US,Orange,03-22-2020,0,,41.403375,-74.302408,New York,0
6,0.0,,"Albany, New York, US",123,US,Albany,03-22-2020,0,,42.600603,-73.977239,New York,0
7,0.0,,"Dutchess, New York, US",82,US,Dutchess,03-22-2020,0,,41.764861,-73.743567,New York,0
8,0.0,,"Monroe, New York, US",68,US,Monroe,03-22-2020,1,,43.146389,-77.693229,New York,0
9,0.0,,"Erie, New York, US",56,US,Erie,03-22-2020,0,,42.762490,-78.730637,New York,0


In [8]:
# Canada Cases
casesCanada = frame_filtered.loc[(frame_filtered["Country/Region"] == "Canada")].reset_index(drop = True)
casesCanada = casesCanada.drop(columns = ["County"])

In [9]:
casesCanada

Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,Date,Deaths,Incidence_Rate,Latitude,Longitude,Province/State,Recovered
0,0.0,,"Ontario, Canada",425,Canada,03-22-2020,5,,51.2538,-85.3232,Ontario,0
1,0.0,,"British Columbia, Canada",424,Canada,03-22-2020,10,,53.7267,-127.6476,British Columbia,0
2,0.0,,"Alberta, Canada",259,Canada,03-22-2020,1,,53.9333,-116.5765,Alberta,0
3,0.0,,"Quebec, Canada",219,Canada,03-22-2020,4,,52.9399,-73.5491,Quebec,0
4,0.0,,"Saskatchewan, Canada",52,Canada,03-22-2020,0,,52.9399,-106.4509,Saskatchewan,0
5,0.0,,"Nova Scotia, Canada",28,Canada,03-22-2020,0,,44.6820,-63.7443,Nova Scotia,0
6,0.0,,"Manitoba, Canada",20,Canada,03-22-2020,0,,53.7609,-98.8139,Manitoba,0
7,0.0,,"New Brunswick, Canada",17,Canada,03-22-2020,0,,46.5653,-66.4619,New Brunswick,0
8,0.0,,"Newfoundland and Labrador, Canada",9,Canada,03-22-2020,0,,53.1355,-57.6604,Newfoundland and Labrador,0
9,0.0,,"Prince Edward Island, Canada",3,Canada,03-22-2020,0,,46.5107,-63.4168,Prince Edward Island,0


In [10]:
# USA Cases 
casesUS = frame_filtered.loc[(frame_filtered["Country/Region"] == "US")].reset_index(drop = True)

In [11]:
casesUS

Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,County,Date,Deaths,Incidence_Rate,Latitude,Longitude,Province/State,Recovered
0,0.0,,"New York City, New York, US",9654,US,New York City,03-22-2020,63,,40.767273,-73.971526,New York,0
1,0.0,,"Nassau, New York, US",1900,US,Nassau,03-22-2020,4,,40.740665,-73.589419,New York,0
2,0.0,,"Westchester, New York, US",1873,US,Westchester,03-22-2020,0,,41.162784,-73.757417,New York,0
3,0.0,,"Suffolk, New York, US",1034,US,Suffolk,03-22-2020,9,,40.883201,-72.801217,New York,0
4,0.0,,"Rockland, New York, US",455,US,Rockland,03-22-2020,1,,41.150279,-74.025605,New York,0
5,0.0,,"Orange, New York, US",247,US,Orange,03-22-2020,0,,41.403375,-74.302408,New York,0
6,0.0,,"Albany, New York, US",123,US,Albany,03-22-2020,0,,42.600603,-73.977239,New York,0
7,0.0,,"Dutchess, New York, US",82,US,Dutchess,03-22-2020,0,,41.764861,-73.743567,New York,0
8,0.0,,"Monroe, New York, US",68,US,Monroe,03-22-2020,1,,43.146389,-77.693229,New York,0
9,0.0,,"Erie, New York, US",56,US,Erie,03-22-2020,0,,42.762490,-78.730637,New York,0


In [12]:
casesNA = frame_filtered

In [13]:
casesNA

Unnamed: 0,Active,Case-Fatality_Ratio,Combined_Key,Confirmed,Country/Region,County,Date,Deaths,Incidence_Rate,Latitude,Longitude,Province/State,Recovered
0,0.0,,"New York City, New York, US",9654,US,New York City,03-22-2020,63,,40.767273,-73.971526,New York,0
1,0.0,,"Nassau, New York, US",1900,US,Nassau,03-22-2020,4,,40.740665,-73.589419,New York,0
2,0.0,,"Westchester, New York, US",1873,US,Westchester,03-22-2020,0,,41.162784,-73.757417,New York,0
3,0.0,,"Suffolk, New York, US",1034,US,Suffolk,03-22-2020,9,,40.883201,-72.801217,New York,0
4,0.0,,"Rockland, New York, US",455,US,Rockland,03-22-2020,1,,41.150279,-74.025605,New York,0
5,0.0,,"Orange, New York, US",247,US,Orange,03-22-2020,0,,41.403375,-74.302408,New York,0
6,0.0,,"Albany, New York, US",123,US,Albany,03-22-2020,0,,42.600603,-73.977239,New York,0
7,0.0,,"Dutchess, New York, US",82,US,Dutchess,03-22-2020,0,,41.764861,-73.743567,New York,0
8,0.0,,"Monroe, New York, US",68,US,Monroe,03-22-2020,1,,43.146389,-77.693229,New York,0
9,0.0,,"Erie, New York, US",56,US,Erie,03-22-2020,0,,42.762490,-78.730637,New York,0


In [14]:
# Convert dataframes into CSVs
casesNA.to_csv('casesNA.csv', index = False, compression = 'gzip')
casesUS.to_csv('casesUS.csv', index = False, compression = 'gzip')
casesCanada.to_csv('casesCanada.csv', index = False, compression = 'gzip')