### Combine Mobility Data with COVID Cases & Deaths Data

In [55]:
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Read in Processed Mobility Data

In [56]:
mobility = "https://raw.githubusercontent.com/ehuang13/w209_final/master/data/US_Mobility_Report_preprocess.csv"
mobility_df = pd.read_csv(mobility, low_memory = False)

In [57]:
print("Mobility data dimensions: {}".format(mobility_df.shape))
mobility_df.sample(5)

Mobility data dimensions: (298720, 15)


Unnamed: 0.1,Unnamed: 0,country_region_code,country_region,state,county,iso_3166_2_code,census_fips_code,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,avg_change
144824,144824,US,United States,Missouri,Bollinger County,,29017,2/19/2020,12.5,3.0,10.7,4.0,1.0,-0.714286,5.080952
49340,49340,US,United States,Georgia,Bibb County,,13021,5/6/2020,-12.0,-2.0,62.0,10.0,-37.0,12.0,5.5
267710,267710,US,United States,Virginia,Lynchburg,,51680,3/1/2020,18.0,11.0,8.4375,20.6,4.0,-2.0,10.00625
9169,9169,US,United States,Arizona,Pima County,,4019,4/2/2020,-40.0,-13.0,-23.0,-40.0,-47.0,18.0,-24.166667
142655,142655,US,United States,Mississippi,Oktibbeha County,,28105,5/13/2020,-18.0,-16.0,1.947368,-8.727273,-39.0,7.0,-12.129984


Aggregate Mobility Data by State

In [93]:
# group by state and date
state_df = mobility_df.groupby(["state","date"]).mean()
state_df = state_df.reset_index()

# checkout grouped dataframe
print("Mobility state data dimensions: {}".format(state_df.shape))
state_df.sample(5)                                        

Mobility state data dimensions: (5700, 12)


Unnamed: 0.1,state,date,Unnamed: 0,iso_3166_2_code,census_fips_code,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,avg_change
2104,Maine,4/15/2020,113727.5,,23016.0,-28.072917,-5.95,10.978125,-39.575,-41.125,16.334375,-14.568403
1020,Florida,6/2/2020,37498.0,,12067.910448,-13.955224,-3.141791,-9.507463,-20.447761,-26.447761,9.880597,-10.603234
5556,Wisconsin,5/17/2020,295125.5,,55074.033333,-27.658333,-12.025,-24.916667,-29.641667,-20.375,8.908333,-17.618056
3992,Oklahoma,2/17/2020,201930.0,,40081.776119,0.283582,-2.589552,14.024165,2.712864,-18.716418,4.583582,0.049704
684,Connecticut,2/15/2020,29085.5,,9008.0,4.5,-4.5,10.125,9.0625,-1.5,-0.125,2.927083


In [95]:
# drop unnecessary columns
state_df = state_df.drop(columns=['Unnamed: 0', 'iso_3166_2_code', 'census_fips_code'])
state_df.head(5)

Unnamed: 0,state,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,avg_change
0,Alabama,2/15/2020,4.606061,0.166667,50.510101,5.875,0.808081,-0.709596,10.209386
1,Alabama,2/16/2020,-1.446154,-4.146154,-6.471795,1.638034,-1.882051,0.705128,-1.933832
2,Alabama,2/17/2020,-1.447761,-4.320896,12.652985,5.408197,-16.253731,4.376866,0.069277
3,Alabama,2/18/2020,-4.134328,-4.925373,-12.069403,0.513859,0.328358,1.481343,-3.134257
4,Alabama,2/19/2020,1.432836,-0.820896,7.61791,2.641791,1.402985,0.074627,2.058209


Read in COVID Cases & Deaths Data

In [58]:
deaths = "https://raw.githubusercontent.com/ehuang13/w209_final/master/ernesto/covid_death_proccessed.csv"
deaths_df = pd.read_csv(deaths)

In [59]:
print("Deaths dataframe dimensions: {}".format(deaths_df.shape))
deaths_df.sample(5)

Deaths dataframe dimensions: (8772, 5)


Unnamed: 0.1,Unnamed: 0,Date,Deaths_Sum,State,Deaths_Day
3391,123,5/24/20,6372,MA,68.0
7572,4,1/26/20,0,UT,0.0
5631,127,5/28/20,335,NM,6.0
3662,50,3/12/20,0,ME,0.0
3445,5,1/27/20,0,MD,0.0


In [60]:
cases = "https://raw.githubusercontent.com/ehuang13/w209_final/master/ernesto/covid_cases_proccessed.csv"
cases_df = pd.read_csv(cases)

In [61]:
print("Cases dataframe dimensions: {}".format(cases_df.shape))
cases_df.sample(5)

Cases dataframe dimensions: (8772, 5)


Unnamed: 0.1,Unnamed: 0,Date,Cases_Sum,State,Cases_Day
607,91,4/22/20,5459,AZ,208.0
2269,33,2/24/20,0,ID,0.0
4728,84,4/15/20,5099,NC,111.0
4044,88,4/19/20,2347,MN,140.0
858,170,7/10/20,311254,CA,10938.0


Join Cases, Deaths, and Mobility Data on `date` and `state`
1. join covid cases and deaths data on `date` and `state`
2. add in proper state name (match on state abbreviation)
3.  merge covid data with mobility data on `date` and `state`

In [62]:
# join covid cases and deaths data on date and state
covid_df = cases_df.merge(deaths_df, how="left",
                         left_on=["Date", "State"],
                         right_on=["Date", "State"])

In [63]:
# checkout merged covid dataframe
covid_df.sample(5)

Unnamed: 0,Unnamed: 0_x,Date,Cases_Sum,State,Cases_Day,Unnamed: 0_y,Deaths_Sum,Deaths_Day
839,151,6/21/20,177691,CA,3268.0,151,5507,21.0
2766,14,2/5/20,0,KS,0.0,14,0,0.0
1643,95,4/26/20,31527,FL,691.0,95,1074,20.0
2483,75,4/6/20,12258,IL,1005.0,75,307,33.0
3811,27,2/18/20,0,MI,0.0,27,0,0.0


In [64]:
# drop unncessary columns
covid_df = covid_df.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])

# rename columns
covid_df.columns = ["Date", "Total_Cases", "State_Abbrv", "Cases_Day",
                   "Total_Deaths", "Deaths_Day"]

covid_df.sample(5)

Unnamed: 0,Date,Total_Cases,State_Abbrv,Cases_Day,Total_Deaths,Deaths_Day
1811,4/22/20,20493,GA,831.0,841,29.0
4790,6/16/20,45874,NC,782.0,1147,34.0
1282,4/9/20,1523,DC,83.0,22,0.0
4595,5/24/20,483,MT,0.0,18,0.0
2335,4/30/20,2013,ID,31.0,62,2.0


In [65]:
# dictionary of states and abbreviations
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [66]:
# add in state column to covid dataframe
State = []
for abbr in covid_df["State_Abbrv"]:
    state = states.get(abbr)
    State.append(state)

# add in State column
covid_df["State"] = State

In [67]:
# checkout updated covid dataframe
print("Covid data dimensions: {}".format(covid_df.shape))
covid_df.sample(5)

Covid data dimensions: (8772, 7)


Unnamed: 0,Date,Total_Cases,State_Abbrv,Cases_Day,Total_Deaths,Deaths_Day,State
1862,6/12/20,56044,GA,901.0,2418,32.0,Georgia
2726,6/16/20,40760,IN,356.0,2265,14.0,Indiana
5745,3/31/20,1113,NV,104.0,26,8.0,Nevada
5941,4/24/20,271659,NY,8130.0,20877,495.0,New York
7077,2/16/20,0,SD,0.0,0,0.0,South Dakota


In [None]:
# merge covid and mobility data on state and date
combined_df = state_df.merge(covid_df, how="right",
                               left_on=["state", "date"],
                               right_on=["Date", "State"])

In [102]:
for i in state_df["date"][:5]:
    print(i)

2/15/2020
2/16/2020
2/17/2020
2/18/2020
2/19/2020


In [103]:
for i in covid_df["Date"][:5]:
    print(i)

1/22/20
1/23/20
1/24/20
1/25/20
1/26/20


In [98]:
print("Combined data dimensions: {}".format(combined_df.shape))
combined_df.sample(5)

Combined data dimensions: (5700, 16)


Unnamed: 0,state,date,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential,avg_change,Date,Total_Cases,State_Abbrv,Cases_Day,Total_Deaths,Deaths_Day,State
518,California,4/24/2020,-49.732143,-10.389881,-15.116071,-37.142857,-43.160714,19.196429,-22.724206,,,,,,,
953,Florida,3/5/2020,8.181818,5.848485,2.628788,5.496212,0.598485,0.106061,3.809975,,,,,,,
5220,Virginia,5/22/2020,-18.339695,1.125954,30.439499,-17.718103,-36.122137,16.627045,-3.997906,,,,,,,
2683,Mississippi,4/23/2020,-25.863636,-5.047619,-28.793723,-20.018182,-35.779221,15.785281,-16.619517,,,,,,,
2695,Mississippi,4/7/2020,-38.731768,-10.129371,-31.790584,-23.506494,-38.506494,17.553946,-20.851794,,,,,,,


In [77]:
covid_df.head()

Unnamed: 0,Date,Total_Cases,State_Abbrv,Cases_Day,Total_Deaths,Deaths_Day,State
0,1/22/20,0,AK,0.0,0,0.0,Alaska
1,1/23/20,0,AK,0.0,0,0.0,Alaska
2,1/24/20,0,AK,0.0,0,0.0,Alaska
3,1/25/20,0,AK,0.0,0,0.0,Alaska
4,1/26/20,0,AK,0.0,0,0.0,Alaska


In [79]:
test = covid_df.groupby(["Date", "State"]).sum()
test = test.reset_index()
test

Unnamed: 0,Date,State,Total_Cases,Cases_Day,Total_Deaths,Deaths_Day
0,1/22/20,Alabama,0,0.0,0,0.0
1,1/22/20,Alaska,0,0.0,0,0.0
2,1/22/20,Arizona,0,0.0,0,0.0
3,1/22/20,Arkansas,0,0.0,0,0.0
4,1/22/20,California,0,0.0,0,0.0
5,1/22/20,Colorado,0,0.0,0,0.0
6,1/22/20,Connecticut,0,0.0,0,0.0
7,1/22/20,Delaware,0,0.0,0,0.0
8,1/22/20,District of Columbia,0,0.0,0,0.0
9,1/22/20,Florida,0,0.0,0,0.0


In [80]:
covid_df.shape

(8772, 7)

In [81]:
test.shape

(8772, 6)