In [2]:
#import dependencies
import pandas as pd
from datetime import datetime
import datetime as dt
import matplotlib.pyplot as plt

In [3]:
#call up tsa data
tsa_data="Resources/tsa_data.csv"
tsa = pd.read_csv(tsa_data)
#change date formatting for consistency
tsa['Date']=pd.to_datetime(tsa['Date'].astype(str), format='%m/%d/%Y')
#changed numbers into floats
tsa["Total Traveler Throughput 2020"] = tsa["Total Traveler Throughput 2020"].str.replace(',','')
tsa["Total Traveler Throughput 2020"] = tsa["Total Traveler Throughput 2020"].astype(float)
tsa["Total Traveler Throughput 2019"] = tsa["Total Traveler Throughput 2019"].str.replace(',','')
tsa["Total Traveler Throughput 2019"] = tsa["Total Traveler Throughput 2019"].astype(float)
#reduce variance by using rolling mean for traveler numbers
tsa['Average 2019'] = tsa.iloc[:,2].rolling(window=7).mean()
tsa['Average 2020'] = tsa.iloc[:,1].rolling(window=7).mean()


In [26]:
tsa.head(10)

Unnamed: 0,Date,Total Traveler Throughput 2020,Total Traveler Throughput 2019,Average 2019,Average 2020
0,2020-03-01,2280522.0,2301439.0,,
1,2020-03-02,2089641.0,2257920.0,,
2,2020-03-03,1736393.0,1979558.0,,
3,2020-03-04,1877401.0,2143619.0,,
4,2020-03-05,2130015.0,2402692.0,,
5,2020-03-06,2198517.0,2543689.0,,
6,2020-03-07,1844811.0,2156262.0,2255026.0,2022471.0
7,2020-03-08,2119867.0,2485430.0,2281310.0,1999521.0
8,2020-03-09,1909363.0,2378673.0,2298560.0,1973767.0
9,2020-03-10,1617220.0,2122898.0,2319038.0,1956742.0


In [5]:
#prepare tsa file for merge with NYT data
passenger_numbers_2020 = tsa.loc[:,["Date","Average 2020"]]

In [6]:
#change date to number string
numbered_tsa = passenger_numbers_2020
numbered_tsa["Date"]=numbered_tsa["Date"].astype(str)
numbered_tsa['Date']=numbered_tsa['Date'].str.replace('-','')
numbered_tsa['Date']=numbered_tsa['Date'].astype(float)
#limited dates to match with those from nyt dataset
numbered_tsa = numbered_tsa.loc[numbered_tsa["Date"]<20200729]

In [7]:
#load and setup NYT data into dataframe
nyt_data="Resources/covid_19_state_level_data.csv"
nyt = pd.read_csv(nyt_data) 

In [8]:
#edit nyt data to only include relevant info
curated_nyt = nyt.loc[:,["date","state","cases"]]
curated_nyt = curated_nyt.rename(columns={"date":"Date"})

In [9]:
#change date into float
numbered_nyt = curated_nyt
numbered_nyt["DateFormat"]=numbered_nyt["Date"]
numbered_nyt["DateFormat"]=numbered_nyt["DateFormat"].apply(lambda x:
                                             dt.datetime.strptime(x,'%Y-%m-%d'))
numbered_nyt['Date']=numbered_nyt['Date'].str.replace('-','')
numbered_nyt['Date']=numbered_nyt['Date'].astype(float)

In [10]:
#set up nyt dates to match tsa dates
tsa_numbered_nyt = numbered_nyt.loc[numbered_nyt["Date"]>20200230]

In [11]:
#group by state to get total case number for comparison and make new column of sums
tsa_numbered_nyt["Total Cases"] = tsa_numbered_nyt.groupby(["Date"])["cases"].transform("sum")
#make a new dataframe of just date and total cases and drop duplicates
nyt_nationwide = tsa_numbered_nyt.loc[:,["Date","DateFormat","Total Cases"]]
nyt_nationwide = nyt_nationwide.drop_duplicates()
#Make a new column of rate of change in total cases using .diff
nyt_nationwide["Case Rate of Change"]= nyt_nationwide["Total Cases"].diff()
nyt_nationwide['Averaged Rate of Change'] = nyt_nationwide.iloc[:,3].rolling(window=7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
nyt_nationwide.head(10)

Unnamed: 0,Date,DateFormat,Total Cases,Case Rate of Change,Averaged Rate of Change
240,20200301.0,2020-03-01,88,,
253,20200302.0,2020-03-02,104,16.0,
268,20200303.0,2020-03-03,125,21.0,
284,20200304.0,2020-03-04,161,36.0,
301,20200305.0,2020-03-05,228,67.0,
322,20200306.0,2020-03-06,311,83.0,
350,20200307.0,2020-03-07,428,117.0,
383,20200308.0,2020-03-08,547,119.0,65.571429
418,20200309.0,2020-03-09,748,201.0,92.0
455,20200310.0,2020-03-10,1018,270.0,127.571429


In [12]:
combined_total_data = pd.merge(numbered_tsa,nyt_nationwide, how="left",on="Date")
correlation_total_data = combined_total_data.loc[:,["Average 2020","Averaged Rate of Change"]]
for i in range(-30,31):
    correlation_total_data[i]=correlation_total_data["Averaged Rate of Change"].shift(i)

In [27]:
combined_total_data.head(10)

Unnamed: 0,Date,Average 2020,DateFormat,Total Cases,Case Rate of Change,Averaged Rate of Change
0,20200301.0,,2020-03-01,88,,
1,20200302.0,,2020-03-02,104,16.0,
2,20200303.0,,2020-03-03,125,21.0,
3,20200304.0,,2020-03-04,161,36.0,
4,20200305.0,,2020-03-05,228,67.0,
5,20200306.0,,2020-03-06,311,83.0,
6,20200307.0,2022471.0,2020-03-07,428,117.0,
7,20200308.0,1999521.0,2020-03-08,547,119.0,65.571429
8,20200309.0,1973767.0,2020-03-09,748,201.0,92.0
9,20200310.0,1956742.0,2020-03-10,1018,270.0,127.571429


In [28]:
correlation_total_data.head(10)

Unnamed: 0,Average 2020,Averaged Rate of Change,-30,-29,-28,-27,-26,-25,-24,-23,...,21,22,23,24,25,26,27,28,29,30
0,,,19217.571429,17207.142857,15630.428571,14205.428571,12126.857143,10453.857143,8603.142857,6861.714286,...,,,,,,,,,,
1,,,20974.142857,19217.571429,17207.142857,15630.428571,14205.428571,12126.857143,10453.857143,8603.142857,...,,,,,,,,,,
2,,,22791.142857,20974.142857,19217.571429,17207.142857,15630.428571,14205.428571,12126.857143,10453.857143,...,,,,,,,,,,
3,,,24932.285714,22791.142857,20974.142857,19217.571429,17207.142857,15630.428571,14205.428571,12126.857143,...,,,,,,,,,,
4,,,26937.0,24932.285714,22791.142857,20974.142857,19217.571429,17207.142857,15630.428571,14205.428571,...,,,,,,,,,,
5,,,27950.714286,26937.0,24932.285714,22791.142857,20974.142857,19217.571429,17207.142857,15630.428571,...,,,,,,,,,,
6,2022471.0,,29300.285714,27950.714286,26937.0,24932.285714,22791.142857,20974.142857,19217.571429,17207.142857,...,,,,,,,,,,
7,1999521.0,65.571429,30133.285714,29300.285714,27950.714286,26937.0,24932.285714,22791.142857,20974.142857,19217.571429,...,,,,,,,,,,
8,1973767.0,92.0,30831.857143,30133.285714,29300.285714,27950.714286,26937.0,24932.285714,22791.142857,20974.142857,...,,,,,,,,,,
9,1956742.0,127.571429,31543.571429,30831.857143,30133.285714,29300.285714,27950.714286,26937.0,24932.285714,22791.142857,...,,,,,,,,,,


In [13]:
#make individual state dataframes for Georgia, California, Massachusetts, and Texas
#also create Case rate of change columns for all states
California = curated_nyt.loc[curated_nyt["state"]=="California"]
California["Case Rate of Change"]= California["cases"].diff()
California['Averaged Rate of Change'] = California.iloc[:,4].rolling(window=7).mean()
Georgia = curated_nyt.loc[curated_nyt["state"]=="Georgia"]
Georgia["Case Rate of Change"]= Georgia["cases"].diff()
Georgia['Averaged Rate of Change'] = Georgia.iloc[:,4].rolling(window=7).mean()
Massachusetts = curated_nyt.loc[curated_nyt["state"]=="Massachusetts"]
Massachusetts["Case Rate of Change"]= Massachusetts["cases"].diff()
Massachusetts['Averaged Rate of Change'] = Massachusetts.iloc[:,4].rolling(window=7).mean()
Texas = curated_nyt.loc[curated_nyt["state"]=="Texas"]
Texas["Case Rate of Change"]= Texas["cases"].diff()
Texas['Averaged Rate of Change'] = Texas.iloc[:,4].rolling(window=7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [14]:
#rate of change in case numbers in states is not a good measurement as there are different population sizes and densities
#balance this out with daily case rate of change and total cases proportionate to the population
TX_population = 29087070
CA_population = 39747267
MA_population = 6976600
GA_population = 10736100
Texas["Case Percent"] = Texas["cases"].div(TX_population)
Texas["Case Percent"] = round(Texas["Case Percent"].multiply(100),2)
Texas["Percent Rate of Change"] = Texas["Averaged Rate of Change"].div(TX_population)
Texas["Percent Rate of Change"] = round(Texas["Percent Rate of Change"].multiply(100),4)
California["Case Percent"] = California["cases"].div(CA_population)
California["Case Percent"] = round(California["Case Percent"].multiply(100),2)
California["Percent Rate of Change"] = California["Averaged Rate of Change"].div(CA_population)
California["Percent Rate of Change"] = round(California["Percent Rate of Change"].multiply(100),4)
Massachusetts["Case Percent"] = Massachusetts["cases"].div(MA_population)
Massachusetts["Case Percent"] = round(Massachusetts["Case Percent"].multiply(100),2)
Massachusetts["Percent Rate of Change"] = Massachusetts["Averaged Rate of Change"].div(MA_population)
Massachusetts["Percent Rate of Change"] = round(Massachusetts["Percent Rate of Change"].multiply(100),4)
Georgia["Case Percent"] = Georgia["cases"].div(GA_population)
Georgia["Case Percent"] = round(Georgia["Case Percent"].multiply(100),2)
Georgia["Percent Rate of Change"] = Georgia["Averaged Rate of Change"].div(GA_population)
Georgia["Percent Rate of Change"] = round(Georgia["Percent Rate of Change"].multiply(100),4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

In [30]:
California.head(10)

Unnamed: 0,Date,state,cases,DateFormat,Case Rate of Change,Averaged Rate of Change,Case Percent,Percent Rate of Change
5,20200125.0,California,1,2020-01-25,,,0.0,
9,20200126.0,California,2,2020-01-26,1.0,,0.0,
13,20200127.0,California,2,2020-01-27,0.0,,0.0,
17,20200128.0,California,2,2020-01-28,0.0,,0.0,
21,20200129.0,California,2,2020-01-29,0.0,,0.0,
25,20200130.0,California,2,2020-01-30,0.0,,0.0,
29,20200131.0,California,3,2020-01-31,1.0,,0.0,
33,20200201.0,California,3,2020-02-01,0.0,0.285714,0.0,0.0
38,20200202.0,California,6,2020-02-02,3.0,0.571429,0.0,0.0
43,20200203.0,California,6,2020-02-03,0.0,0.571429,0.0,0.0


In [15]:
airport_data = "Resources/covid_impact_on_airport_traffic.csv"
airport = pd.read_csv(airport_data)

In [16]:
#remove not relevant columns from airport data
us_airport_data = airport[airport["Country"]=="United States of America (the)"]

In [17]:
#continue removing not relevant columns
curated_airport_data = us_airport_data.loc[:,["Date","AirportName","State","PercentOfBaseline"]]

In [18]:
#create new column for date with datetime format
curated_airport_data['DateFormat']=curated_airport_data['Date']
curated_airport_data["DateFormat"]=curated_airport_data["DateFormat"].apply(lambda x:
                                             dt.datetime.strptime(x,'%Y-%m-%d'))
#set date column as number
curated_airport_data['Date']=curated_airport_data['Date'].str.replace('-','')
curated_airport_data['Date']=curated_airport_data['Date'].astype(float)

In [19]:
curated_airport_data=curated_airport_data.sort_values(by=["Date"])

In [20]:
#test airport df
LAX = curated_airport_data.loc[curated_airport_data["AirportName"]=="Los Angeles International"]
LAX['Averaged POB'] = LAX.iloc[:,3].rolling(window=7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
#create rest of airport dfs
SFO = curated_airport_data.loc[curated_airport_data["AirportName"]=="San Francisco International"]
SFO['Averaged POB'] = SFO.iloc[:,3].rolling(window=7).mean()
BOS =  curated_airport_data.loc[curated_airport_data["State"]=="Massachusetts"]
BOS['Averaged POB'] = BOS.iloc[:,3].rolling(window=7).mean()
ATL =  curated_airport_data.loc[curated_airport_data["State"]=="Georgia"]
ATL['Averaged POB'] = ATL.iloc[:,3].rolling(window=7).mean()
DFW =  curated_airport_data.loc[curated_airport_data["State"]=="Texas"]
DFW['Averaged POB'] = DFW.iloc[:,3].rolling(window=7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = v

In [31]:
LAX.head(10)

Unnamed: 0,Date,AirportName,State,PercentOfBaseline,DateFormat,Averaged POB
2405,20200316.0,Los Angeles International,California,98,2020-03-16,
2416,20200317.0,Los Angeles International,California,99,2020-03-17,
2497,20200318.0,Los Angeles International,California,89,2020-03-18,
2441,20200319.0,Los Angeles International,California,88,2020-03-19,
2329,20200320.0,Los Angeles International,California,84,2020-03-20,
2305,20200321.0,Los Angeles International,California,76,2020-03-21,
2310,20200322.0,Los Angeles International,California,76,2020-03-22,87.142857
2446,20200323.0,Los Angeles International,California,73,2020-03-23,83.571429
2365,20200324.0,Los Angeles International,California,71,2020-03-24,79.571429
2460,20200325.0,Los Angeles International,California,72,2020-03-25,77.142857


In [22]:
#test with 1 airport and city for combined data set
LAXCA = pd.merge(California,LAX,how="inner",on="Date")

In [23]:
#finish rest of airport city merges
SFOCA = pd.merge(California,SFO,how="inner",on="Date")
BOSMA = pd.merge(Massachusetts,BOS,how="inner",on="Date")
ATLGA = pd.merge(Georgia,ATL,how="inner",on="Date")
DFWTX = pd.merge(Texas,DFW,how="inner",on="Date")

In [33]:
LAXCA.head(10)

Unnamed: 0,Date,state,cases,DateFormat_x,Case Rate of Change,Averaged Rate of Change,Case Percent,Percent Rate of Change,AirportName,State,PercentOfBaseline,DateFormat_y,Averaged POB
0,20200316.0,California,588,2020-03-16,110.0,59.428571,0.0,0.0001,Los Angeles International,California,98,2020-03-16,
1,20200317.0,California,732,2020-03-17,144.0,79.0,0.0,0.0002,Los Angeles International,California,99,2020-03-17,
2,20200318.0,California,893,2020-03-18,161.0,98.714286,0.0,0.0002,Los Angeles International,California,89,2020-03-18,
3,20200319.0,California,1067,2020-03-19,174.0,116.428571,0.0,0.0003,Los Angeles International,California,88,2020-03-19,
4,20200320.0,California,1283,2020-03-20,216.0,137.571429,0.0,0.0003,Los Angeles International,California,84,2020-03-20,
5,20200321.0,California,1544,2020-03-21,261.0,166.142857,0.0,0.0004,Los Angeles International,California,76,2020-03-21,
6,20200322.0,California,1851,2020-03-22,307.0,196.142857,0.0,0.0005,Los Angeles International,California,76,2020-03-22,87.142857
7,20200323.0,California,2240,2020-03-23,389.0,236.0,0.01,0.0006,Los Angeles International,California,73,2020-03-23,83.571429
8,20200324.0,California,2644,2020-03-24,404.0,273.142857,0.01,0.0007,Los Angeles International,California,71,2020-03-24,79.571429
9,20200325.0,California,3183,2020-03-25,539.0,327.142857,0.01,0.0008,Los Angeles International,California,72,2020-03-25,77.142857
