In [71]:
#import dependencies
import pandas as pd
from datetime import datetime

In [72]:
#read the tsa throughput csv
tsa_data = "Resources/tsa_data.csv"
tsa = pd.read_csv(tsa_data)

#parse the MM/DD/YYYY date text into datetime values
tsa["Date"] = pd.to_datetime(tsa["Date"].astype(str), format="%m/%d/%Y")

#throughput columns are read as text containing commas:
#strip the commas, then cast each column to float
tsa["Total Traveler Throughput 2020"] = (
    tsa["Total Traveler Throughput 2020"].str.replace(',', '').astype(float)
)
tsa["Total Traveler Throughput 2019"] = (
    tsa["Total Traveler Throughput 2019"].str.replace(',', '').astype(float)
)
tsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Date                            276 non-null    datetime64[ns]
 1   Total Traveler Throughput 2020  276 non-null    float64       
 2   Total Traveler Throughput 2019  276 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 6.6 KB


In [73]:
#pull out the date and 2020 throughput columns for the merge with NYT data
passenger_numbers_2020 = tsa.loc[
    :, ["Date", "Total Traveler Throughput 2020"]
]
passenger_numbers_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 2 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Date                            276 non-null    datetime64[ns]
 1   Total Traveler Throughput 2020  276 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.4 KB


In [74]:
#encode each date as a YYYYMMDD number stored as float
#.assign builds a new frame, so passenger_numbers_2020 keeps its datetime
#"Date" column instead of being overwritten through a shared reference
numbered_tsa = passenger_numbers_2020.assign(
    Date=passenger_numbers_2020["Date"].dt.strftime("%Y%m%d").astype(float)
)
#limited dates to match with those from nyt dataset
numbered_tsa = numbered_tsa.loc[numbered_tsa["Date"] < 20200729]
numbered_tsa

Unnamed: 0,Date,Total Traveler Throughput 2020
0,20200301.0,2280522.0
1,20200302.0,2089641.0
2,20200303.0,1736393.0
3,20200304.0,1877401.0
4,20200305.0,2130015.0
...,...,...
145,20200724.0,724770.0
146,20200725.0,649027.0
147,20200726.0,751205.0
148,20200727.0,700043.0


In [75]:
#read the NYT state-level covid csv into a dataframe
nyt_data = "Resources/covid_19_state_level_data.csv"
nyt = pd.read_csv(nyt_data)
    

In [76]:
#keep only the date, state and cases columns; "date" is renamed to "Date"
curated_nyt = (
    nyt.loc[:, ["date", "state", "cases"]]
    .rename(columns={"date": "Date"})
)
curated_nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    8154 non-null   object
 1   state   8154 non-null   object
 2   cases   8154 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 191.2+ KB


In [77]:
#change date into float (dashes removed, so YYYY-MM-DD becomes YYYYMMDD)
#.assign returns a new frame and leaves curated_nyt["Date"] as strings,
#so this cell can be re-run: the .str accessor only works on string values
numbered_nyt = curated_nyt.assign(
    Date=curated_nyt["Date"].str.replace('-', '').astype(float)
)
numbered_nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    8154 non-null   float64
 1   state   8154 non-null   object 
 2   cases   8154 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 191.2+ KB


In [78]:
#set up nyt dates to match tsa dates: keep rows dated 2020-03-01 or later
#.copy() makes this an independent frame, so adding columns to it later
#does not raise pandas' SettingWithCopyWarning
tsa_numbered_nyt = numbered_nyt.loc[numbered_nyt["Date"] >= 20200301].copy()
tsa_numbered_nyt.head()

Unnamed: 0,Date,state,cases
240,20200301.0,Arizona,1
241,20200301.0,California,33
242,20200301.0,Florida,2
243,20200301.0,Illinois,3
244,20200301.0,Massachusetts,1


In [69]:
#group by date and sum cases over every state, stored as a new column
#.assign rebinds to a new frame rather than writing into a filtered slice,
#which avoids pandas' SettingWithCopyWarning
tsa_numbered_nyt = tsa_numbered_nyt.assign(
    **{"Total Cases": tsa_numbered_nyt.groupby("Date")["cases"].transform("sum")}
)
#make a new dataframe of just date and total cases and drop duplicates
#sort by date so .diff() below always compares consecutive dates
nyt_nationwide = (
    tsa_numbered_nyt.loc[:, ["Date", "Total Cases"]]
    .drop_duplicates()
    .sort_values("Date")
)
#change in total cases from the previous date (first row has no previous, so NaN)
nyt_nationwide["Case Rate of Change"] = nyt_nationwide["Total Cases"].diff()
nyt_nationwide

Unnamed: 0,Date,Total Cases,Case Rate of Change
240,20200301.0,88,
253,20200302.0,104,16.0
268,20200303.0,125,21.0
284,20200304.0,161,36.0
301,20200305.0,228,67.0
...,...,...,...
7879,20200724.0,4123651,73525.0
7934,20200725.0,4190422,66771.0
7989,20200726.0,4244634,54212.0
8044,20200727.0,4303813,59179.0


In [49]:
#read the airport traffic csv and preview the first five rows
airport_data = "Resources/covid_impact_on_airport_traffic.csv"
airport = pd.read_csv(airport_data)
airport.head(5)

Unnamed: 0,AggregationMethod,Date,Version,AirportName,PercentOfBaseline,Centroid,City,State,ISO_3166_2,Country,Geography
0,Daily,2020-07-05,1.0,Kingsford Smith,52,POINT(151.180087713813 -33.9459774986125),Sydney,New South Wales,AU,Australia,"POLYGON((151.164354085922 -33.9301772341877, 1..."
1,Daily,2020-05-28,1.0,Kingsford Smith,61,POINT(151.180087713813 -33.9459774986125),Sydney,New South Wales,AU,Australia,"POLYGON((151.164354085922 -33.9301772341877, 1..."
2,Daily,2020-05-07,1.0,Kingsford Smith,62,POINT(151.180087713813 -33.9459774986125),Sydney,New South Wales,AU,Australia,"POLYGON((151.164354085922 -33.9301772341877, 1..."
3,Daily,2020-06-24,1.0,Kingsford Smith,58,POINT(151.180087713813 -33.9459774986125),Sydney,New South Wales,AU,Australia,"POLYGON((151.164354085922 -33.9301772341877, 1..."
4,Daily,2020-08-05,1.0,Kingsford Smith,20,POINT(151.180087713813 -33.9459774986125),Sydney,New South Wales,AU,Australia,"POLYGON((151.164354085922 -33.9301772341877, 1..."


In [50]:
#keep only the rows whose Country is "United States of America (the)"
us_airport_data = airport.loc[
    airport["Country"].eq("United States of America (the)")
]
us_airport_data.head(5)

Unnamed: 0,AggregationMethod,Date,Version,AirportName,PercentOfBaseline,Centroid,City,State,ISO_3166_2,Country,Geography
2294,Daily,2020-05-06,1.0,Los Angeles International,66,POINT(-118.404993180627 33.941369379328),Los Angeles,California,US-CA,United States of America (the),"POLYGON((-118.439612388611 33.9517616366508, -..."
2295,Daily,2020-07-04,1.0,Los Angeles International,36,POINT(-118.404993180627 33.941369379328),Los Angeles,California,US-CA,United States of America (the),"POLYGON((-118.439612388611 33.9517616366508, -..."
2296,Daily,2020-09-24,1.0,Los Angeles International,78,POINT(-118.404993180627 33.941369379328),Los Angeles,California,US-CA,United States of America (the),"POLYGON((-118.439612388611 33.9517616366508, -..."
2297,Daily,2020-10-05,1.0,Los Angeles International,78,POINT(-118.404993180627 33.941369379328),Los Angeles,California,US-CA,United States of America (the),"POLYGON((-118.439612388611 33.9517616366508, -..."
2298,Daily,2020-07-30,1.0,Los Angeles International,70,POINT(-118.404993180627 33.941369379328),Los Angeles,California,US-CA,United States of America (the),"POLYGON((-118.439612388611 33.9517616366508, -..."


In [51]:
#narrow the US rows to the date, airport name, state and percent-of-baseline columns
curated_airport_data = us_airport_data.loc[
    :, ["Date", "AirportName", "State", "PercentOfBaseline"]
]
curated_airport_data.head(5)

Unnamed: 0,Date,AirportName,State,PercentOfBaseline
2294,2020-05-06,Los Angeles International,California,66
2295,2020-07-04,Los Angeles International,California,36
2296,2020-09-24,Los Angeles International,California,78
2297,2020-10-05,Los Angeles International,California,78
2298,2020-07-30,Los Angeles International,California,70
