In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import aws_psw

## Load and Clean Data

In [2]:
# Create the SQLAlchemy engine for the Flight_delays PostgreSQL database on AWS RDS.
# The password is percent-encoded before it is placed in the URL: a raw password
# containing characters such as '@', ':', '/' or '%' would otherwise be read as
# URL delimiters and the connection string would be parsed incorrectly.
db_string = (
    f"postgresql://postgres:{quote(str(aws_psw), safe='')}"
    "@capstone.c9x4gosspizq.us-east-2.rds.amazonaws.com:5432/Flight_delays"
)
engine = create_engine(db_string)

In [3]:
# Read the twelve monthly flight CSVs from the S3 bucket and stack them row-wise
# into a single DataFrame (row labels from each file are kept as-is).
months = ['sept', 'oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'april', 'may', 'june', 'july', 'aug']
data_ls = [
    pd.read_csv(f'https://ellenbrafferty-bucket.s3.amazonaws.com/capstone_data/flights_{month}.csv')
    for month in months
]

year_fl_df = pd.concat(data_ls, axis=0)

In [4]:
# Clean the combined flight data:
#   1. shorten three column names,
#   2. replace every NaN with 0,
#   3. drop any column whose name contains 'Unnamed',
#   4. lowercase all column names.
flight_renames = {
    'OP_CARRIER_FL_NUM': 'FLIGHT_NUM',
    'CRS_DEP_TIME': 'DEP_TIME',
    'CRS_ARR_TIME': 'ARR_TIME',
}
year_fl_df = year_fl_df.rename(columns=flight_renames).replace(np.nan, 0)

unnamed_cols = [name for name in year_fl_df.columns if 'Unnamed' in name]
year_fl_df = year_fl_df.drop(columns=unnamed_cols)
year_fl_df.columns = year_fl_df.columns.str.lower()

# Independent copy that is later written to the database
final_flights = year_fl_df.copy()

In [5]:
# Airport lookup table: fetch the CSV, drop any column whose name contains
# 'Unnamed', rename the ORIGIN_* columns to generic airport names, remove
# duplicate rows (renumbering the index), and lowercase the column names.
airports_raw = pd.read_csv('https://ellenbrafferty-bucket.s3.amazonaws.com/capstone_data/airport_ids.csv')
unnamed_cols = [name for name in airports_raw.columns if 'Unnamed' in name]
airport_renames = {
    'ORIGIN_AIRPORT_ID': 'AIRPORT_ID',
    'ORIGIN': "AIRPORT_CODE",
    'ORIGIN_CITY_NAME': 'CITY_NAME',
}
airports_df = (
    airports_raw
    .rename(columns=airport_renames)
    .drop(columns=unnamed_cols)
    .drop_duplicates(ignore_index=True)
)
airports_df.columns = airports_df.columns.str.lower()

In [6]:
# Weekday lookup table: fetch the CSV, rename 'Description' to 'Weekday',
# drop any column whose name contains 'Unnamed', and lowercase the column names.
weekdays_raw = pd.read_csv('https://ellenbrafferty-bucket.s3.amazonaws.com/capstone_data/weekday_ids.csv')
unnamed_cols = [name for name in weekdays_raw.columns if 'Unnamed' in name]
weekdays_df = (
    weekdays_raw
    .rename(columns={'Description': 'Weekday'})
    .drop(columns=unnamed_cols)
)
weekdays_df.columns = weekdays_df.columns.str.lower()

## Load Tables to Database

In [7]:
# Write the weekday lookup to the 'weekdays' table, replacing the table if it
# already exists; the DataFrame index is not stored.
weekdays_df.to_sql(
    name='weekdays',
    con=engine,
    if_exists='replace',
    index=False,
)

In [8]:
# Write the airport lookup to the 'airports' table, replacing the table if it
# already exists; the DataFrame index is not stored.
airports_df.to_sql(
    name='airports',
    con=engine,
    if_exists='replace',
    index=False,
)

In [9]:
# Write the cleaned flight data to the 'flight_delays' table, replacing the
# table if it already exists; the DataFrame index is not stored.
# This frame holds twelve monthly files' worth of rows, so the insert is split
# into batches via `chunksize` rather than sending every row in one batch
# (the pandas default when chunksize is omitted).
final_flights.to_sql(
    name='flight_delays',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10_000,
)