In [17]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import aws_psw

In [2]:
# Connection settings for the PostgreSQL database that holds the flight tables.
DB_USER = "postgres"
DB_HOST = "capstone.c9x4gosspizq.us-east-2.rds.amazonaws.com"
DB_PORT = 5432
DB_NAME = "Flight_delays"

# Percent-encode the password before embedding it in the URL: characters such
# as "@", "/", ":" or "%" would otherwise be parsed as URL delimiters and the
# connection string would be malformed. safe="" forces "/" to be encoded too.
encoded_psw = quote(aws_psw, safe="")

db_string = f"postgresql://{DB_USER}:{encoded_psw}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(db_string)

In [36]:
# Pull the three source tables into DataFrames through the shared engine.
delays_df = pd.read_sql(sql="SELECT * FROM flight_delays", con=engine)
weekdays_df = pd.read_sql(sql="SELECT * FROM weekdays", con=engine)
airports_df = pd.read_sql(sql="SELECT * FROM airports", con=engine)

In [37]:
# Lookup tables built from the reference frames:
#   airports_map: airport_id -> {'air_code': <airport_code>, 'city': <city_name>}
#   weekdays_map: code -> weekday
#
# The columns are zipped directly instead of using DataFrame.iterrows(), which
# builds a Series per row (slow, and it can coerce values to a common dtype).
# dict.setdefault only inserts when the key is absent, so if a key appears on
# several rows the first row wins.
airports_map = {}
for airport_id, code, city_name in zip(
    airports_df['airport_id'],
    airports_df['airport_code'],
    airports_df['city_name'],
):
    airports_map.setdefault(airport_id, {'air_code': code, 'city': city_name})

weekdays_map = {}
for code, day in zip(weekdays_df['code'], weekdays_df['weekday']):
    weekdays_map.setdefault(code, day)

In [39]:
# Columns removed from the flight table before the feature/target split.
# NOTE(review): presumably these are only known after a flight has landed and
# would leak the outcome into the features - confirm against the schema.
DROPPED_COLUMNS = [
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
    'distance',
    'air_time',
]

# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell on a live kernel does not raise KeyError.
delays_df = delays_df.drop(columns=DROPPED_COLUMNS, errors='ignore')

In [40]:
# Features are every remaining column except the target; the target is the
# 'arr_del15' column.
X = delays_df.drop(columns=['arr_del15'])
y = delays_df['arr_del15']

In [41]:
# Split into train and test partitions using the library's default
# proportions; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [42]:
# Fit the scaler on the training partition only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [43]:
# Random forest with 128 trees and a fixed seed, trained on the scaled
# training features. The fit call stays last so the cell displays the model.
forest_params = {"n_estimators": 128, "random_state": 78}
rfc = RandomForestClassifier(**forest_params)
rfc.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=128, random_state=78)

In [44]:
# Predicted labels for the scaled test partition.
pred = rfc.predict(X=X_test_scaled)

In [45]:
# Side-by-side table of predicted and actual labels, with the test rows
# renumbered from 0 instead of keeping their original index.
prediction_vs_actual = pd.DataFrame({"Prediction": pred, "Actual": y_test})
results = prediction_vs_actual.reset_index(drop=True)

In [46]:
# Accuracy of the test-set predictions; the bare name displays the value.
acc_score = accuracy_score(y_true=y_test, y_pred=pred)
acc_score

0.8506288141566656

In [47]:
# Model score on the training partition, then on the test partition, so the
# two numbers can be compared for overfitting.
train_score = rfc.score(X_train_scaled, y_train)
test_score = rfc.score(X_test_scaled, y_test)
print(train_score)
print(test_score)

0.8958388079081199
0.8506288141566656


In [48]:
# Pair each importance with its feature column name and list them from most
# to least important.
importances = rfc.feature_importances_
importance_by_feature = zip(rfc.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.33625060655033034, 'day_of_week'),
 (0.21517710181264801, 'arr_time'),
 (0.17057283565813242, 'dep_time'),
 (0.1510045765856965, 'flight_num'),
 (0.06354035601485791, 'origin_airport_id'),
 (0.0634545233783349, 'dest_airport_id')]