In [219]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine
import psycopg2
from config import aws_psw

In [220]:
# Build the SQLAlchemy connection URL for the Flight_delays Postgres database on RDS.
# The password is taken from the local `config` module (aws_psw) instead of being
# written into the notebook.
# NOTE(review): aws_psw is interpolated verbatim; if it can contain URL-reserved
# characters such as '@', ':' or '/', it must be URL-encoded first - confirm.
db_string = f"postgresql://postgres:{aws_psw}@capstone.c9x4gosspizq.us-east-2.rds.amazonaws.com:5432/Flight_delays"
# create_engine only configures the engine; the queries that use it run in later cells.
engine = create_engine(db_string)

In [221]:
# Read each source table in full into its own DataFrame.
delays_query = '''SELECT * FROM flight_delays'''
weekdays_query = '''SELECT * FROM weekdays'''
airports_query = '''SELECT * FROM airports'''

delays_df = pd.read_sql(sql=delays_query, con=engine)
weekdays_df = pd.read_sql(sql=weekdays_query, con=engine)
airports_df = pd.read_sql(sql=airports_query, con=engine)

In [222]:
# Lookup: lower-cased weekday name -> weekday code.
weekdays_map = {}
for weekday_record in weekdays_df.to_dict('records'):
    weekdays_map[weekday_record['weekday'].lower()] = weekday_record['code']

# Lookup: airport code -> {'airport_id', 'city_name'} for that airport.
airports_map = {}
for airport_record in airports_df.to_dict('records'):
    airports_map[airport_record['airport_code']] = {
        'airport_id': airport_record['airport_id'],
        'city_name': airport_record['city_name'],
    }

In [240]:
# Number of rows for each day_of_week value.
delays_df['day_of_week'].value_counts()

1    129704
5    129047
7    128658
4    127146
2    117617
3    116210
6    114192
Name: day_of_week, dtype: int64

In [223]:
# Number of rows for each distinct dep_time value.
delays_df['dep_time'].value_counts()

0      8599
655    2326
656    2168
657    2106
658    2080
       ... 
320       1
359       1
417       1
253       1
344       1
Name: dep_time, Length: 1368, dtype: int64

In [224]:
# Number of rows for each distinct arr_time value.
delays_df['arr_time'].value_counts()

0       8978
1215    1210
1211    1187
1210    1166
1221    1159
        ... 
316        1
307        1
331        1
335        1
323        1
Name: arr_time, Length: 1441, dtype: int64

In [225]:
# Collapse dep_time and arr_time into six ordered bands, overwriting the columns:
#   1: < 800    2: 800-1099    3: 1100-1399
#   4: 1400-1699    5: 1700-1999    6: >= 2000
# (values are presumably HHMM clock times - TODO confirm against the table schema).
#
# This is done column-wise with np.digitize rather than by assigning into the rows
# yielded by DataFrame.iterrows(): pandas does not guarantee that such assignments
# reach the frame (the row may be a copy), and a Python loop over every row is slow.
#
# The arr_delay_group recode is deliberately NOT performed in this cell. It is done
# once, in its own cell further down; applying that recode twice would turn every
# class-0 row into class 1.
#
# NOTE: this cell overwrites its own inputs, so run it once per fresh load of
# delays_df. Re-running it on already-banded values puts every row in band 1.
TIME_BAND_EDGES = [800, 1100, 1400, 1700, 2000]

for time_col in ('dep_time', 'arr_time'):
    # digitize returns 0..5 for the six half-open intervals; shift to 1..6.
    delays_df[time_col] = np.digitize(delays_df[time_col].to_numpy(), TIME_BAND_EDGES) + 1

In [226]:
# Number of rows for each day_of_week value.
delays_df['day_of_week'].value_counts()

1    129704
5    129047
7    128658
4    127146
2    117617
3    116210
6    114192
Name: day_of_week, dtype: int64

In [227]:
# Number of rows in each dep_time band (1-6) produced by the bucketing cell above.
delays_df['dep_time'].value_counts()

2    184898
3    162536
5    150179
1    146303
4    139113
6     79545
Name: dep_time, dtype: int64

In [228]:
# Number of rows in each arr_time band (1-6) produced by the bucketing cell above.
delays_df['arr_time'].value_counts()

6    176806
3    169435
5    159927
4    152690
2    136797
1     66919
Name: arr_time, dtype: int64

In [229]:
# Number of rows for each arr_delay_group value.
delays_df['arr_delay_group'].value_counts()

-1     354503
-2     246873
 0     151544
 1      45918
 2      20565
 3      11862
 4       7599
 5       5360
 12      5205
 6       3861
 7       2909
 8       2232
 9       1772
 10      1346
 11      1025
Name: arr_delay_group, dtype: int64

In [230]:
# Recode arr_delay_group into three target classes, overwriting the column:
#   0: group < 0
#   1: group is 0 or 1
#   2: group >= 2
#
# Done column-wise with np.digitize instead of assigning into the rows yielded by
# DataFrame.iterrows(): pandas does not guarantee that such assignments reach the
# frame (the row may be a copy), and a Python loop over every row is slow.
#
# NOTE: the mapping is not idempotent - applied to already-recoded values it sends
# class 0 to class 1 - so this must run exactly once per fresh load of delays_df.
DELAY_CLASS_EDGES = [0, 2]
delays_df['arr_delay_group'] = np.digitize(
    delays_df['arr_delay_group'].to_numpy(), DELAY_CLASS_EDGES
)

In [231]:
# Number of rows in each arr_delay_group class after the recode in the previous cell.
delays_df['arr_delay_group'].value_counts()

0    601376
1    197462
2     63736
Name: arr_delay_group, dtype: int64

In [232]:
# Separate the label column from the feature columns.
target_col = 'arr_delay_group'
X = delays_df.drop(columns=target_col)
y = delays_df[target_col]

In [233]:
# Split into train and test partitions using train_test_split's default sizes;
# the fixed random_state keeps the partition repeatable between runs.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [234]:
# Fit the scaler on the training partition only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [235]:
# Random forest of 128 trees with a fixed seed, trained on the scaled training features.
rfc = RandomForestClassifier(random_state=1, n_estimators=128)
rfc.fit(X=X_train_scaled, y=y_train)

RandomForestClassifier(n_estimators=128, random_state=78)

In [236]:
# Predicted arr_delay_group class for every row of the scaled test partition.
pred = rfc.predict(X_test_scaled)

In [237]:
# Put predictions next to the true labels, then discard y_test's original row labels
# in favour of a fresh 0..n-1 index.
results = pd.DataFrame({"Prediction": pred, "Actual": y_test})
results = results.reset_index(drop=True)

In [238]:
# Mean accuracy, printed for the training partition first and the test partition second.
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(rfc.score(features, labels))

0.7347394617655697
0.7110422733764908


In [217]:
# Pair each importance with its feature name and list them from most to least important.
importances = rfc.feature_importances_
ranked_features = zip(importances, X.columns)
sorted(ranked_features, reverse=True)

[(0.3658245619441803, 'dest_airport_id'),
 (0.32721640378518896, 'origin_airport_id'),
 (0.17712036725679062, 'day_of_week'),
 (0.06519511962962768, 'arr_time'),
 (0.06464354738421256, 'dep_time')]

In [218]:
# Score one hand-built flight.
#
# rfc was fitted on X_scaler-transformed features in which dep_time / arr_time had
# already been collapsed to bands 1-6, so a raw row cannot be passed to predict()
# directly: it has to go through the same two preprocessing steps first.
#
# The values are positional and are labelled with X.columns, i.e. they are assumed to
# be listed in the same order as the training columns - TODO confirm the order.
sample_flight = pd.DataFrame([[1, 14831, 14869, 1010, 1302]], columns=X.columns)

# Same band edges as used for the training data: <800 -> 1, ..., >=2000 -> 6.
for time_col in ('dep_time', 'arr_time'):
    sample_flight[time_col] = (
        np.digitize(sample_flight[time_col].to_numpy(), [800, 1100, 1400, 1700, 2000]) + 1
    )

rfc.predict(X_scaler.transform(sample_flight))

array([2])