In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.model_selection import train_test_split

In [21]:
df = pd.read_csv('df_filtered.csv')

In [28]:
my_list = ['WN', 'DL', 'MQ', 'YV', 'PT', 'UA', 'OO', 'EV', 'G7', 'YX', '9E',
       'G4', 'AA', 'CP', 'QX', 'NK', 'AX', 'HA', 'B6', 'OH', 'AS', 'F9',
       'ZW', 'C5', 'EM', 'KS', 'VX']

my_dict = {'carrier_delay':[],
           'weather_delay':[],
           'nas_delay':[],
           'security_delay':[],
           'late_aircraft_delay':[],
           'dep_delay': []
           }

for i in my_list:
    df1 = df.loc[:, ['op_unique_carrier','dep_delay','carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']]
    df2 = df1[df1['op_unique_carrier'] == i].mean()
    my_dict['dep_delay'].append(df2[0])
    my_dict['carrier_delay'].append(df2[1])
    my_dict['weather_delay'].append(df2[2])
    my_dict['nas_delay'].append(df2[3])
    my_dict['security_delay'].append(df2[4])
    my_dict['late_aircraft_delay'].append(df2[5])
    
my_dict

{'carrier_delay': [3.727365467992859,
  3.8649281176863926,
  1.7425742574257426,
  4.975285171102661,
  2.802919708029197,
  3.761437908496732,
  4.858203916272789,
  10.87032967032967,
  8.091286307053942,
  3.2358490566037736,
  3.2622739018087854,
  9.703703703703704,
  4.564059339177343,
  3.3344947735191637,
  1.847457627118644,
  3.289568345323741,
  6.3875,
  2.65625,
  5.428360413589365,
  3.723849372384937,
  3.430513595166163,
  7.009685230024213,
  1.6338028169014085,
  4.96039603960396,
  nan,
  nan,
  0.0],
 'weather_delay': [0.19459321601632237,
  0.6084921430959546,
  0.7564356435643564,
  0.10836501901140684,
  0.42700729927007297,
  0.6606753812636166,
  1.1201890614449697,
  0.18021978021978022,
  0.966804979253112,
  0.6446540880503144,
  0.3733850129198966,
  0.0,
  0.5906945380984491,
  0.11498257839721254,
  0.19322033898305085,
  0.5755395683453237,
  4.129166666666666,
  0.0,
  0.2895125553914328,
  1.1192468619246863,
  0.30966767371601206,
  0.268765133171912

### Take out averages per carrier for different types of delays

In [31]:
carrier_delay = {'WN': 3.727365467992859, 'DL': 3.8649281176863926, 'MQ': 1.7425742574257426, 'YV': 4.975285171102661, 
                'PT': 2.802919708029197, 'UA': 3.761437908496732, 'OO': 4.858203916272789, 'EV': 10.87032967032967, 
                'G7': 8.091286307053942, 'YX': 3.2358490566037736, '9E': 3.2622739018087854, 'G4': 9.703703703703704, 
                'AA': 4.564059339177343, 'CP': 3.3344947735191637, 'QX': 1.847457627118644, 'NK': 3.289568345323741, 
                'AX': 6.3875, 'HA': 2.65625, 'B6': 5.428360413589365, 'OH': 3.723849372384937, 'AS': 3.430513595166163,
                'F9': 7.009685230024213, 'ZW': 3.430513595166163, 'C5': 4.96039603960396, 'EM': 0, 'KS': 0, 'VX': 0.0}



weather_delay = {'WN': 0.19459321601632237, 'DL': 0.6084921430959546, 'MQ': 0.7564356435643564, 'YV': 0.10836501901140684,
               'PT': 0.42700729927007297, 'UA': 0.6606753812636166, '00': 1.1201890614449697, 'EV':0.18021978021978022,
               'G7': 0.966804979253112, 'YX': 0.6446540880503144, '9E': 0.3733850129198966, 'G4': 0.0,
               'AA': 0.5906945380984491, 'CP': 0.11498257839721254, 'QX': 0.19322033898305085, 'NK': 0.5755395683453237,
               'AX': 4.129166666666666, 'HA': 0.0, 'B6': 0.2895125553914328, 'OH': 1.1192468619246863, 'AS': 0.30966767371601206,
               'F9': 0.2687651331719128, 'ZW': 0.176056338028169, 'C5': 4.693069306930693, 'EM': 0, 'KS':0, 'VX':0.0}


nas_delay = {'WN' :1.8834480999744962, 'DL': 2.809762621196924, 'MQ':4.330693069306931, 'YV':2.693916349809886,
             'PT':4.321167883211679, 'UA':3.8649237472766886, '00':2.37272113436867, 'EV':3.024175824175824,
             'G7':1.7593360995850622, 'YX':3.256289308176101, '9E':3.5271317829457365, 'G4':1.4444444444444444,
             'AA':3.4265003371544167, 'CP':1.2299651567944252, 'QX':1.464406779661017, 'NK':7.76978417266187,
             'AX': 3.2708333333333335, 'HA':0.0, 'B6': 5.908419497784343, 'OH': 2.192468619246862, 'AS':4.069486404833837,
             'F9':3.154963680387409, 'ZW': 1.3943661971830985, 'C5':9.554455445544555, 'EM':0, 'KS':0, 'VX':0.75}


security_delay = {'WN': 0.015302218821729151, 'DL': 0.0, 'MQ':0.0, 'YV':0.0,
                  'PT': 0.0, 'UA': 0.009259259259259259, 'OO': 0.0, 'EV':0.0,
                  'G7': 0.0, 'YX': 0.031446540880503145, '9E': 0.0, 'G4': 0.5802469135802469,
                  'AA': 0.01921780175320297, 'CP': 0.0, 'QX': 0.0, 'NK': 0.0,
                  'AX': 0.0, 'HA': 0.0, 'B6': 0.0103397341211226, 'OH':0.0, 'AS':0.1540785498489426,
                  'F9': 0.0, 'ZW':0.0, 'C5':0.0, 'EM':0, 'KS': 0, 'VX':0.0}

late_aircraft_delay = {'WN': 4.490946187197143, 'DL':2.885322634570378, 'MQ':3.6653465346534655, 'YV':8.260456273764259,
                       'PT':5.142335766423358, 'UA':6.215686274509804, 'OO':5.638082376772451, 'EV':5.367032967032967,
                       'G7':5.668049792531121, 'YX':4.897798742138365, '9E':4.640826873385013, 'G4':2.197530864197531,
                       'AA':5.848617666891436, 'CP':3.9686411149825784, 'QX':1.5728813559322035, 'NK':3.264388489208633,
                       'AX':13.533333333333333, 'HA':0.0, 'B6':7.327917282127031, 'OH':6.688284518828452, 'AS':3.8716012084592144,
                       'F9':8.467312348668282, 'ZW':4.47887323943662, 'C5':15.574257425742575, 'EM':0, 'KS':0, 'VX':10.75}


dep_delay = {'WN':11.87554195358327, 'DL':8.658308258107656, 'MQ':4.9504950495049505, 'YV':13.098859315589353,
             'PT': 7.390510948905109, 'UA':11.708605664488017, 'OO':10.525995948683322, 'EV':16.334065934065933,
             'G7': 14.282157676348548, 'YX': 7.570754716981132, '9E':7.760981912144703, 'G4':11.765432098765432,
             'AA': 12.260283209710048, 'CP':5.7282229965156795, 'QX':1.806779661016949, 'NK':12.226618705035971,
             'AX':22.5375, 'HA':-0.875, 'B6': 16.04431314623338, 'OH': 10.828451882845188, 'AS':7.329305135951661,
             'F9': 17.406779661016948, 'ZW':3.2816901408450705, 'C5':28.06930693069307, 'EM':0, 'KS':0, 'VX':9.5}

### Add different types of carrier delays in dataframe

In [34]:
df['avg_carrier_delays'] = 0

for word in carrier_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_carrier_delays']] = carrier_delay[word]

In [36]:
df['avg_weather_delays'] = 0

for word in weather_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_weather_delays']] = weather_delay[word]

In [37]:
df['avg_nas_delays'] = 0

for word in nas_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_nas_delays']] = nas_delay[word]

In [38]:
df['avg_security_delays'] = 0

for word in security_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_security_delays']] = security_delay[word]

In [39]:
df['avg_late_aircraft_delays'] = 0

for word in late_aircraft_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_late_aircraft_delays']] = late_aircraft_delay[word]

In [40]:
df['avg_dep_delays'] = 0

for word in dep_delay.keys():
    df.loc[df['op_unique_carrier'].str.contains(word, na=False), 
                   ['avg_dep_delays']] = dep_delay[word]

### Subset dataset for modeling

In [49]:
df_filtered = df.loc[:, ['origin_city_name', 'arr_delay', 'carrier_name', 'delayed', 'temp_day_list', 'speed_day', 'weather_desc',
                        'day_of_week', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 'avg_security_delays', 
                         'avg_late_aircraft_delays', 'avg_dep_delays']]

### Make X and y

In [393]:
X = df_filtered.loc[:, ['origin_city_name', 'carrier_name', 'delayed', 'temp_day_list', 'speed_day', 'weather_desc',
                        'day_of_week', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 'avg_security_delays', 
                         'avg_late_aircraft_delays', 'avg_dep_delays']]

X.rename(columns={"speed_day": "wind_speed", "temp_day_list": "temp"}, inplace=True)

y = df_filtered.loc[:, ['arr_delay']]

### Split the dataset

In [394]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Separate Continuous and Categorical variables

In [395]:
X_train_continuous = X_train.loc[:, ['temp', 'wind_speed', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 
                                     'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays']]

X_train_categorical = X_train.loc[:, ['origin_city_name', 'carrier_name', 'delayed', 'weather_desc', 'day_of_week']]

### Scale Continuous variables

In [396]:
scaler = StandardScaler()
scaler.fit(X_train_continuous)
scaled_df = scaler.transform(X_train_continuous)

In [397]:
X_train_df = pd.DataFrame(scaled_df)

X_train_df.columns = ['temp', 'wind_speed', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 
                                     'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays']

print(X_train_categorical.isnull().sum())
print('*********************************')
print(X_train_df.isnull().sum())
print('*********************************')
print(X_train_categorical.shape)
print(X_train_df.shape)

origin_city_name    0
carrier_name        0
delayed             0
weather_desc        0
day_of_week         0
dtype: int64
*********************************
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(14401, 5)
(14401, 8)


### Join Continuous and Scaled variables

In [398]:
X_train_categorical.insert(0, 'New_ID', range(0, 0 + len(X_train_categorical)))
X_train_df.insert(0, 'New_ID', range(0, 0 + len(X_train_df)))

In [399]:
print(X_train_categorical.isnull().sum())
print('*********************************')
print(X_train_df.isnull().sum())
print('*********************************')
print(X_train_categorical.shape)
print(X_train_df.shape)

New_ID              0
origin_city_name    0
carrier_name        0
delayed             0
weather_desc        0
day_of_week         0
dtype: int64
*********************************
New_ID                      0
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(14401, 6)
(14401, 9)


In [400]:
df_final = pd.merge(X_train_categorical, X_train_df, how='inner', on='New_ID')

In [401]:
df_final.drop(['New_ID'], axis=1, inplace=True)

In [402]:
print(df_final.isnull().sum())
print('*********************************')
print(df_final.shape)

origin_city_name            0
carrier_name                0
delayed                     0
weather_desc                0
day_of_week                 0
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(14401, 13)


In [186]:
df_final

Unnamed: 0,origin_city_name,carrier_name,delayed,weather_desc,day_of_week,temp,wind_speed,avg_carrier_delays,avg_weather_delays,avg_nas_delays,avg_security_delays,avg_late_aircraft_delays,avg_dep_delays
0,"Las Vegas, NV",Southwest Airlines,1,Cloudy,Thursday,1.492808,1.682555,-0.339468,-0.533787,-0.701801,0.008946,-0.319400,0.272852
1,"Atlanta, GA",Delta Airlines,0,Sunny,Sunday,-0.004823,0.751634,-0.244683,0.215476,-0.095992,-0.340177,-1.176244,-0.683355
2,"Denver, CO",Southwest Airlines,0,Sunny,Friday,-1.408852,-0.644747,-0.339468,-0.533787,-0.701801,0.008946,-0.319400,0.272852
3,"New York, NY",Republic Airways,0,Cloudy,Tuesday,-0.753638,-1.110208,-0.678135,0.280939,0.196036,0.377282,-0.102282,-1.006591
4,"Minneapolis, MN",Delta Airlines,1,Sunny,Wednesday,0.556789,-0.334440,-0.244683,0.215476,-0.095992,-0.340177,-1.176244,-0.683355
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14396,"Baltimore, MD",Spirit Airlines,0,Cloudy,Friday,0.837595,-0.489594,-0.641121,0.155824,3.147859,-0.340177,-0.973955,0.377197
14397,"Washington, DC",Frontier Airlines,1,Sunny,Wednesday,0.275983,0.441327,1.922137,-0.399517,0.129769,-0.340177,1.802595,1.916812
14398,"Boston, MA",American Airlines,1,Sunny,Thursday,0.275983,0.906788,0.237036,0.183258,0.307354,0.098281,0.405124,0.387202
14399,"Newark, NJ",CommutAir,1,Rain,Sunday,1.024798,0.131020,0.510123,7.609611,4.315033,-0.340177,5.595229,5.085863


### One-hot encode categorical variables

In [403]:
X_train = pd.get_dummies(df_final)

In [404]:
print(X_train.isnull().sum())
print('*********************************')
print(X_train.shape)

delayed                  0
temp                     0
wind_speed               0
avg_carrier_delays       0
avg_weather_delays       0
                        ..
day_of_week_Saturday     0
day_of_week_Sunday       0
day_of_week_Thursday     0
day_of_week_Tuesday      0
day_of_week_Wednesday    0
Length: 76, dtype: int64
*********************************
(14401, 76)


### Convert Train set to numpy array

In [405]:
y_train = np.array(y_train)
y_train = y_train.ravel()
X_train = np.array(X_train)

### Separate 'TEST' set for continuous and categorical variables

In [406]:
X_test_continuous = X_test.loc[:, ['temp', 'wind_speed', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 
                                     'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays']]

X_test_categorical = X_test.loc[:, ['origin_city_name', 'carrier_name', 'delayed', 'weather_desc', 'day_of_week']]

### Scale Test set

In [407]:
scaler = StandardScaler()
scaler.fit(X_test_continuous)
scaled_df_test = scaler.transform(X_test_continuous)

In [408]:
X_test_df = pd.DataFrame(scaled_df_test)

X_test_df.columns = ['temp', 'wind_speed', 'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays', 
                                     'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays']

print(X_test_categorical.isnull().sum())
print('*********************************')
print(X_test_df.isnull().sum())
print('*********************************')
print(X_test_categorical.shape)
print(X_test_df.shape)

origin_city_name    0
carrier_name        0
delayed             0
weather_desc        0
day_of_week         0
dtype: int64
*********************************
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(6173, 5)
(6173, 8)


### Combine Coninuous and Categorical variables

In [409]:
X_test_categorical.insert(0, 'New_ID', range(0, 0 + len(X_test_categorical)))
X_test_df.insert(0, 'New_ID', range(0, 0 + len(X_test_df)))

In [410]:
print(X_test_categorical.isnull().sum())
print('*********************************')
print(X_test_df.isnull().sum())
print('*********************************')
print(X_test_categorical.shape)
print(X_test_df.shape)

New_ID              0
origin_city_name    0
carrier_name        0
delayed             0
weather_desc        0
day_of_week         0
dtype: int64
*********************************
New_ID                      0
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(6173, 6)
(6173, 9)


In [411]:
df_test = pd.merge(X_test_categorical, X_test_df, how='inner', on='New_ID')

In [412]:
df_test.drop(['New_ID'], axis=1, inplace=True)

In [413]:
print(df_test.isnull().sum())
print('*********************************')
print(df_test.shape)

origin_city_name            0
carrier_name                0
delayed                     0
weather_desc                0
day_of_week                 0
temp                        0
wind_speed                  0
avg_carrier_delays          0
avg_weather_delays          0
avg_nas_delays              0
avg_security_delays         0
avg_late_aircraft_delays    0
avg_dep_delays              0
dtype: int64
*********************************
(6173, 13)


### One Hot Encoding TEST set

In [414]:
X_test = pd.get_dummies(df_test)

In [415]:
print(X_test.isnull().sum())
print('*********************************')
print(X_test.shape)

delayed                  0
temp                     0
wind_speed               0
avg_carrier_delays       0
avg_weather_delays       0
                        ..
day_of_week_Saturday     0
day_of_week_Sunday       0
day_of_week_Thursday     0
day_of_week_Tuesday      0
day_of_week_Wednesday    0
Length: 76, dtype: int64
*********************************
(6173, 76)


### Convert TEST set to numpy array

In [416]:
y_test = np.array(y_test)
y_test = y_test.ravel()
X_test = np.array(X_test)

### Apply Logistic Regression Model

In [215]:
logres = LogisticRegression(max_iter=1000)

In [218]:
penalty = ['l1', 'l2']

In [240]:
C = np.logspace(0,4,10)

In [241]:
params_dict = {'penalty' : ['l1','l2'], 'C' : np.logspace(0,4,10)}

In [242]:
k_folds = KFold(n_splits=5)

In [246]:
grid = GridSearchCV(estimator=logres, param_grid=params_dict, cv=k_folds, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

In [237]:
y_train = y_train.ravel()
X_train = np.array(X_train)

In [247]:
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 29.6min finished


Logistic Regression with RMSE

In [245]:
print('Best Score: %s' % grid_result.best_score_)
print('Best Hyperparameters: %s' % grid_result.best_params_)

Best Score: -46.8166981813402
Best Hyperparameters: {'C': 1.0, 'penalty': 'l2'}


Logistic Regression with MSE

In [None]:
grid1 = GridSearchCV(estimator=logres, param_grid=params_dict, cv=k_folds, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

In [None]:
grid_result1 = grid1.fit(X_train, y_train)

In [479]:
print('Best Score: %s' % grid_result.best_score_)
print('Best Hyperparameters: %s' % grid_result.best_params_)

Best Score: -2205.633076574492
Best Hyperparameters: {'C': 1.0, 'penalty': 'l2'}


In [None]:
y_pred = grid_result.predict(X_test)
probabilities = grid_result.predict_proba(X_test)

### Logistic Regression with different C parameter

In [249]:
logres = LogisticRegression(max_iter=1000)

In [482]:
params_dict1 = {'penalty' : ['l1','l2'], 'C' : np.logspace(0,4,10)}

In [251]:
k_folds = KFold(n_splits=5)

In [543]:
grid = GridSearchCV(estimator=logres, param_grid=params_dict1, cv=k_folds, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)

In [551]:
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 32.3min finished


In [542]:
grid_result2 = grid.fit(X_train, y_train)

ValueError: 'neg_mean_squared_root_error' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [255]:
print('Best Score: %s' % grid_result2.best_score_)
print('Best Hyperparameters: %s' % grid_result2.best_params_)

Best Score: -2205.633076574492
Best Hyperparameters: {'C': 1.0, 'penalty': 'l2'}


In [552]:
y_pred = grid_result.predict(X_test)
y_pred1 = grid_result2.predict(X_test)

In [553]:
mean_squared_error(y_test, y_pred)

print(r2_score(y_test, y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.025370205256110556
Mean Absolute Error: 17.779847723959175
Mean Squared Error: 2223.604892272801
Root Mean Squared Error: 47.155115229132896


### BEST RESULT FOR LOGISTIC REGRESSION

In [472]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred1)

print(r2_score(y_test, y_pred1))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred1)))

0.0762371797514736
Mean Absolute Error: 17.288514498623037
Mean Squared Error: 2107.5525676332413
Root Mean Squared Error: 45.90808825940415


In [540]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred1)

print(r2_score(y_test, y_pred1))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred1)))

0.025370205256110556
Mean Absolute Error: 17.779847723959175
Mean Squared Error: 2223.604892272801
Root Mean Squared Error: 47.155115229132896


### Logistic Regression with different C parameter

In [None]:
params_dict1 = {'penalty' : ['l1','l2'], 'C' : [0.001,0.01,0.1,1,10]}

In [443]:
grid3 = GridSearchCV(estimator=logres, param_grid=params_dict1, cv=k_folds, scoring='r2', verbose=1, n_jobs=-1)

In [298]:
y_train = np.array(y_train)
y_train = y_train.ravel()
X_train = np.array(X_train)

In [444]:
grid_result3 = grid3.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  5.2min finished


In [446]:
print('Best Score: %s' % grid_result3.best_score_)
print('Best Hyperparameters: %s' % grid_result3.best_params_)

Best Score: 0.06994545332897614
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}


In [447]:
y_pred2 = grid_result3.predict(X_test)

In [459]:
from sklearn.metrics import r2_score

print(r2_score(y_test, y_pred2))

print("Accuracy: ",accuracy_score(y_test,y_pred2))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred2))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred2)))

0.06551976268407589
Accuracy:  0.043576867001457964
Mean Absolute Error: 17.648793131378586
Mean Squared Error: 2132.0042118904908
Root Mean Squared Error: 46.17363113174543


### Random Forest

In [364]:
from sklearn.ensemble import RandomForestClassifier

In [419]:
clf1 = RandomForestClassifier(n_estimators=100)

In [420]:
X_test.shape

(6173, 76)

In [421]:
clf1.fit(X_train,y_train)

RandomForestClassifier()

In [422]:
y_pred_RF = clf1.predict(X_test)

In [460]:
from sklearn import metrics

print(r2_score(y_test, y_pred_RF))

print("Accuracy: ",accuracy_score(y_test,y_pred_RF))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_RF))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_RF))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_RF)))

-0.2820646617859799
Accuracy:  0.033695123926777905
Mean Absolute Error: 21.05912846265997
Mean Squared Error: 2925.013445650413
Root Mean Squared Error: 54.08339343689903


### GridSearch with Random Forest

In [427]:
param_grid_RF = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [428]:
CV_rfc = GridSearchCV(estimator=clf1, param_grid=param_grid_RF, cv= 5)

In [429]:
CV_rfc.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [430]:
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [431]:
pred=CV_rfc.predict(X_test)

In [435]:
from sklearn.metrics import accuracy_score
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

Accuracy for Random Forest on CV data:  0.048760732220962257
Mean Absolute Error: 17.818078729953022
Mean Squared Error: 2163.8896808682975
Root Mean Squared Error: 46.517627635857544


### XGBoost 

In [439]:
import xgboost as xgb


param_grid_XG = {
    'objective': ['reg:squarederror'],
    'colsample_by_tree': [0.3, 0.4, 0.5, 0.6, 0.7],
    'learning_rate': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'max_depth': [3, 4, 5, 6, 7, 10, 15],
    'alpha': [1, 2, 3, 5, 7, 10],
    'n_estimators': [5, 10, 15, 20, 30, 50]
}
xg_reg_op = xgb.XGBRegressor()

In [442]:
grid_XG = GridSearchCV(estimator=xg_reg_op, param_grid=param_grid_XG, scoring='r2', cv=5, verbose=1, n_jobs=-1)

In [453]:
grid_XG.fit(X_train, y_train)

Fitting 5 folds for each of 8820 candidates, totalling 44100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 50.8min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 60.8min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 71.7min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 82.9min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

Parameters: { colsample_by_tree } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'alpha': [1, 2, 3, 5, 7, 10],
     

In [461]:
y_pred_XG = grid_XG.predict(X_test)

### Predict FLIGHTS_TEST SET

In [522]:
X = pd.read_csv('flights_test_submission.csv')

### Convert to an array

In [524]:
X = np.array(X)

In [525]:
pred_XG_final = grid_XG.predict(X)

In [537]:
pred_Final = list(pred_XG_final)

In [532]:
X_final = pd.read_csv('flights_test_data.csv')

In [533]:
X_final.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays',
       'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays',
       'temp', 'wind_speed', 'weather_desc', 'day_of_week'],
      dtype='object')

In [535]:
X_final.drop(['avg_carrier_delays', 'avg_weather_delays', 'avg_nas_delays',
       'avg_security_delays', 'avg_late_aircraft_delays', 'avg_dep_delays',
       'temp', 'wind_speed', 'weather_desc', 'day_of_week'], axis = 1, inplace=True)

In [548]:
X_final['predictions'] = pd.Series(pred_Final, index=X_final.index)

In [550]:
X_final.to_csv('final_predictions.csv', index=False)

In [468]:

print(r2_score(y_test, y_pred_XG))


print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_XG))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_XG))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_XG)))

0.2752239695954344
Mean Absolute Error: 17.14818238312125
Mean Squared Error: 1653.5668575914492
Root Mean Squared Error: 40.66407330299621


### Linear Regression

In [467]:
from sklearn.linear_model import LinearRegression

scaler = StandardScaler()
model = LinearRegression().fit(X_train, y_train)
y_pred_L = model.predict(X_test)

print(r2_score(y_test, y_pred_L))

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_L))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_L))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_L)))

-4.67954824386383e+17
Mean Absolute Error: 23395824527.798347
Mean Squared Error: 1.0676326975430221e+21
Root Mean Squared Error: 32674649157.152737


### Compilation of different results

In [510]:
MSE_dict = {'Models': ['Logistic Regression', 'Random Forest', 'XGBoost'],
            'MSE':[2108, 2163, 1654]}

In [515]:
RMSE_dict = {'models': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'RMSE':[46, 47, 41]}

In [511]:
MSE_df = pd.DataFrame(MSE_dict)

In [512]:
MSE_df

Unnamed: 0,Models,MSE
0,Logistic Regression,2108
1,Random Forest,2163
2,XGBoost,1654


In [516]:
RMSE_df = pd.DataFrame(RMSE_dict)

In [517]:
RMSE_df

Unnamed: 0,models,RMSE
0,Logistic Regression,46
1,Random Forest,47
2,XGBoost,41
