In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('table_for_model.csv')

In [5]:
df

Unnamed: 0,origin_city_name,arr_delay,month,carrier_name,avg_daily_delay_for_month,sum_delay_for_month,monthly_passenger_carrier,monthly_passenger_airport
0,"Chicago, IL",36.0,1,Southwest Airlines,0.659208,383.0,345355.0,158843.000000
1,"Chicago, IL",41.0,1,Southwest Airlines,0.659208,383.0,345355.0,158843.000000
2,"Chicago, IL",9.0,1,Southwest Airlines,0.659208,383.0,345355.0,158843.000000
3,"Chicago, IL",-5.0,1,Southwest Airlines,0.659208,383.0,345355.0,158843.000000
4,"Chicago, IL",14.0,1,Southwest Airlines,0.659208,383.0,345355.0,158843.000000
...,...,...,...,...,...,...,...,...
30371,"Salisbury, MD",-7.0,7,Piedmont Airlines,-4.250000,-119.0,908.0,114726.459319
30372,"Salisbury, MD",8.0,7,Piedmont Airlines,-4.250000,-119.0,908.0,114726.459319
30373,"Lynchburg, VA",81.0,7,Piedmont Airlines,-4.250000,-119.0,908.0,114726.459319
30374,"Pullman, WA",3.0,7,Horizon Airlines,-7.689655,-223.0,2055.0,114726.459319


In [6]:
X_num = df.loc[:, ['month', 'avg_daily_delay_for_month', 'sum_delay_for_month',
              'monthly_passenger_carrier', 'monthly_passenger_airport']]
y = df.arr_delay

In [8]:
print(X_num.shape)
print(y.shape)

(30376, 5)
(30376,)


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size = 0.3, random_state=5)

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost

In [13]:
import xgboost as xgb

In [16]:
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_by_tree=0.3,
                         learning_rate=0.1, max_depth=5, alpha=10,
                         n_estimators=10)

In [17]:
xg_reg.fit(X_train_scaled, y_train)

preds = xg_reg.predict(X_test_scaled)

Parameters: { colsample_by_tree } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [19]:
y_test

13962      7.0
26642    -11.0
3727     -27.0
15808    199.0
13379    -10.0
         ...  
7777     -21.0
19220      2.0
15778    -20.0
19150    112.0
20380     -9.0
Name: arr_delay, Length: 9113, dtype: float64

In [18]:
preds

array([ 8.971254  ,  0.21211979,  7.497592  , ..., -0.5841128 ,
       20.863867  , -1.2256587 ], dtype=float32)

In [20]:
from sklearn.metrics import mean_squared_error

In [22]:
mse = mean_squared_error(y_test, preds)
print('MSE: %f' %(mse))

MSE: 2274.558173


In [33]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print('RMSE: %f' %(rmse))

RMSE: 47.692328


In [23]:
from sklearn.metrics import r2_score

In [25]:
r2_score_xgb = r2_score(y_test, preds)
print(r2_score_xgb)

0.02109804548747818


# Trying gridsearch

In [37]:
from sklearn.model_selection import GridSearchCV

In [41]:
param_grid = {
    'objective': ['reg:squarederror'],
    'colsample_by_tree': [0.3, 0.4, 0.5, 0.6, 0.7],
    'learning_rate': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'max_depth': [3, 4, 5, 6, 7, 10, 15],
    'alpha': [1, 2, 3, 5, 7, 10],
    'n_estimators': [5, 10, 15, 20, 30, 50]
}

xg_reg_op = xgb.XGBRegressor()

In [42]:
grid = GridSearchCV(estimator=xg_reg_op, param_grid=param_grid, scoring='r2', cv=10, verbose=1, n_jobs=-1)

In [43]:
grid.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 8820 candidates, totalling 88200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 44.7min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

Parameters: { colsample_by_tree } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'alpha': [1, 2, 3, 5, 7, 10],
     

In [44]:
print(grid.best_score_)
print(grid.best_params_)

0.03361075720252189
{'alpha': 3, 'colsample_by_tree': 0.3, 'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 5, 'objective': 'reg:squarederror'}


In [45]:
xgb_pred = grid.predict(X_test_scaled)

In [47]:
mse2 = mean_squared_error(y_test, xgb_pred)
print('MSE: %f' %(mse2))

MSE: 2274.736668


# Logistic regression

In [48]:
from sklearn.linear_model import LogisticRegression

In [51]:
params_dict1 = {'penalty' : ['l1','l2'], 'C' : [0.001,0.01,0.1,1,10]}
logres = LogisticRegression()

grid2 = GridSearchCV(estimator=logres, param_grid=params_dict1, cv=10, scoring='r2', verbose=1, n_jobs=-1)

In [53]:
grid2.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.0min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'penalty': ['l1', 'l2']},
             scoring='r2', verbose=1)

In [55]:
print(grid2.best_score_)
print(grid2.best_params_)

-0.0952523238438798
{'C': 0.1, 'penalty': 'l2'}


In [59]:
log_pred = grid2.predict(X_test_scaled)

In [60]:
mse3 = mean_squared_error(y_test, log_pred)
print('MSE: %f' %(mse3))

MSE: 2552.226270


In [61]:
models = ['LogisticRegression', 'XGBoost', 'Ridge', 'Elasticnet', 'Lasso' ]
MSE = [2552.226270, 2274.558173, 2265.8796081436567, 2264.373329246114, 2263.8919093898785,  ]

In [65]:
df_results = pd.DataFrame(models)

In [67]:
df_results

Unnamed: 0,0
0,LogisticRegression
1,XGBoost
2,Ridge
3,Elasticnet
4,Lasso


In [68]:
df_results['MSE'] = MSE

In [73]:
type(df_results.columns[0])

int

In [77]:
df_results = df_results.rename(columns={0:'Models'})

In [78]:
df_results

Unnamed: 0,Models,MSE
0,LogisticRegression,2552.22627
1,XGBoost,2274.558173
2,Ridge,2265.879608
3,Elasticnet,2264.373329
4,Lasso,2263.891909
