In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

In [2]:
# settings to display all columns
pd.set_option("display.max_columns", None)

# Linear Regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [3]:
X_train = pd.read_csv('data/X_train.csv')

In [4]:
y_train = pd.read_csv('data/y_train.csv')

In [5]:
X_test = pd.read_csv('data/X_test.csv')

In [6]:
y_test = pd.read_csv('data/y_test.csv')

In [7]:
X_train.sample(3)

Unnamed: 0,mkt_op_carrier_difference,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,dow_Friday,dow_Monday,dow_Saturday,dow_Sunday,dow_Thursday,dow_Tuesday,arr_afternoon,arr_dawn,arr_evening,arr_morning,arr_noon,dep_afternoon,dep_dawn,dep_evening,dep_morning,dep_noon,aircraft_4,aircraft_6,muc_AS,muc_B6,muc_DL,muc_F9,muc_G4,muc_HA,muc_NK,muc_UA,muc_WN,Passengers_Seat_Ratio,distance,Taxi_Holdup,crs_elapsed_time,origin_0,origin_1,origin_2,origin_3,origin_4,origin_5,origin_6,origin_7,origin_8,origin_9,dest_0,dest_1,dest_2,dest_3,dest_4,dest_5,dest_6,dest_7,dest_8,dest_9
5169,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.876284,-0.322342,0.966817,-0.299532,1.0,-4.0,3.0,3.0,-1.0,2.0,-1.0,0.0,-1.0,0.0,2.0,-2.0,5.0,2.0,-1.0,1.0,1.0,0.0,-1.0,-2.0
138151,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1.181458,-0.648454,-0.598372,-0.597251,2.0,-3.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0,-1.0,4.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0
64086,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.626744,-0.648454,-0.740662,-0.469657,2.0,-1.0,1.0,0.0,-1.0,0.0,0.0,-2.0,0.0,1.0,1.0,-3.0,0.0,1.0,-3.0,-1.0,1.0,0.0,0.0,-2.0


In [8]:
# Initialise the model
lr = LinearRegression()

In [9]:
# Fit the model to the data
lr.fit(X_train, y_train)

LinearRegression()

In [10]:
#Check the training score:
lr.score(X_train, y_train) # this just checks the score (this is the R^2 value)

0.07160622149967055

In [11]:
lr.predict(X_train)[:10] # this is not necessary as this just does the prediction using the X_train features
                         # this is the delay in minutes for the first 10 rows 

array([[  0.96224762],
       [  1.15368418],
       [ 17.15211787],
       [-13.51510529],
       [ -1.73210537],
       [  2.21922824],
       [ -6.99713447],
       [ -3.45490444],
       [ -2.19353626],
       [ 17.17051145]])

In [12]:
lr_coeffs = lr.coef_
len(lr_coeffs)
lr_coeffs

array([[-4.39991365e+00, -1.52395058e+01, -1.78072020e+01,
        -2.51306239e+01, -3.25606848e+01, -1.37053622e+01,
        -1.15544603e+01, -7.70041713e-01, -8.99248843e-01,
        -1.95131539e+00, -1.00798580e+00,  1.04542839e+00,
        -3.69176348e+00,  2.59689232e-01, -2.12919042e+00,
         2.09262725e+00, -3.56481180e+00, -2.38034525e+00,
         3.07507496e+00, -3.43431080e+00,  3.05555990e+00,
        -4.01771247e+00,  4.55321815e-01,  2.10326707e+00,
         2.38129810e+00, -3.53828512e+00,  6.63038059e+00,
        -5.66996539e+00,  5.17717633e+00,  9.92744479e+00,
         2.37937645e+01,  1.92457405e+00,  4.64200033e+00,
        -1.87381154e+00,  8.93586202e-03,  5.80452825e+00,
         1.12442491e+01, -8.36100938e+00, -3.65738901e-01,
        -2.48138432e-01, -8.50309593e-01, -1.27464081e+00,
         3.59531571e-01,  6.65794139e-01, -1.72362457e-01,
        -1.42689000e+00, -2.52520315e+00,  2.57776758e-01,
        -3.96567353e-01,  1.12085024e-01, -1.98688808e-0

In [13]:
lr_intercept = lr.intercept_
lr_intercept

array([24.66454457])

In [14]:
words_coeffs_df = pd.DataFrame(data = lr_coeffs.T, index = X_train.columns, columns=['Coefficients'])
# words_coeffs_df
words_coeffs_df.sort_values(by=['Coefficients'], ascending = True )

Unnamed: 0,Coefficients
Type_Precipitation,-32.560685
Type_Hail,-25.130624
Type_Fog,-17.807202
Type_Cold,-15.239506
Type_Rain,-13.705362
Type_Snow,-11.55446
crs_elapsed_time,-8.361009
muc_DL,-5.669965
mkt_op_carrier_difference,-4.399914
dep_morning,-4.017712


In [15]:
#Check the test score:
lr.score(X_test, y_test) # this just checks the score (this is the R^2 value)

0.06915289644243572

# Another approach: Using sklearn cross-validation

In [16]:
from sklearn.model_selection import cross_val_score
# Using cross validation:
cv_score = cross_val_score(lr, X_train, y_train, cv=5, scoring = 'r2' )
cv_score

array([0.06629493, 0.06691632, 0.07556299, 0.0727952 , 0.07334473])

In [17]:
print(f"Average cross-validation score = {np.mean(cv_score):.2f}")

Average cross-validation score = 0.07


# Another approach: Trying Ridge

In [18]:
# Initialise the model
ridge = Ridge()

In [19]:
# Fit the model to the data
ridge.fit(X_train, y_train)

Ridge()

In [20]:
#Check the training score:
ridge.score(X_train, y_train) # this just checks the score (this is the R^2 value)

0.07160597961222881

In [21]:
#Check the test score:
ridge.score(X_test, y_test) # this just checks the score (this is the R^2 value)

0.06915452126682442

In [22]:
cv_score_ridge = cross_val_score(ridge, X_train, y_train, cv=5, scoring = 'r2' )
cv_score_ridge

array([0.06628611, 0.06692409, 0.07556877, 0.07279764, 0.07335027])

In [23]:
print(f"Average Ridge cross-validation score = {np.mean(cv_score_ridge):.2f}")

Average Ridge cross-validation score = 0.07


In [24]:
ridge_coeffs = ridge.coef_
ridge_coeffs

array([[-4.39885314e+00, -1.47853234e+01, -1.73568059e+01,
        -2.45968678e+01, -2.94303730e+01, -1.32554410e+01,
        -1.11039869e+01, -7.69755440e-01, -8.99419695e-01,
        -1.95141701e+00, -1.00720886e+00,  1.04746121e+00,
        -3.69099073e+00,  2.61150800e-01, -2.12672871e+00,
         2.09370271e+00, -3.56365962e+00, -2.37882076e+00,
         3.07547660e+00, -3.43265286e+00,  3.05652754e+00,
        -4.01659965e+00,  4.55901432e-01,  2.09837914e+00,
         2.37740869e+00, -3.53986260e+00,  6.62663040e+00,
        -5.67091979e+00,  5.17325583e+00,  9.91980489e+00,
         2.37094766e+01,  1.92205625e+00,  4.64181351e+00,
        -1.87482760e+00,  8.91178266e-03,  5.80548761e+00,
         1.12439212e+01, -8.36169302e+00, -3.65760696e-01,
        -2.47905247e-01, -8.49919885e-01, -1.27445568e+00,
         3.60202386e-01,  6.67293097e-01, -1.73347686e-01,
        -1.42699620e+00, -2.52459485e+00,  2.58122526e-01,
        -3.96859204e-01,  1.12328814e-01, -1.98421021e-0

In [25]:
ridge_intercept = ridge.intercept_
ridge_intercept

array([24.21660992])

In [26]:
words_coeffs_df = pd.DataFrame(data = ridge.coef_.T, index = X_train.columns, columns=['Coefficients'])
# words_coeffs_df
words_coeffs_df.sort_values(by=['Coefficients'], ascending = True )

Unnamed: 0,Coefficients
Type_Precipitation,-29.430373
Type_Hail,-24.596868
Type_Fog,-17.356806
Type_Cold,-14.785323
Type_Rain,-13.255441
Type_Snow,-11.103987
crs_elapsed_time,-8.361693
muc_DL,-5.67092
mkt_op_carrier_difference,-4.398853
dep_morning,-4.0166


# Another approach: Trying Lasso

In [27]:
# Initialise the model
lasso = Lasso()

In [28]:
# Fit the model to the data
lasso.fit(X_train, y_train)

Lasso()

In [29]:
#Check the training score:
lasso.score(X_train, y_train) # this just checks the score (this is the R^2 value)

0.05709468378693927

In [30]:
#Check the test score:
lasso.score(X_test, y_test) # this just checks the score (this is the R^2 value)

0.055762320545167166

In [31]:
cv_score_lasso = cross_val_score(lasso, X_train, y_train, cv=5, scoring = 'r2' )
cv_score_lasso

array([0.05316048, 0.05385274, 0.06067891, 0.05799994, 0.05889642])

In [32]:
print(f"Average Lasso cross-validation score = {np.mean(cv_score_lasso):.2f}")

Average Lasso cross-validation score = 0.06


In [33]:
lasso_coeffs = lasso.coef_
lasso_coeffs

array([-0.20482718, -0.        , -0.        , -0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
        0.65426352, -0.        , -0.        ,  0.        , -0.        ,
        0.        , -3.97554744, -0.        , -0.        ,  0.        ,
       -0.        ,  0.        , -0.7002003 ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.6880172 , -0.        , -0.        ,
       -0.        , 10.56734609, -1.59204587, -0.        , -0.13071666,
       -1.16854133, -0.        ,  0.        ,  0.        , -0.        ,
       -0.62647854, -0.        ,  0.28364105, -0.        , -0.        ,
       -0.150307  ,  0.        ,  0.        ,  0.        ,  0.        ,
       -2.04274934, -0.        , -0.        ])

In [34]:
lasso_intercept = lasso.intercept_
lasso_intercept

array([10.20746376])

In [35]:
words_coeffs_df = pd.DataFrame(data = lasso.coef_.T, index = X_train.columns, columns=['Coefficients'])
words_coeffs_df.sort_values(by=['Coefficients'], ascending = True )

Unnamed: 0,Coefficients
dep_morning,-3.975547
dest_7,-2.042749
crs_elapsed_time,-1.592046
origin_2,-1.168541
muc_DL,-0.7002
origin_7,-0.626479
mkt_op_carrier_difference,-0.204827
dest_2,-0.150307
origin_1,-0.130717
dest_1,-0.0


# Logistic Regression

In [51]:
y_train_binary = pd.read_csv('data/y_train_binary.csv')

In [68]:
y_train_binary_list = y_train_binary['delayed'].tolist()

In [69]:
y_train_binary_list

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 Fa

In [52]:
y_test_binary = pd.read_csv('data/y_test_binary.csv')

In [66]:
y_test_binary_list = y_test_binary['delayed'].tolist()

In [67]:
y_test_binary_list

[True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 Fal

In [53]:
logr = LogisticRegression()

In [70]:
logr.fit(X_train, y_train_binary_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [72]:
#Check the training score:
logr.score(X_train, y_train_binary_list) # this just checks the score (this is the R^2 value)

0.7068922371399806

In [73]:
#Check the test score:
logr.score(X_test, y_test_binary_list) # this just checks the score (this is the R^2 value)

0.7072226824002281

In [75]:
cv_score_ridge = cross_val_score(logr, X_train, y_train_binary_list, cv=5)
cv_score_ridge

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.70745192, 0.70511552, 0.7075598 , 0.70508642, 0.70723971])

In [76]:
print(f"Average logr cross-validation score = {np.mean(cv_score_ridge):.2f}")

Average logr cross-validation score = 0.71


In [77]:
print("Model weights: %s"%(logr.coef_)) # these are weights
print("Model intercept: %s"%(logr.intercept_)) # this is the bias term
data = {'features': X_train.columns, 'coefficients':logr.coef_[0]}
pd.DataFrame(data)
pd.DataFrame(data).sort_values(by=['coefficients'], axis=0, ascending=False)


Model weights: [[-0.14623825 -0.07594227 -0.13655551 -0.51969922 -0.0316483   0.08220359
   0.21389849  0.08883621 -0.10089635 -0.12879455  0.02189345  0.08951321
  -0.17131239  0.02944341 -0.4243148   0.13456212 -0.121905   -0.04151162
   0.21623174 -0.37602817  0.19297586 -0.23521293  0.06721118  0.12862472
  -0.21904232 -0.10691371  0.06762026 -0.59285812  0.18882991  0.56331418
   0.91239672 -0.11536557 -0.11730118 -0.02004501  0.08974413  0.49885448
   0.79319426 -0.60359893  0.00141838 -0.00214308 -0.03453092 -0.0997319
   0.00882042  0.03767703  0.01292938 -0.05440517 -0.04568615 -0.00737659
  -0.02804813  0.0014243   0.01053376  0.01375962  0.04818163 -0.00328016
   0.00708906 -0.10434953 -0.13514416 -0.03022869]]
Model intercept: [-0.09499891]


Unnamed: 0,features,coefficients
30,muc_HA,0.912397
36,Taxi_Holdup,0.793194
29,muc_G4,0.563314
35,distance,0.498854
18,dep_afternoon,0.216232
6,Type_Snow,0.213898
20,dep_evening,0.192976
28,muc_F9,0.18883
15,arr_evening,0.134562
23,aircraft_4,0.128625
