In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [2]:
pd.options.display.max_rows = 60
pd.options.display.max_columns = 60

In [3]:
X1 = pd.read_pickle("pickle/X1_cxl.pick")
y1 = pd.read_pickle("pickle/y1_cxl.pick")

X2 = pd.read_pickle("pickle/X2_cxl.pick")
y2 = pd.read_pickle("pickle/y2_cxl.pick")

In [4]:
X1_train, X1_test, y1_train, y1_test = \
    train_test_split(X1, y1, test_size=0.33, random_state=42)

In [5]:
# ss1 = StandardScaler()
# ss1.fit(X1_train)

In [7]:
lr1 = LinearRegression()
lr1.fit(X1_train, y1_train)
lr1.score(X1_test, y1_test)

0.3797535358429306

In [9]:
lr1.coef_

array([ 9.07060624e-04,  9.73004655e-03,  1.22944625e-02, -2.56441597e-03,
        5.66397414e-04,  9.96054581e-03,  1.16810423e-02,  2.08767912e-02,
       -2.25972877e-02, -7.50059823e-02,  2.49971047e-03,  1.15591423e-02,
       -9.05943183e-03, -4.23910188e-02, -2.53869017e-03, -2.53887127e-01,
       -1.19540237e-01,  4.26854133e-02, -5.08878458e-02,  1.66730507e-02,
        1.01589388e-01,  1.65289417e-02,  6.31056975e-01, -8.13692671e-02,
       -1.02807720e-01, -1.37259740e-02, -1.91446233e-01,  3.63256343e-02,
       -6.50399924e-02, -1.42997292e-02, -1.10557793e-01,  2.55979717e-01,
       -7.54237486e-02,  7.46165616e-02, -5.38788953e-02, -8.10565462e-03,
       -9.26346650e-02, -9.84505755e-02, -3.72808776e-02,  9.78232515e-03,
        1.22096359e-02,  1.31861772e-02,  3.97317202e-01,  1.73048574e-02,
       -1.11601749e-01, -2.72972861e-02, -3.42091163e-02])

In [24]:
def regularization(X, y):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=42)
    
    # standardize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.values)
    X_val_scaled = scaler.transform(X_val)
    
    # linear regression
    lr = LinearRegression()
    lr_model = lr.fit(X_train_scaled, y_train)
    lr_score = lr_model.score(X_val_scaled, y_val)
    
    # ridge
    ridge = RidgeCV(cv=5)
    ridge_model = ridge.fit(X_train_scaled, y_train)
    ridge_score = ridge_model.score(X_val_scaled, y_val)
    
    # lasso
    lasso = LassoCV(cv=5)
    lasso_model = lasso.fit(X_train_scaled, y_train)
    lasso_score = lasso_model.score(X_val_scaled, y_val)
    
    print('\nSIMPLE LR Validation R^2 score was:', lr_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}') 
    
    print('\nRIDGE Validation R^2 score was:', ridge_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, ridge_model.coef_):
        print(feature, ':', f'{coef:.2f}') 
        
    print('\nLASSO Validation R^2 score was:', lasso_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lasso_model.coef_):
        print(feature, ':', f'{coef:.2f}') 
    

regularization(X1, y1)
    
    


SIMPLE LR Validation R^2 score was: -383157604.8321402
Feature coefficient results: 

LeadTime : 0.09
LOS : 39514101215.38
StaysInWeekendNights : -13388905855.39
StaysInWeekNights : -28763767665.95
ADR : 0.03
NumPeople : 84185478334.28
Adults : -69164981654.69
Children : -41688257865.90
Babies : -11202168493.07
TotalOfSpecialRequests : -0.06
PreviousBookings : -174545998008.86
PreviousCancellations : 137343113400.66
PreviousBookingsNotCanceled : 104337409243.47
BookingChanges : -0.03
DaysInWaitingList : -0.02
RequiredCarParkingSpaces : -0.09
IsRepeatedGuest : -0.02
AgencyBooking : 0.02
CompanyListed : -0.01
CT_is_grp : 0.00
CT_is_trn : 0.04
CT_is_trnP : 0.01
RS_No-Show : 0.05
MS_Corporate : -0.02
MS_Direct : -0.04
MS_Groups : -0.01
MS_Offline TA/TO : -0.07
MS_Online TA : 0.02
DC_Direct : -0.03
DC_TA/TO : -0.01
DC_Undefined : 786223.48
FROM_PRT : 37580611613.35
FROM_GBR : 28657847021.93
FROM_ESP : 22536915421.59
FROM_IRL : 17346072264.62
FROM_FRA : 14548358817.09
FROM_DEU : 12552744329

In [21]:
ridge_model.coef_

array([ 9.02521250e-04,  9.47910601e-03,  1.16310146e-02, -2.15190856e-03,
        5.67002202e-04,  1.18390726e-02,  8.89351717e-03,  1.99966559e-02,
       -1.70511005e-02, -7.56827667e-02,  2.19541580e-03,  1.11225345e-02,
       -8.92711868e-03, -4.22106222e-02, -2.59027499e-03, -2.54101301e-01,
       -1.18398124e-01,  4.49754725e-02, -4.55006012e-02,  1.24241392e-02,
        1.02807376e-01,  2.01452649e-02,  6.22241402e-01, -9.42689098e-02,
       -1.06907089e-01, -2.57769056e-02, -1.94655483e-01,  3.49373561e-02,
       -6.24131215e-02, -1.56878613e-02, -5.31974272e-02,  2.55923601e-01,
       -7.25718163e-02,  7.21377405e-02, -5.69524506e-02, -7.82270747e-03,
       -8.66831248e-02, -9.83553561e-02, -3.11125667e-02,  7.49101535e-04,
        9.85525231e-03,  1.48323269e-02,  4.03952489e-01,  1.60350060e-02,
       -1.11796578e-01, -2.78586094e-02, -2.42647671e-02])

In [25]:
regularization(X2, y2)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').