In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBClassifier

In [2]:
pd.options.display.max_rows = 60
pd.options.display.max_columns = 60

In [3]:
X1 = pd.read_pickle("pickle/X1_cxl.pick")
y1 = pd.read_pickle("pickle/y1_cxl.pick")

X2 = pd.read_pickle("pickle/X2_cxl.pick")
y2 = pd.read_pickle("pickle/y2_cxl.pick")

In [4]:
X1_train, X1_test, y1_train, y1_test = \
    train_test_split(X1, y1, test_size=0.33, random_state=42)

In [35]:
def regularization(X, y):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=42)
    
    # standardize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.values)
    X_val_scaled = scaler.transform(X_val)
    
    # logistic regression
    lr = LogisticRegression()
    lr_model = lr.fit(X_train_scaled, y_train)
    lr_score = lr_model.score(X_val_scaled, y_val)
    
    # ridge
    ridge = RidgeCV(cv=5)
    ridge_model = ridge.fit(X_train_scaled, y_train)
    ridge_score = ridge_model.score(X_val_scaled, y_val)
    
    # lasso
    lasso = LassoCV(cv=5)
    lasso_model = lasso.fit(X_train_scaled, y_train)
    lasso_score = lasso_model.score(X_val_scaled, y_val)
    
    print('\nSIMPLE LR Validation R^2 score was:', lr_score)
    print('Feature coefficient results: \n')
    print('Test score: ', lr_model.score(X_test, y_test))
#     for feature, coef in zip(X.columns, lr_model.coef_):
#         print(feature, ':', f'{coef:.2f}') 
    
    print('\nRIDGE Validation R^2 score was:', ridge_score)
    print('Feature coefficient results: \n')
#     for feature, coef in zip(X.columns, ridge_model.coef_):
#         print(feature, ':', f'{coef:.2f}') 
        
    print('\nLASSO Validation R^2 score was:', lasso_score)
    print('Feature coefficient results: \n')
#     for feature, coef in zip(X.columns, lasso_model.coef_):
#         print(feature, ':', f'{coef:.2f}') 
    

regularization(X1, y1)
    
    


SIMPLE LR Validation R^2 score was: 0.8419870194707938
Feature coefficient results: 

Test score:  0.28145282076884676

RIDGE Validation R^2 score was: 0.3933095833969261
Feature coefficient results: 


LASSO Validation R^2 score was: 0.3931621181422571
Feature coefficient results: 



In [21]:
ridge_model.coef_

array([ 9.02521250e-04,  9.47910601e-03,  1.16310146e-02, -2.15190856e-03,
        5.67002202e-04,  1.18390726e-02,  8.89351717e-03,  1.99966559e-02,
       -1.70511005e-02, -7.56827667e-02,  2.19541580e-03,  1.11225345e-02,
       -8.92711868e-03, -4.22106222e-02, -2.59027499e-03, -2.54101301e-01,
       -1.18398124e-01,  4.49754725e-02, -4.55006012e-02,  1.24241392e-02,
        1.02807376e-01,  2.01452649e-02,  6.22241402e-01, -9.42689098e-02,
       -1.06907089e-01, -2.57769056e-02, -1.94655483e-01,  3.49373561e-02,
       -6.24131215e-02, -1.56878613e-02, -5.31974272e-02,  2.55923601e-01,
       -7.25718163e-02,  7.21377405e-02, -5.69524506e-02, -7.82270747e-03,
       -8.66831248e-02, -9.83553561e-02, -3.11125667e-02,  7.49101535e-04,
        9.85525231e-03,  1.48323269e-02,  4.03952489e-01,  1.60350060e-02,
       -1.11796578e-01, -2.78586094e-02, -2.42647671e-02])

In [25]:
regularization(X2, y2)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## XGBoost (Hotel 1): Hyperparameter Tuning with Grid Search

In [48]:
estimator = XGBClassifier(objective='binary:logistic',
                          use_label_encoder=False,
                          eval_metric='logloss',
                          random_state=42,
)

params = {
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': range(2,12),
    'n_estimators': range(100,1000, 100),
    'colsample_bytree': np.arange(0.1, 1, 0.1),
    'min_child_weight': range(1, 12)}

grid_search_1 = GridSearchCV(
    estimator = estimator,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)

In [49]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2,random_state=42)

In [50]:
grid_xgb_fit_1 = grid_search_1.fit(X1_train, y1_train)
print("The best parameters are: \n", grid_xgb_fit_1.best_params_)

Fitting 5 folds for each of 26730 candidates, totalling 133650 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

KeyboardInterrupt: 

## XGBoost (Hotel 2): Hyperparameter Tuning with Grid Search

In [None]:
estimator = XGBClassifier(objective='binary:logistic',
                          use_label_encoder=False,
                          eval_metric='logloss',
                          random_state=42,
)

params = {
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': range(2,12),
    'n_estimators': range(100,1000, 100),
    'colsample_bytree': np.arange(0.1, 1, 0.1),
    'min_child_weight': range(1, 12)}

grid_search_2 = GridSearchCV(
    estimator = estimator,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)

In [44]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2,random_state=42)

In [None]:
grid_xgb_fit_2 = grid_search_2.fit(X2_train, y2_train)
print("The best parameters are: \n", grid_xgb_fit_2.best_params_)