# Create a model using the lgbm_02_02 hyperparameters

But without using any scaling. Submit this to the competition to check I am comparing apples with apples. 

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb

from pathlib import Path
from lightgbm import LGBMClassifier
from pprint import pprint

from sklearn.metrics import mean_squared_error, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV 
from sklearn.model_selection import cross_val_score, StratifiedKFold



############ USE FOR GOOGLE COLAB ############
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/data')
# SUBMISSIONS_DIR = Path('drive/MyDrive/Work/Delivery/Current/Earthquake_damage/submissions')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/models')

# from google.colab import drive
# drive.mount('/content/drive')
#############################################


### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DATA_DIR = Path('../download')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

# The code runs the same if working on Jupyter or Colab, just need to change the 
# dirs above

X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')

categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]
X[categorical_columns] = X[categorical_columns].astype('category')
X[bool_columns] = X[bool_columns].astype('bool')

X = pd.get_dummies(X)
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [2]:
top_14_features = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 
                    'count_floors_pre_eq', 'age', 'area_percentage', 
                    'height_percentage', 
                    'has_superstructure_mud_mortar_stone',
                    'has_superstructure_stone_flag', 
                    'has_superstructure_mud_mortar_brick',
                    'has_superstructure_cement_mortar_brick',
                    'has_superstructure_timber', 'count_families',
                    'other_floor_type_q']

In [None]:
model = LGBMClassifier(boosting_type='goss',
                       learning_rate=0.2,
                       min_child_samples=40,
                       n_estimators=240,
                       num_leaves=120,
                       random_state=42)

In [4]:
scores = cross_val_score(model, X[top_14_features], np.ravel(y), scoring='f1_micro',
                         n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   49.3s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   49.4s remaining:   33.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.9s finished


In [8]:
# Submission score of 0.7272
scores.mean()

0.7257838562617362

In [7]:
scores.max()

0.7280698388334612

In [15]:
scores

array([0.72705819, 0.72204528, 0.72806984, 0.72622794, 0.72551804])

In [12]:
model.fit(X[top_14_features], np.ravel(y), verbose=10)

LGBMClassifier(boosting_type='goss', learning_rate=0.2, min_child_samples=40,
               n_estimators=240, num_leaves=120, random_state=42)

In [9]:
def make_submission_top_14_features(pipeline, title):
    """
    Given a trained pipeline object, use it to make predictions on the 
    submission test set 'test_values.csv' and write them a csv in the submissions
    folder.
    """
    # Read in test_values csv and apply data preprocessing
    # note: will create a data preprocessing pipeline or function in future
    test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
    test_values[categorical_columns] = test_values[categorical_columns].astype('category')
    test_values[bool_columns] = test_values[bool_columns].astype('bool')
    test_values = pd.get_dummies(test_values)
    test_values = test_values[top_14_features]

    # Generate predictions using pipeline we pass in
    predictions = pipeline.predict(test_values)

    submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv',
                                    index_col='building_id')

    my_submission = pd.DataFrame(data=predictions,
                                columns=submission_format.columns,
                                index=submission_format.index)
    
    my_submission.to_csv(SUBMISSIONS_DIR / f'{title}.csv')

In [13]:
make_submission_top_14_features(model, '02-23 - LightGBM - lgbm_02_02 params - random_state=42')

In [14]:
y_pred = model.predict(X[top_14_features])
f1_score(y, y_pred, average='micro')

0.7879785572580305

## Compare with Pipeline and StandardScaler

In [16]:
lgbm = LGBMClassifier(boosting_type='goss',
                       learning_rate=0.2,
                       min_child_samples=40,
                       n_estimators=240,
                       num_leaves=120,
                       random_state=42)

In [32]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('lgbm', lgbm)])

In [23]:
scores_pipe = cross_val_score(pipe, X[top_14_features], y, scoring='f1_micro',
                              n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   48.5s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   48.6s remaining:   32.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.7s finished


In [24]:
scores_pipe

array([0.72567679, 0.72160399, 0.72874137, 0.72759018, 0.72484651])

In [56]:
scores_pipe.mean()

0.7256917663975804

In [57]:
scores.mean()

0.7257838562617362

In [26]:
scores_pipe.max()

0.7287413660782809

In [33]:
pipe.fit(X[top_14_features], np.ravel(y))

Pipeline(steps=[('scaler', StandardScaler()),
                ('lgbm',
                 LGBMClassifier(boosting_type='goss', learning_rate=0.2,
                                min_child_samples=40, n_estimators=240,
                                num_leaves=120, random_state=42))])

In [53]:
make_submission_top_14_features(pipe, )

array([0.72567679, 0.72160399, 0.72874137, 0.72759018, 0.72484651])

# Load in Best Model

In [27]:
with open(MODEL_DIR / '02-02 GOSS random search LightGBM.pkl', 'rb') as f:
    best_model = pickle.load(f)



In [29]:
best_model.best_score_

0.7334968053159259

# Compare All Three Models

In [37]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)

In [48]:
def run_skf_on_already_trained_models(model, skf, X, y):
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_test = X.iloc[test_index]
        y_test = y[test_index]
        
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
    return scores

In [49]:
# Available models: model, pipe, best_model

In [50]:
run_skf_on_already_trained_models(model, skf, X[top_14_features],
                                 np.ravel(y))

[0.7874752978645843,
 0.7842670759785111,
 0.7903491941673062,
 0.7895241749808136,
 0.7882770529547198]

In [51]:
run_skf_on_already_trained_models(pipe, skf, X[top_14_features],
                                 np.ravel(y))

[0.7906601945473035,
 0.783672294704528,
 0.7909439754412894,
 0.7876247122026093,
 0.7881235610130468]

In [52]:
run_skf_on_already_trained_models(best_model, skf, X[top_14_features],
                                 np.ravel(y))

RuntimeError: The reset parameter is False but there is no n_features_in_ attribute. Is this estimator fitted?