# Creating Validation Set

It will be much faster to train the models if I can create a validation set that represents the test set. I want a validation set where if a model has a better score on it, the model will have a better score on the submissions.

I will aim to create a simple hold-out validation set for maximum speed (this has a high chance of working given that all the folds in cross-validation seem to give the same results anyway)

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb

from pathlib import Path
from lightgbm import LGBMClassifier
from pprint import pprint

from sklearn.metrics import mean_squared_error, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV 
from sklearn.model_selection import cross_val_score, StratifiedKFold



############ USE FOR GOOGLE COLAB ############
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/data')
# SUBMISSIONS_DIR = Path('drive/MyDrive/Work/Delivery/Current/Earthquake_damage/submissions')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/models')

# from google.colab import drive
# drive.mount('/content/drive')
#############################################


### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DATA_DIR = Path('data')
SUBMISSIONS_DIR = Path('submissions')
MODEL_DIR = Path('models')
#######################################

# The code runs the same if working on Jupyter or Colab, just need to change the 
# dirs above

X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')

categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]
X[categorical_columns] = X[categorical_columns].astype('category')
X[bool_columns] = X[bool_columns].astype('bool')

X = pd.get_dummies(X)
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [2]:
sns.set()

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33,
                                                  random_state=42, stratify=y,
                                                  shuffle=True)

In [4]:
y_val.value_counts(normalize=True, sort=False)

damage_grade
1               0.096408
2               0.568914
3               0.334678
dtype: float64

In [5]:
y_train.value_counts(normalize=True, sort=False)

damage_grade
1               0.096408
2               0.568911
3               0.334681
dtype: float64

Very similar distribution of data. Good.

In [6]:
top_14_features = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 
                    'count_floors_pre_eq', 'age'	, 'area_percentage'	, 
                    'height_percentage', 
                    'has_superstructure_mud_mortar_stone',
                    'has_superstructure_stone_flag', 
                    'has_superstructure_mud_mortar_brick',
                    'has_superstructure_cement_mortar_brick',
                    'has_superstructure_timber', 'count_families',
                    'other_floor_type_q']

In [7]:
def calc_f1_score_X_val(model):
    y_pred = model.predict(X_val[top_14_features])
    return f1_score(y_val, y_pred, average='micro')

def calc_f1_score_X(model):
    y_pred = model.predict(X[top_14_features])
    return f1_score(y, y_pred, average='micro')

In [8]:
def init_clfs_dict():
    """
    Quickly initialize untrained versions of the classifiers (and their
    submission scores) in dict form to aid in model evaluation
    """
    LGBM_01_30_SUBMISSION_SCORE = 0.7397
    LGBM_02_02_SUBMISSION_SCORE = 0.7407
    LGBM_02_08_SUBMISSION_SCORE = 0.7335

    lgbm_01_30 = LGBMClassifier(boosting_type='goss', 
                                learning_rate=0.2, 
                                min_child_samples=40, 
                                n_estimators=330, 
                                num_leaves=90,
                               random_state=42)

    lgbm_02_02 = LGBMClassifier(boosting_type='goss', 
                                learning_rate=0.2, 
                                min_child_samples=40, 
                                n_estimators=240, 
                                num_leaves=120,
                                random_state=42)

    lgbm_02_08 = LGBMClassifier(boosting_type='goss',
                               learning_rate=0.1,
                               min_child_samples=70,
                               n_estimators=400,
                               num_leaves=120,
                               random_state=42)
    
    clf_dict = {'lgbm_01_30': [lgbm_01_30, LGBM_01_30_SUBMISSION_SCORE], 
                'lgbm_02_02': [lgbm_02_02, LGBM_02_02_SUBMISSION_SCORE],
                'lgbm_02_08': [lgbm_02_08, LGBM_02_08_SUBMISSION_SCORE]}
    
    return clf_dict

In [9]:
clfs = init_clfs_dict()

print('F1_SCORES - FIT_PREDICT USING X_TRAIN AND X_VAL')

for clf_name, clf_and_submission_score in clfs.items():
    clf = clf_and_submission_score[0]
    submission_score = clf_and_submission_score[1]
    
    clf.fit(X_train[top_14_features], np.ravel(y_train), verbose=10)
    y_pred = clf.predict(X_val[top_14_features])
    val_score = f1_score(y_val, y_pred, average='micro')
    
    print(f'{clf_name} val score (on X_val):'  , val_score)
    print(f'{clf_name} submission score:    ', submission_score, '\n')

F1_SCORES - FIT_PREDICT USING X_TRAIN AND X_VAL
lgbm_01_30 val score (on X_val): 0.7244502843056314
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7247526134024814
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7334736450423841
lgbm_02_08 submission score:     0.7335 



Clearly `lgbm_02_08` is overfitting the data (we could tell this perhaps from the huge number of estimators it is using). 

In [10]:
print('F1_SCORES - PREDICT USING X')

for clf_name, clf_and_submission_score in clfs.items():
    clf = clf_and_submission_score[0]
    submission_score = clf_and_submission_score[1]
    
    y_pred = clf.predict(X[top_14_features])
    X_val_score = f1_score(y, y_pred, average='micro')

    print(f'{clf_name} val score (on X):', X_val_score)
    print(f'{clf_name} submission score:', submission_score, '\n')

F1_SCORES - PREDICT USING X
lgbm_01_30 val score (on X): 0.7761942586559529
lgbm_01_30 submission score: 0.7397 

lgbm_02_02 val score (on X): 0.776355424576268
lgbm_02_02 submission score: 0.7407 

lgbm_02_08 val score (on X): 0.776186584088319
lgbm_02_08 submission score: 0.7335 



This is good! 

Now we see that if we make predictions on `X` as a whole, we see that the models now come out in the correct order. However, the difference between the models is slight and `lgbm_02_08` was only 0.00133 lower than `lgbm_02_02`. Moreover, `lgbm_02_02` is only 0.0001266 higher than `lgbm_01_30` which isn't ideal. What if we do cross-validation?

In [11]:
clfs = init_clfs_dict()

print('F1_SCORES - 10 FOLD STRATIFIED CROSS-VALIDATION')

for clf_name, clf_and_submission_score in clfs.items():
    clf = clf_and_submission_score[0]
    submission_score = clf_and_submission_score[1]
    
#     kfold = KFold(n_splits=10, random_state=7, shuffle=True)
    results = cross_val_score(clf, X[top_14_features], np.ravel(y),
                             cv=10, scoring='f1_micro', n_jobs=-1,
                             verbose=0)

    print(f'{clf_name} f1_score        : {results.mean():.4f} ({results.std():.3f})')
    print(f'{clf_name} submission score:', submission_score, '\n')

F1_SCORES - 10 FOLD STRATIFIED CROSS-VALIDATION
lgbm_01_30 f1_score        : 0.7269 (0.002)
lgbm_01_30 submission score: 0.7397 

lgbm_02_02 f1_score        : 0.7268 (0.003)
lgbm_02_02 submission score: 0.7407 

lgbm_02_08 f1_score        : 0.7355 (0.003)
lgbm_02_08 submission score: 0.7335 



In [12]:
calc_f1_score_X(lgbm_01_30)

NameError: name 'lgbm_01_30' is not defined

In [None]:
calc_f1_score_X(lgbm_02_02)

In [None]:
calc_f1_score_X(lgbm_02_08)

The above scores are BAD representations. Probably because they have been trained on this data. Let's try LightGBM with the above parameters but just train on the training data above. 

In [None]:
# clf_30 = LGBMClassifier(boosting_type='goss', learning_rate=0.2, 
#                         min_child_samples=40, n_estimators=330, num_leaves=90)

# clf_02 = LGBMClassifier(boosting_type='goss', learning_rate=0.2, 
#                         min_child_samples=40, n_estimators=240, num_leaves=120)

# clf_30.fit(X_train[top_14_features], np.ravel(y_train))
# clf_02.fit(X_train[top_14_features], np.ravel(y_train))

# print('30 F1 validation score:', calc_f1_score(clf_30))
# print('02 F1 validation score', calc_f1_score(clf_02))

Try doing KFold cross-val with 5 folds and seeing if this makes a difference.