# TL;DR

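Both of these evaluation schemes rank the three models in the same order as their leaderboard submission scores, so try them both (note that in both cases the model has already seen the evaluation rows during training, so the absolute scores are optimistic):
- Train on X, evaluate on X
- Train on X, evaluate on X_val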

Cross-validation (CV) does not seem to detect the overfitting caused by excessive hyperparameter tuning.

# Creating Validation Set

It will be much faster to train the models if I can create a validation set that represents the test set. I want a validation set where if a model has a better score on it, the model will have a better score on the submissions.

I will aim to create a simple hold-out validation set for maximum speed (this has a high chance of working, given that all the folds in cross-validation seem to give the same results anyway).

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb

from pathlib import Path
from lightgbm import LGBMClassifier
from pprint import pprint

from sklearn.metrics import mean_squared_error, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV 
from sklearn.model_selection import cross_val_score, StratifiedKFold



# ------------------------- Google Colab paths -------------------------
# Uncomment this section (and comment out the local paths below) when
# running on Colab.
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/data')
# SUBMISSIONS_DIR = Path('drive/MyDrive/Work/Delivery/Current/Earthquake_damage/submissions')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/models')
#
# from google.colab import drive
# drive.mount('/content/drive')
# ----------------------------------------------------------------------

# ------------------------ Local Jupyter paths -------------------------
DATA_DIR = Path('data')
SUBMISSIONS_DIR = Path('submissions')
MODEL_DIR = Path('models')
# ----------------------------------------------------------------------

# Everything below runs identically on Jupyter and Colab; only the
# directories above need to be switched.

# Feature table, indexed by building_id.
X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')

# Object (text) columns become categoricals and the 'has*' columns become
# booleans before the frame is one-hot encoded.
categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]
dtype_overrides = {col: 'category' for col in categorical_columns}
dtype_overrides.update({col: 'bool' for col in bool_columns})

X = pd.get_dummies(X.astype(dtype_overrides))

# Label table, indexed by building_id.
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [13]:
# Apply seaborn's default theme to the plots drawn in this notebook.
sns.set()

In [14]:
# Shuffled, stratified hold-out split: 33% of the rows form the validation
# set, with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [6]:
# Relative frequency of each label value in the validation labels
# (sort=False: not ordered by frequency).
y_val.value_counts(normalize=True, sort=False)

damage_grade
1               0.096408
2               0.568914
3               0.334678
dtype: float64

In [7]:
# Relative frequency of each label value in the training labels
# (sort=False: not ordered by frequency).
y_train.value_counts(normalize=True, sort=False)

damage_grade
1               0.096408
2               0.568911
3               0.334681
dtype: float64

The class distributions of the training and validation labels are nearly identical. Good.

In [15]:
# The 14 feature columns used by the fit/predict calls in this notebook.
top_14_features = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'count_families',
    'other_floor_type_q',
]

In [16]:
def calc_f1_score_X_val(model):
    """
    Return the micro-averaged F1 score of `model` on the hold-out split.

    `model` must already be fitted; it predicts on the `top_14_features`
    columns of the notebook-level `X_val` and is scored against `y_val`.
    """
    val_features = X_val[top_14_features]
    return f1_score(y_val, model.predict(val_features), average='micro')

def calc_f1_score_X(model):
    """
    Return the micro-averaged F1 score of `model` on the full data set.

    `model` must already be fitted; it predicts on the `top_14_features`
    columns of the notebook-level `X` and is scored against `y`.
    """
    all_features = X[top_14_features]
    return f1_score(y, model.predict(all_features), average='micro')

In [17]:
def init_clfs_dict():
    """
    Build fresh, unfitted LightGBM classifiers paired with the submission
    score each configuration achieved.

    Returns a dict mapping the model name to a two-element list
    ``[classifier, submission_score]``.
    """
    # Settings shared by every model.
    shared_params = {'boosting_type': 'goss', 'random_state': 42}

    # name -> (submission score, model-specific hyperparameters)
    model_specs = {
        'lgbm_01_30': (0.7397, {'learning_rate': 0.2,
                                'min_child_samples': 40,
                                'n_estimators': 330,
                                'num_leaves': 90}),
        'lgbm_02_02': (0.7407, {'learning_rate': 0.2,
                                'min_child_samples': 40,
                                'n_estimators': 240,
                                'num_leaves': 120}),
        'lgbm_02_08': (0.7335, {'learning_rate': 0.1,
                                'min_child_samples': 70,
                                'n_estimators': 400,
                                'num_leaves': 120}),
    }

    return {
        name: [LGBMClassifier(**shared_params, **params), score]
        for name, (score, params) in model_specs.items()
    }

In [9]:
# Baseline experiment: fit every classifier on the training split only and
# score it on the held-out validation split.
clfs = init_clfs_dict()

print('F1_SCORES - FIT_PREDICT USING X_TRAIN AND X_VAL')

for clf_name, (clf, submission_score) in clfs.items():
    # `verbose` is intentionally not passed to fit(): it only controlled
    # eval-set logging (no eval_set is used here) and LightGBM >= 4.0 rejects
    # the argument.
    clf.fit(X_train[top_14_features], np.ravel(y_train))
    y_pred = clf.predict(X_val[top_14_features])
    val_score = f1_score(y_val, y_pred, average='micro')

    print(f'{clf_name} val score (on X_val):'  , val_score)
    print(f'{clf_name} submission score:    ', submission_score, '\n')

F1_SCORES - FIT_PREDICT USING X_TRAIN AND X_VAL
lgbm_01_30 val score (on X_val): 0.7244502843056314
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7247526134024814
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7334736450423841
lgbm_02_08 submission score:     0.7335 



`lgbm_02_08` has the highest score on the hold-out set but the lowest submission score, which suggests it is overfitting the data (we could perhaps have guessed this from the large number of estimators it uses).

In [10]:
# Score the classifiers currently stored in `clfs` (they must already be
# fitted) on the full data set X.
print('F1_SCORES - PREDICT USING X')

for clf_name, (clf, submission_score) in clfs.items():
    X_val_score = calc_f1_score_X(clf)

    print(f'{clf_name} val score (on X):', X_val_score)
    print(f'{clf_name} submission score:', submission_score, '\n')

F1_SCORES - PREDICT USING X
lgbm_01_30 val score (on X): 0.7761942586559529
lgbm_01_30 submission score: 0.7397 

lgbm_02_02 val score (on X): 0.776355424576268
lgbm_02_02 submission score: 0.7407 

lgbm_02_08 val score (on X): 0.776186584088319
lgbm_02_08 submission score: 0.7335 



This is good! 

If we train on `X_train` and evaluate on X, this seems to be a true indication of the submission ranking...

What if we train on X and evaluate on X_val or train on X and evaluate on X?

In [20]:
# Train on X, evaluate on X_val
# NOTE: X_val was split off from X, so every validation row is also a
# training row here. The scores are therefore computed on data the model has
# already seen and are only useful for comparing models with each other.
clfs = init_clfs_dict()

print('Train on X, evaluate on X_val'.upper())

for clf_name, (clf, submission_score) in clfs.items():
    # `verbose` is intentionally not passed to fit(): it only controlled
    # eval-set logging (no eval_set is used here) and LightGBM >= 4.0 rejects
    # the argument.
    clf.fit(X[top_14_features], np.ravel(y))

    y_pred = clf.predict(X_val[top_14_features])
    val_score = f1_score(y_val, y_pred, average='micro')

    print(f'{clf_name} val score (on X_val):'  , val_score)
    print(f'{clf_name} submission score:    ', submission_score, '\n')

TRAIN ON X, EVALUATE ON X_VAL
lgbm_01_30 val score (on X_val): 0.7890673147362179
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7891370829893372
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.785823090966174
lgbm_02_08 submission score:     0.7335 



GOOD! Now the score on X_val is lower for `lgbm_02_08` than for the other two models, and the ordering of the eval scores matches the ordering of the submission scores exactly. The absolute values are skewed upwards, which is expected: X_val is a subset of X, so the models were trained on these rows. With this setup, anything above 0.789 should be an improvement.

In [21]:
# Train on X and evaluate on X
# NOTE: this is a training-set score (the model is evaluated on the rows it
# was fitted on), so it is only useful for comparing models with each other.
clfs = init_clfs_dict()

print('Train on X, evaluate on X'.upper())

for clf_name, (clf, submission_score) in clfs.items():
    # `verbose` is intentionally not passed to fit(): it only controlled
    # eval-set logging (no eval_set is used here) and LightGBM >= 4.0 rejects
    # the argument.
    clf.fit(X[top_14_features], np.ravel(y))

    y_pred = clf.predict(X[top_14_features])
    val_score = f1_score(y, y_pred, average='micro')

    # The score is computed on the full X, so the label says "on X".
    print(f'{clf_name} val score (on X):', val_score)
    print(f'{clf_name} submission score:', submission_score, '\n')

TRAIN ON X, EVALUATE ON X
lgbm_01_30 val score (on X_val): 0.7870499345743109
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7879785572580305
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7849279166234973
lgbm_02_08 submission score:     0.7335 



This works too: if we train on X and evaluate on X, the scores again separate the models clearly and in the same order as the submission scores. It is odd that training on X_train and evaluating on X_val gives a different ordering.

In [30]:
# Score every classifier with 5-fold stratified cross-validation on the
# full data set, using all available cores.
clfs = init_clfs_dict()

print('F1_SCORES - 5 FOLD STRATIFIED CROSS-VALIDATION')

for clf_name, (clf, submission_score) in clfs.items():
    # An explicit unshuffled StratifiedKFold is what cv=5 resolves to for a
    # classifier with multiclass labels.
    results = cross_val_score(
        clf,
        X[top_14_features],
        np.ravel(y),
        scoring='f1_micro',
        cv=StratifiedKFold(n_splits=5),
        n_jobs=-1,
        verbose=0,
    )

    print(f'{clf_name} f1_score        : {results.mean()} ({results.std():.4f})')
    print(f'{clf_name} submission score:', submission_score, '\n')

F1_SCORES - 5 FOLD STRATIFIED CROSS-VALIDATION
lgbm_01_30 f1_score        : 0.7255344373046112 (0.0029)
lgbm_01_30 submission score: 0.7397 

lgbm_02_02 f1_score        : 0.7257838562617362 (0.0021)
lgbm_02_02 submission score: 0.7407 

lgbm_02_08 f1_score        : 0.7339150593305502 (0.0038)
lgbm_02_08 submission score: 0.7335 



Stratified K-fold cross-validation is not a good predictor of model performance for this dataset or this problem. It does not pick up that `lgbm_02_08` overfits the data.

## Using Different train_test_split percentages

See if using different sizes of validation splits with `train_test_split` leads to results in line with the leaderboard. It does not. 

None of the validation split sizes give results that reflect the leaderboard. Indeed, they often perfectly reflect the reverse leaderboard scores... how is this the case?

In [22]:
def train_val_split(val_size=0.33):
    """
    Split the notebook-level `X` and `y` into training and validation parts.

    `val_size` is forwarded as `test_size` to `train_test_split`. The split
    is shuffled, stratified on `y` and seeded with 42. Returns the
    `train_test_split` result, which callers unpack as
    ``X_train, X_val, y_train, y_val``.
    """
    split_options = {
        'test_size': val_size,
        'shuffle': True,
        'stratify': y,
        'random_state': 42,
    }
    return train_test_split(X, y, **split_options)

In [27]:
def train_and_eval_custom_val_split_size(val_size):
    """
    Fit every classifier on a fresh train/validation split and print scores.

    The data is split with `train_val_split(val_size)`. Each classifier from
    `init_clfs_dict()` is fitted on the training part and scored
    (micro-averaged F1) on the validation part. Both the validation score and
    the model's known submission score are printed. Nothing is returned.

    The split variables are local to this function and do not modify the
    notebook-level `X_train` / `X_val` / `y_train` / `y_val`.
    """
    X_train, X_val, y_train, y_val = train_val_split(val_size)

    clfs = init_clfs_dict()

    print(f'Validation split: {val_size}')

    for clf_name, (clf, submission_score) in clfs.items():
        # `verbose` is intentionally not passed to fit(): it only controlled
        # eval-set logging (no eval_set is used here) and LightGBM >= 4.0
        # rejects the argument.
        clf.fit(X_train[top_14_features], np.ravel(y_train))

        y_pred = clf.predict(X_val[top_14_features])
        val_score = f1_score(y_val, y_pred, average='micro')

        print(f'{clf_name} val score (on X_val):'  , val_score)
        print(f'{clf_name} submission score:    ', submission_score, '\n')

In [28]:
# Validation-set fractions to try; each one triggers a fresh split and a
# full refit of every classifier.
val_splits = [0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5]
for val_split in val_splits:
    train_and_eval_custom_val_split_size(val_split)

Validation split: 0.1
lgbm_01_30 val score (on X_val): 0.7275238862668356
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7272936571889029
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7363493342542495
lgbm_02_08 submission score:     0.7335 

Validation split: 0.2
lgbm_01_30 val score (on X_val): 0.7269814470175169
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7261372575353504
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7359605533278333
lgbm_02_08 submission score:     0.7335 

Validation split: 0.25
lgbm_01_30 val score (on X_val): 0.7272797040720789
lgbm_01_30 submission score:     0.7397 

lgbm_02_02 val score (on X_val): 0.7237954904759712
lgbm_02_02 submission score:     0.7407 

lgbm_02_08 val score (on X_val): 0.7369802458903163
lgbm_02_08 submission score:     0.7335 

Validation split: 0.3
lgbm_01_30 val score (on X_val): 0.7256494544710351
lgbm_01_30 su