# Mean and Guessing Models

Let's build some baseline models: one that always guesses the most common class (the mode, damage grade 2) and one that makes smart/weighted random guesses on the test set.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb

from pathlib import Path
from lightgbm import LGBMClassifier
from pprint import pprint

from sklearn.metrics import mean_squared_error, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV 
from sklearn.model_selection import cross_val_score, StratifiedKFold

############ USE FOR GOOGLE COLAB ############
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/data')
# SUBMISSIONS_DIR = Path('drive/MyDrive/Work/Delivery/Current/Earthquake_damage/submissions')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/models')

# from google.colab import drive
# drive.mount('/content/drive')
#############################################


### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DATA_DIR = Path('data')
SUBMISSIONS_DIR = Path('submissions')
MODEL_DIR = Path('models')
#######################################

# Everything below is environment-agnostic: switching between Jupyter and
# Colab only requires changing the three directories above.

# Training features, indexed by building.
X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')

# Object-typed columns become categoricals first, then the 'has*' columns
# become booleans (same order of conversion as before, expressed as two
# chained astype calls with per-column dtype mappings).
categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]
X = (
    X.astype({col: 'category' for col in categorical_columns})
     .astype({col: 'bool' for col in bool_columns})
)

# Expand the categorical columns into indicator (dummy) columns.
X = pd.get_dummies(X)

# Training labels, indexed the same way as the features.
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [2]:
# Apply seaborn's default theme to any matplotlib figures drawn afterwards.
sns.set()

In [3]:
# Template for submission files: its index and columns are reused whenever a
# prediction file is written.
submission_format = pd.read_csv(
    DATA_DIR / 'submission_format.csv', index_col='building_id'
)

# Test-set features, indexed by building.
test_values = pd.read_csv(
    DATA_DIR / 'test_values.csv', index_col='building_id'
)

In [4]:
def submission(predictions=None, title=None):
    """Write predictions to ``SUBMISSIONS_DIR / f'{title}.csv'`` in submission format.

    The column names and the ``building_id`` index are taken from
    ``submission_format.csv`` in ``DATA_DIR``.

    Parameters
    ----------
    predictions : array-like, optional
        Data for the submission frame; it must fit the index and columns
        of the submission format.
    title : str, optional
        File name, without the ``.csv`` suffix, of the file to write.

    Notes
    -----
    The original version took no arguments and read ``predictions`` and
    ``title`` as free names from the notebook's global namespace. To stay
    backward-compatible, an omitted argument still falls back to the global
    of the same name.

    Raises
    ------
    NameError
        If an argument is omitted and no global of that name exists.
    """
    notebook_globals = globals()

    if predictions is None:
        if 'predictions' not in notebook_globals:
            raise NameError(
                "name 'predictions' is not defined; pass it to submission()"
            )
        predictions = notebook_globals['predictions']

    if title is None:
        if 'title' not in notebook_globals:
            raise NameError(
                "name 'title' is not defined; pass it to submission()"
            )
        title = notebook_globals['title']

    submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv',
                                    index_col='building_id')

    my_submission = pd.DataFrame(data=predictions,
                                 columns=submission_format.columns,
                                 index=submission_format.index)

    my_submission.to_csv(SUBMISSIONS_DIR / f'{title}.csv')

In [14]:
# Baseline: predict damage grade 2 for every row of the submission format
all_twos = np.full(len(submission_format), 2, dtype=np.int8)

df_all_twos = pd.DataFrame(
    all_twos,
    index=submission_format.index,
    columns=submission_format.columns,
)

df_all_twos.to_csv(SUBMISSIONS_DIR / 'all_twos.csv')

## Weighted Guessing

We can see that roughly 10% of houses are damage grade 1, 57% are grade 2 and 33% are grade 3. So, let's create a classifier that guesses each grade that percentage of the time.

In [6]:
# Fraction of training labels in each damage grade (normalize=True returns
# shares instead of counts; sort=False skips ordering by frequency).
y.value_counts(sort=False, normalize=True)

damage_grade
1               0.096408
2               0.568912
3               0.334680
dtype: float64

In [16]:
# Number of rows in the submission format, i.e. how many guesses are needed.
len(submission_format)

86868

In [17]:
# Weighted guessing: size each class by its rounded share of the labels
# (10% grade 1, 57% grade 2, 33% grade 3).
num_1 = int(len(submission_format) * 0.1)
num_3 = int(len(submission_format) * 0.33)
# int() truncates, so the three truncated counts can fall short of the total.
# Rather than hard-coding the shortfall, give whatever is left to class 2 (the
# modal class) so the counts always sum to len(submission_format).
num_2 = len(submission_format) - num_1 - num_3

In [18]:
# Sanity check: the three class counts together cover every submission row.
num_1 + num_2 + num_3 == len(submission_format)

True

In [19]:
# Total number of guesses across the three classes.
num_1 + num_2 + num_3

86868

In [25]:
# Number of grade-1 guesses.
num_1

8686

In [31]:
# One guess per submission row: num_1 ones, num_2 twos and num_3 threes.
weighted_guess = [1] * num_1 + [2] * num_2 + [3] * num_3

# Shuffle in place with a seeded, local generator so that a fresh
# Restart & Run All writes the same submission file every time and the
# global NumPy random state is left untouched.
np.random.default_rng(42).shuffle(weighted_guess)

In [33]:
# Wrap the shuffled guesses in the submission layout and write them to disk.
df_weighted_guess = pd.DataFrame(
    weighted_guess,
    index=submission_format.index,
    columns=submission_format.columns,
)

df_weighted_guess.to_csv(SUBMISSIONS_DIR / 'weighted_guess.csv')

## Calculating F1 Score on X

In [38]:
# Score recorded for the all-twos submission (hard-coded, not computed here).
ALL_TWO_SUBMISSION_SCORE = 0.5670

# Same baseline evaluated on the full training set: every label guessed as 2.
all_twos = [2] * len(X)
all_two_score = f1_score(y, all_twos, average='micro')

print('F1 score on X:         ', all_two_score)
print('F1 score on submission:', ALL_TWO_SUBMISSION_SCORE)

F1 score on X:          0.5689118614280068
F1 score on submission: 0.567


In [39]:
def make_weighted_guess(data, proportions=(0.1, 0.57, 0.33), random_state=None):
    """Return shuffled damage-grade guesses, one per element of ``data``.

    Each grade (1, 2, 3) appears in a fixed share of the guesses; the order
    of the guesses is then shuffled.

    Parameters
    ----------
    data : sized
        Only ``len(data)`` is used, to decide how many guesses to make.
    proportions : sequence of three floats, optional
        Target shares of grades 1, 2 and 3, in that order. They must be
        non-negative and sum to 1. The default matches the values that were
        previously hard-coded.
    random_state : int, optional
        Seed for a local ``numpy.random.default_rng`` generator, for
        reproducible output. When ``None`` (the default) the legacy global
        ``np.random.shuffle`` is used, exactly as before, so an existing
        ``np.random.seed`` call keeps its effect.

    Returns
    -------
    list of int
        ``len(data)`` guesses drawn from ``{1, 2, 3}``.

    Raises
    ------
    ValueError
        If ``proportions`` does not hold three non-negative values that
        sum to 1.
    """
    proportions = tuple(proportions)
    if len(proportions) != 3 or min(proportions) < 0:
        raise ValueError('proportions must be three non-negative numbers')
    if abs(sum(proportions) - 1.0) > 1e-6:
        raise ValueError('proportions must sum to 1')

    total = len(data)
    counts = [int(total * share) for share in proportions]

    # int() truncates, so the counts will not always add up to len(data).
    # Assign the difference to the class with the largest share (class 2,
    # the modal class, for the default proportions).
    modal = max(range(len(proportions)), key=lambda i: proportions[i])
    counts[modal] += total - sum(counts)

    guesses = [grade
               for grade, count in zip((1, 2, 3), counts)
               for _ in range(count)]

    if random_state is None:
        np.random.shuffle(guesses)
    else:
        np.random.default_rng(random_state).shuffle(guesses)

    return guesses

In [40]:
# Score recorded for the weighted-guess submission (hard-coded, not computed here).
WEIGHTED_GUESS_SUBMISSION_SCORE = 0.4441

# Weighted random guesses, one per training row, scored against the labels.
weighted_pred = make_weighted_guess(X)
weighted_score = f1_score(y, weighted_pred, average='micro')

print('F1 score on X:         ', weighted_score)
print('F1 score on submission:', WEIGHTED_GUESS_SUBMISSION_SCORE)

F1 score on X:          0.44396222577810524
F1 score on submission: 0.4441


## Calculating F1 Score on X_Val

In [41]:
# Hold out 20% of the training data as a validation set. The split is
# stratified on y and uses a fixed random_state so it is reproducible.
# train_test_split is already imported in the notebook's first cell, so it is
# not re-imported here.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [42]:
# All-twos baseline scored on the held-out validation split.
all_twos_X_val = [2] * len(X_val)
all_two_X_val_score = f1_score(y_val, all_twos_X_val, average='micro')

# The label names X_val because this score is computed on the validation
# split, not on the full X.
print('F1 score on X_val:     ', all_two_X_val_score)
print('F1 score on submission:', ALL_TWO_SUBMISSION_SCORE)

F1 score on X:          0.5689069664818404
F1 score on submission: 0.567


In [43]:
# Weighted random guesses scored on the held-out validation split.
weighted_pred_X_val = make_weighted_guess(X_val)
weighted_score_X_val = f1_score(y_val, weighted_pred_X_val, average='micro')

# The label names X_val because this score is computed on the validation
# split, not on the full X.
print('F1 score on X_val:     ', weighted_score_X_val)
print('F1 score on submission:', WEIGHTED_GUESS_SUBMISSION_SCORE)

F1 score on X:          0.4440244814949828
F1 score on submission: 0.4441


As expected, the scores on X and X_val are almost identical to the actual scores obtained upon submission.