# Mac CPU Test

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
import pickle
from pprint import pprint

############ USE FOR GOOGLE COLAB ############
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/data')
# SUBMISSIONS_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/submissions')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage/models')

# from google.colab import drive
# drive.mount('/content/drive')
#############################################


### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DATA_DIR = Path('data')
SUBMISSIONS_DIR = Path('submissions')
MODEL_DIR = Path('models')
#######################################

# The code runs the same whether working in Jupyter or Colab; only the
# directories above need to change.

# Training features, one row per building, indexed by building_id.
X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')

# Capture the text-valued columns and the 'has*' flag columns once, before
# encoding. make_submission_top_14_features reuses both lists so the test set
# receives exactly the same dtype conversions.
categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]
X[categorical_columns] = X[categorical_columns].astype('category')
X[bool_columns] = X[bool_columns].astype('bool')

# One-hot encode the categorical columns; the remaining columns are kept as-is.
X = pd.get_dummies(X)

# The labels are paired with X purely by row position later on (np.ravel(y)
# discards the index), so line them up with X by building_id here. .loc raises
# a KeyError if any building in X has no label, rather than letting the model
# train silently on mismatched rows.
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')
y = y.loc[X.index]

In [2]:
# Apply seaborn's default theme to matplotlib.
# NOTE(review): no plots are drawn in the cells visible here — confirm this
# call is still needed.
sns.set()

In [3]:
# The 14 columns the model is trained and evaluated on; the submission helper
# selects the same columns from the test set.
# NOTE(review): presumably chosen from an earlier feature-importance analysis
# that is not part of this notebook — confirm.
most_important_features = [
    # Location
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    # Numeric building attributes
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    # Superstructure flags
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    # Occupancy
    'count_families',
    # Dummy-style column name; must exist after pd.get_dummies has been applied
    'other_floor_type_q',
]

In [4]:
def make_submission_top_14_features(pipeline, title):
    """
    Given a trained pipeline object, use it to make predictions on the
    submission test set 'test_values.csv' and write them to a csv in the
    submissions folder.

    Parameters
    ----------
    pipeline : fitted estimator
        Any object with a ``predict`` method that accepts the
        ``most_important_features`` columns (e.g. a fitted Pipeline or
        search object).
    title : str
        File name (without the '.csv' extension) of the submission to write
        inside SUBMISSIONS_DIR.

    Notes
    -----
    Relies on the module-level ``DATA_DIR``, ``SUBMISSIONS_DIR``,
    ``categorical_columns``, ``bool_columns`` and ``most_important_features``.
    Raises a KeyError if a building listed in 'submission_format.csv' is
    absent from 'test_values.csv'.
    """
    # Read in test_values csv and apply the same preprocessing as the
    # training data.
    # note: will create a data preprocessing pipeline or function in future
    test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
    test_values[categorical_columns] = test_values[categorical_columns].astype('category')
    test_values[bool_columns] = test_values[bool_columns].astype('bool')
    test_values = pd.get_dummies(test_values)
    test_values = test_values[most_important_features]

    # Generate predictions using the pipeline we pass in
    predictions = pipeline.predict(test_values)

    submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv',
                                    index_col='building_id')

    # Label every prediction with the building it was actually made for, then
    # reorder to the official submission row order. Attaching
    # submission_format.index positionally would silently mislabel rows if the
    # two files were ever ordered differently.
    my_submission = pd.DataFrame(data=predictions,
                                 columns=submission_format.columns,
                                 index=test_values.index)
    my_submission = my_submission.loc[submission_format.index]

    # Create the output folder on demand so a fresh checkout does not fail at
    # the very last step.
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
    my_submission.to_csv(SUBMISSIONS_DIR / f'{title}.csv')

In [5]:
# Per-class weights passed to LGBMClassifier(class_weight=...); the keys are
# the target class labels.
# NOTE(review): the values sum to 1 and look like class proportions rather
# than inverse frequencies — confirm this weighting is intended.
class_weights = {1: 0.1, 2: 0.57, 3: 0.33}

In [6]:
# Two-stage pipeline: standardise the features, then fit a LightGBM classifier
# using GOSS boosting and the class weights defined above. The step names
# ('scaler', 'lgbm') are the prefixes used by the search-space keys.
steps = [
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier(boosting_type='goss',
                            class_weight=class_weights,
                            random_state=42)),
]

pipe = Pipeline(steps=steps)

In [7]:
# Search space for the randomized search. Keys use the '<step>__<param>'
# convention so they reach the 'lgbm' step of the pipeline. The boosting type
# is not searched; it is fixed to 'goss' where the classifier is constructed.
param_dist = {
    # 200, 210, ..., 400
    'lgbm__n_estimators': np.arange(200, 410, 10),
    # 60, 70, ..., 120
    'lgbm__num_leaves': np.arange(60, 130, 10),
    'lgbm__learning_rate': [0.1, 0.2, 0.25, 0.3],
    # 30, 40, ..., 100
    'lgbm__min_child_samples': np.arange(30, 110, 10),
}

I want to check how things change with a lower number of estimators, and how that is affected by the learning rate, e.g. set the number of estimators to 50 and the learning rate to 5 (or 100 and 2).

In [8]:
# Randomized search over param_dist: 2500 sampled candidates, each scored with
# 2-fold cross-validation on micro-averaged F1 (5000 fits in total), using all
# available CPU cores.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=2500,
    cv=2,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=10,
    random_state=42,
)

In [9]:
# Run the search on the 14 selected feature columns. The labels frame is
# flattened to a 1-D array before being handed to the estimator.
rs.fit(X=X[most_important_features], y=np.ravel(y))

Fitting 2 folds for each of 2500 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 38.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 41

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('lgbm',
                                              LGBMClassifier(boosting_type='goss',
                                                             class_weight={1: 0.1,
                                                                           2: 0.57,
                                                                           3: 0.33},
                                                             random_state=42))]),
                   n_iter=2500, n_jobs=-1,
                   param_distributions={'lgbm__learning_rate': [0.1, 0.2, 0.25,
                                                                0.3],
                                        'lgbm__min_child_samples': array([ 30,  40,  50,  60,  70,  80,  90, 100]),
                                        'lgbm__n_estimators': array([200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 3

In [10]:
# Keep this as the cell's last expression so the notebook renders the dict
# itself; using print isn't as pretty.
rs.best_params_

{'lgbm__num_leaves': 120,
 'lgbm__n_estimators': 400,
 'lgbm__min_child_samples': 70,
 'lgbm__learning_rate': 0.1}

In [11]:
# Mean cross-validated score (micro F1, per the search's scoring setting) of
# the best parameter combination.
print('Best score:', rs.best_score_)

# Micro F1 of the search's best model on the very rows it was fitted on. This
# is a training-set score, so expect it to be higher than the cross-validated
# score above.
y_pred = rs.predict(X[most_important_features])
f1 = f1_score(y_true=y, y_pred=y_pred, average='micro')
print('F1 score entire dataset:', f1)

Best score: 0.714406314099953
F1 score entire dataset: 0.7507453923814568


In [12]:
# Write the submission file from the fitted search object.
print('Creating submission csv...')
make_submission_top_14_features(rs, '0202 LightGBM GOSS Random Search')

# Persist the whole fitted search object (not just the best estimator).
print('Writing model to hard drive...')
# Create the models folder on demand: without it, open() raises
# FileNotFoundError only after the long-running search has finished.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
pkl_filename = MODEL_DIR / '0202 LightGBM GOSS Random Search.pkl'
with open(pkl_filename, 'wb') as f:
    pickle.dump(rs, f)

print('Finished')

Creating submission csv...
Writing model to hard drive...
Finished
