In [1]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from tqdm.notebook import trange, tqdm

### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DOWNLOAD_DIR = Path('../download')
DATA_DIR = Path('../data')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

##### GOOGLE COLAB ######
# DOWNLOAD_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/download')
# SUBMISSIONS_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/submissions')
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/data')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/model')
########################

# Load the competition CSVs; every frame is indexed by building_id.
X = pd.read_csv(DOWNLOAD_DIR.joinpath('train_values.csv'), index_col='building_id')
X_test = pd.read_csv(DOWNLOAD_DIR.joinpath('test_values.csv'), index_col='building_id')
y = pd.read_csv(DOWNLOAD_DIR.joinpath('train_labels.csv'), index_col='building_id')

# Column groups derived from the training frame: the object-dtype columns
# and the columns whose names start with 'has'.
categorical_columns = X.select_dtypes(include=['object']).columns
bool_columns = list(filter(lambda name: name.startswith('has'), X.columns))

In [2]:
sns.set()

In [3]:
import wandb
wandb.login()

True

In [4]:
X_test.shape

(86868, 38)

In [5]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Encoder for the target labels.
label_enc = LabelEncoder()

# Ordinal-encode only the object-dtype columns; every other column is
# passed through untouched (remainder='passthrough').
t = [('ord_encoder', OrdinalEncoder(dtype=int), categorical_columns)]
ct = ColumnTransformer(t, remainder='passthrough')

In [6]:
# Fit the column transformer on the training frame and encode it in one step.
X_all_ints = ct.fit_transform(X)
# Flatten the labels to 1D before fitting/applying the label encoder.
y = label_enc.fit_transform(np.asarray(y).ravel())

In [7]:
# The transformed (object-dtype) columns are listed first, followed by the
# remaining columns, to describe the column order of the encoded matrix.
# Note: unlike list.append, which modifies a Python list in place, pandas'
# Index.append returns a new object and leaves the original unmodified.
not_categorical_columns = X.select_dtypes(exclude=['object']).columns
cols_ordered_after_ordinal_encoding = pd.Index(
    list(categorical_columns) + list(not_categorical_columns)
)

In [8]:
# The three geo id columns are added to the set of columns treated as
# categorical, after the object-dtype ones.
geo_cols = pd.Index([
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
])
cat_cols_plus_geo = pd.Index(list(categorical_columns) + list(geo_cols))

In [9]:
# LightGBM Dataset over the full encoded training matrix, with explicit
# feature names and the categorical columns (object-dtype + geo ids).
train_data = lgb.Dataset(
    X_all_ints,
    label=y,
    feature_name=cols_ordered_after_ordinal_encoding.tolist(),
    categorical_feature=cat_cols_plus_geo.tolist(),
)

In [10]:
# Taken from the docs for lgb.train and lgb.cv
# Helpful Stackoverflow answer: 
# https://stackoverflow.com/questions/50931168/f1-score-metric-in-lightgbm
from sklearn.metrics import f1_score

def get_ith_pred(preds, i, num_data, num_class):
    """
    Return the predicted class label for row i of a multiclass task.

    preds: 1D NumPy array
        Flattened predicted scores of shape (num_data * num_class,),
        grouped by class first and then by row: the score of class c
        for row r is preds[c * num_data + r]. E.g. with 100 rows and
        3 classes, preds has shape (300,).
    i: int
        The row/sample whose prediction is wanted.
    num_data: int
        The number of rows/samples the predictions were made for.
    num_class: int
        The number of classes in the classification task.
        Must be greater than 2.

    Returns the index of the class with the highest score for row i,
    i.e. a hard label of the kind sklearn's f1_score expects instead of
    probabilities.
    """
    # preds has a different form for binary classification, so this
    # helper refuses anything but a true multiclass task
    assert num_class > 2

    # Collect row i's score for every class, in class order
    row_scores = []
    for class_label in range(num_class):
        row_scores.append(preds[class_label * num_data + i])

    # The class with the highest score is the prediction
    return np.argmax(row_scores)
    
def lgb_f1_micro(preds, train_data):
    """
    Custom LightGBM evaluation function (feval): micro-averaged F1.

    preds: NumPy array
        Predicted scores. Either the flat 1D layout grouped by class
        first and then by row (class c, row r at preds[c * num_data + r]),
        or a 2D array of shape (num_data, num_class).
    train_data: lgb.Dataset
        The dataset being evaluated; only get_label() is used.

    Returns the tuple (eval_name, eval_result, is_higher_better), here
    ('f1', <micro F1>, True).

    Raises ValueError if there is only one score per row (binary-style
    predictions), because there is then no class axis to take argmax over.
    """
    y_true = train_data.get_label()
    num_data = len(y_true)

    scores = np.asarray(preds)
    if scores.ndim == 1:
        # Flat class-major layout -> (num_class, num_data) -> (num_data, num_class).
        # The number of classes is inferred from the array size.
        scores = scores.reshape(-1, num_data).T

    if scores.shape[1] < 2:
        raise ValueError('lgb_f1_micro needs one score per class for every row; '
                         'binary-style predictions are not supported')

    # Hard labels: the highest-scoring class of every row, computed at once
    y_pred = scores.argmax(axis=1)

    return 'f1', f1_score(y_true, y_pred, average='micro'), True

In [11]:
def get_train_val_datasets(X, y, train_idx, val_idx,
                           feature_name='auto', categorical_feature='auto'):
    """
    Build the LightGBM training and validation Datasets for one fold.

    X, y:
        Feature matrix and labels; both are indexed with the index arrays
        (X[train_idx], y[train_idx]), so they are expected to support
        integer-array row indexing (e.g. NumPy arrays).
    train_idx, val_idx:
        Row indices of the training and validation splits.
    feature_name, categorical_feature:
        Forwarded to the training lgb.Dataset. The 'auto' defaults are
        lgb.Dataset's own defaults, so existing four-argument calls
        behave as before.

    Returns (train_dataset, val_dataset).
    """
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = lgb.Dataset(X_train, label=y_train,
                                feature_name=feature_name,
                                categorical_feature=categorical_feature,
                                free_raw_data=False)
    # A validation Dataset should reference its training Dataset so both
    # share the same feature binning and feature metadata.
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset,
                              free_raw_data=False)
    return train_dataset, val_dataset

In [12]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import StratifiedKFold

### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DOWNLOAD_DIR = Path('../download')
DATA_DIR = Path('../data')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

##### GOOGLE COLAB ######
# DOWNLOAD_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/download')
# SUBMISSIONS_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/submissions')
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/data')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/model')
########################

# Load the competition CSVs; every frame is indexed by building_id.
X = pd.read_csv(DOWNLOAD_DIR.joinpath('train_values.csv'), index_col='building_id')
X_test = pd.read_csv(DOWNLOAD_DIR.joinpath('test_values.csv'), index_col='building_id')
y = pd.read_csv(DOWNLOAD_DIR.joinpath('train_labels.csv'), index_col='building_id')

# Column groups derived from the training frame: the object-dtype columns
# and the columns whose names start with 'has'.
categorical_columns = X.select_dtypes(include=['object']).columns
bool_columns = list(filter(lambda name: name.startswith('has'), X.columns))

In [13]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import StratifiedKFold

### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DOWNLOAD_DIR = Path('../download')
DATA_DIR = Path('../data')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

##### GOOGLE COLAB ######
# DOWNLOAD_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/download')
# SUBMISSIONS_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/submissions')
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/data')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/model')
########################

# Load the competition CSVs; every frame is indexed by building_id.
X = pd.read_csv(DOWNLOAD_DIR.joinpath('train_values.csv'), index_col='building_id')
X_test = pd.read_csv(DOWNLOAD_DIR.joinpath('test_values.csv'), index_col='building_id')
y = pd.read_csv(DOWNLOAD_DIR.joinpath('train_labels.csv'), index_col='building_id')

# Column groups derived from the training frame: the object-dtype columns
# and the columns whose names start with 'has'.
categorical_columns = X.select_dtypes(include=['object']).columns
bool_columns = list(filter(lambda name: name.startswith('has'), X.columns))

In [14]:
sns.set()

In [15]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Encoder for the target labels.
label_enc = LabelEncoder()

# Ordinal-encode only the object-dtype columns; every other column is
# passed through untouched (remainder='passthrough').
t = [('ord_encoder', OrdinalEncoder(dtype=int), categorical_columns)]
ct = ColumnTransformer(t, remainder='passthrough')

In [16]:
# Fit the column transformer on the training frame and encode it in one step.
X_all_ints = ct.fit_transform(X)
# Flatten the labels to 1D before fitting/applying the label encoder.
y = label_enc.fit_transform(np.asarray(y).ravel())

In [17]:
# The transformed (object-dtype) columns are listed first, followed by the
# remaining columns, to describe the column order of the encoded matrix.
# Note: unlike list.append, which modifies a Python list in place, pandas'
# Index.append returns a new object and leaves the original unmodified.
not_categorical_columns = X.select_dtypes(exclude=['object']).columns
cols_ordered_after_ordinal_encoding = pd.Index(
    list(categorical_columns) + list(not_categorical_columns)
)

In [18]:
# The three geo id columns are added to the set of columns treated as
# categorical, after the object-dtype ones.
geo_cols = pd.Index([
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
])
cat_cols_plus_geo = pd.Index(list(categorical_columns) + list(geo_cols))

In [19]:
# LightGBM Dataset over the full encoded training matrix, with explicit
# feature names and the categorical columns (object-dtype + geo ids).
train_data = lgb.Dataset(
    X_all_ints,
    label=y,
    feature_name=cols_ordered_after_ordinal_encoding.tolist(),
    categorical_feature=cat_cols_plus_geo.tolist(),
)

In [20]:
# Taken from the docs for lgb.train and lgb.cv
# Helpful Stackoverflow answer: 
# https://stackoverflow.com/questions/50931168/f1-score-metric-in-lightgbm
from sklearn.metrics import f1_score

def get_ith_pred(preds, i, num_data, num_class):
    """
    Return the predicted class label for row i of a multiclass task.

    preds: 1D NumPy array
        Flattened predicted scores of shape (num_data * num_class,),
        grouped by class first and then by row: the score of class c
        for row r is preds[c * num_data + r]. E.g. with 100 rows and
        3 classes, preds has shape (300,).
    i: int
        The row/sample whose prediction is wanted.
    num_data: int
        The number of rows/samples the predictions were made for.
    num_class: int
        The number of classes in the classification task.
        Must be greater than 2.

    Returns the index of the class with the highest score for row i,
    i.e. a hard label of the kind sklearn's f1_score expects instead of
    probabilities.
    """
    # preds has a different form for binary classification, so this
    # helper refuses anything but a true multiclass task
    assert num_class > 2

    # Collect row i's score for every class, in class order
    row_scores = []
    for class_label in range(num_class):
        row_scores.append(preds[class_label * num_data + i])

    # The class with the highest score is the prediction
    return np.argmax(row_scores)
    
def lgb_f1_micro(preds, train_data):
    """
    Custom LightGBM evaluation function (feval): micro-averaged F1.

    preds: NumPy array
        Predicted scores. Either the flat 1D layout grouped by class
        first and then by row (class c, row r at preds[c * num_data + r]),
        or a 2D array of shape (num_data, num_class).
    train_data: lgb.Dataset
        The dataset being evaluated; only get_label() is used.

    Returns the tuple (eval_name, eval_result, is_higher_better), here
    ('f1', <micro F1>, True).

    Raises ValueError if there is only one score per row (binary-style
    predictions), because there is then no class axis to take argmax over.
    """
    y_true = train_data.get_label()
    num_data = len(y_true)

    scores = np.asarray(preds)
    if scores.ndim == 1:
        # Flat class-major layout -> (num_class, num_data) -> (num_data, num_class).
        # The number of classes is inferred from the array size.
        scores = scores.reshape(-1, num_data).T

    if scores.shape[1] < 2:
        raise ValueError('lgb_f1_micro needs one score per class for every row; '
                         'binary-style predictions are not supported')

    # Hard labels: the highest-scoring class of every row, computed at once
    y_pred = scores.argmax(axis=1)

    return 'f1', f1_score(y_true, y_pred, average='micro'), True

In [21]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: type of the index container and validation-fold size
    print(type(val_idx), len(val_idx))
    # Fixed: the comma between `y` and `train_idx` was missing (SyntaxError)
    train_dataset, val_dataset = get_train_val_datasets(X_all_ints, y,
                                                       train_idx, val_idx)
    # Fixed: NumPy's constructor is np.zeros (np.zeroes does not exist).
    # One zero-initialised slot per validation row.
    bagged_preds = np.zeros(len(val_idx))

In [22]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: type of the index container and validation-fold size
    print(type(val_idx), len(val_idx))
    train_dataset, val_dataset = get_train_val_datasets(X_all_ints, y,
                                                       train_idx, val_idx)
    # Fixed: NumPy's constructor is np.zeros (np.zeroes raises AttributeError).
    # One zero-initialised slot per validation row.
    bagged_preds = np.zeros(len(val_idx))

In [23]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: type of the index container and validation-fold size
    print(type(val_idx), len(val_idx))
    train_dataset, val_dataset = get_train_val_datasets(
        X_all_ints, y, train_idx, val_idx
    )
    # One zero-initialised slot per validation row
    bagged_preds = np.zeros(len(val_idx))

In [24]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: index container type, validation-fold size and shape
    print(type(val_idx), len(val_idx), val_idx.shape)
    train_dataset, val_dataset = get_train_val_datasets(
        X_all_ints, y, train_idx, val_idx
    )
    # One zero-initialised slot per validation row
    bagged_preds = np.zeros(len(val_idx))

In [25]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: index container type, validation-fold size and shape
    print(type(val_idx), len(val_idx), val_idx.shape)
    train_dataset, val_dataset = get_train_val_datasets(
        X_all_ints, y, train_idx, val_idx
    )
    # Placeholder for bagged model building and evaluation; for now only
    # the row count reported by the validation Dataset is printed
    print(val_dataset.num_data())
    

In [26]:
def get_train_val_datasets(X, y, train_idx, val_idx,
                           feature_name='auto', categorical_feature='auto'):
    """
    Build and construct the LightGBM training and validation Datasets
    for one fold.

    X, y:
        Feature matrix and labels; both are indexed with the index arrays
        (X[train_idx], y[train_idx]), so they are expected to support
        integer-array row indexing (e.g. NumPy arrays).
    train_idx, val_idx:
        Row indices of the training and validation splits.
    feature_name, categorical_feature:
        Forwarded to the training lgb.Dataset. The 'auto' defaults are
        lgb.Dataset's own defaults, so existing four-argument calls
        behave as before.

    Returns (train_dataset, val_dataset), both already constructed so that
    methods such as num_data() can be called on them straight away.
    """
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = lgb.Dataset(X_train, label=y_train,
                                feature_name=feature_name,
                                categorical_feature=categorical_feature,
                                free_raw_data=False)
    # A validation Dataset should reference its training Dataset so both
    # share the same feature binning and feature metadata.
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset,
                              free_raw_data=False)
    train_dataset.construct()
    val_dataset.construct()
    return train_dataset, val_dataset


def eval_bagged_model(config, num_bags, train_dataset, val_dataset):
    """
    Work-in-progress stub for evaluating a bag of models.

    Only the accumulator is allocated so far; config, num_bags and
    train_dataset are not used yet and nothing is returned.
    """
    # Fixed: np.zeros must be called -- the bare name only bound the
    # function object. One slot per validation row.
    bagged_preds = np.zeros(val_dataset.num_data())

In [27]:
# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation loop
for train_idx, val_idx in skf.split(X_all_ints, y):
    # Debug output: index container type, validation-fold size and shape
    print(type(val_idx), len(val_idx), val_idx.shape)
    train_dataset, val_dataset = get_train_val_datasets(
        X_all_ints, y, train_idx, val_idx
    )
    # Placeholder for bagged model building and evaluation; for now only
    # the row count reported by the validation Dataset is printed
    print(val_dataset.num_data())
    

In [28]:
def get_train_val_datasets(X, y, train_idx, val_idx,
                           feature_name='auto', categorical_feature='auto'):
    """
    Build and construct the LightGBM training and validation Datasets
    for one fold.

    X, y:
        Feature matrix and labels; both are indexed with the index arrays
        (X[train_idx], y[train_idx]), so they are expected to support
        integer-array row indexing (e.g. NumPy arrays).
    train_idx, val_idx:
        Row indices of the training and validation splits.
    feature_name, categorical_feature:
        Forwarded to the training lgb.Dataset. The 'auto' defaults are
        lgb.Dataset's own defaults, so existing four-argument calls
        behave as before.

    Returns (train_dataset, val_dataset), both already constructed so that
    methods such as num_data() can be called on them straight away.
    """
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = lgb.Dataset(X_train, label=y_train,
                                feature_name=feature_name,
                                categorical_feature=categorical_feature,
                                free_raw_data=False)
    # A validation Dataset should reference its training Dataset so both
    # share the same feature binning and feature metadata.
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset,
                              free_raw_data=False)
    train_dataset.construct()
    val_dataset.construct()
    return train_dataset, val_dataset


def train_lgbm_model(config, train_dataset, val_dataset):
    """
    Train one LightGBM booster and record its per-iteration metrics.

    config: dict
        LightGBM parameters, passed to lgb.train as-is.
    train_dataset, val_dataset: lgb.Dataset
        Training and validation data; both are evaluated every
        iteration under the names 'train' and 'val'.

    Returns (booster, evals_result), where evals_result is the dict
    filled in by the record_evaluation callback.
    """
    evals_result = {}
    # Fixed: a stray closing parenthesis after the call was a SyntaxError.
    # Metric history is collected with the record_evaluation callback
    # instead of the deprecated `evals_result=` keyword of lgb.train.
    booster = lgb.train(config,
                        train_dataset,
                        valid_sets=[train_dataset, val_dataset],
                        valid_names=['train', 'val'],
                        feval=lgb_f1_micro,
                        callbacks=[lgb.record_evaluation(evals_result),
                                   wandb_callback()])
    return booster, evals_result

def eval_bagged_model(config, num_bags, train_dataset, val_dataset):
    """
    Train num_bags models that differ only in their seed (work in progress).

    config: mapping
        LightGBM parameters; must contain 'seed'. It is copied into a
        plain dict, so the caller's object is not modified.
    num_bags: int
        Number of models to train; model n uses seed config['seed'] + n.
    train_dataset, val_dataset: lgb.Dataset
        Passed through to train_lgbm_model.

    Returns None for now: the per-bag predictions are not yet accumulated
    into bagged_preds.
    """
    bagged_preds = np.zeros(val_dataset.num_data())
    config = dict(config) # in case you input a wandb config object
    base_seed = config['seed']
    for n in range(num_bags):
        # Fixed: `config['seed'] += n` compounded the offsets (1, 2, 4, 7, ...);
        # every bag now gets base_seed + n.
        config['seed'] = base_seed + n
        booster, evals_result = train_lgbm_model(config, train_dataset,
                                                 val_dataset)
        # TODO: Do I need to predict? Does the callback do it for me automatically?

In [29]:
def get_train_val_datasets(X, y, train_idx, val_idx,
                           feature_name='auto', categorical_feature='auto'):
    """
    Build and construct the LightGBM training and validation Datasets
    for one fold.

    X, y:
        Feature matrix and labels; both are indexed with the index arrays
        (X[train_idx], y[train_idx]), so they are expected to support
        integer-array row indexing (e.g. NumPy arrays).
    train_idx, val_idx:
        Row indices of the training and validation splits.
    feature_name, categorical_feature:
        Forwarded to the training lgb.Dataset. The 'auto' defaults are
        lgb.Dataset's own defaults, so existing four-argument calls
        behave as before.

    Returns (train_dataset, val_dataset), both already constructed so that
    methods such as num_data() can be called on them straight away.
    """
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = lgb.Dataset(X_train, label=y_train,
                                feature_name=feature_name,
                                categorical_feature=categorical_feature,
                                free_raw_data=False)
    # A validation Dataset should reference its training Dataset so both
    # share the same feature binning and feature metadata.
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset,
                              free_raw_data=False)
    train_dataset.construct()
    val_dataset.construct()
    return train_dataset, val_dataset


def train_lgbm_model(config, train_dataset, val_dataset):
    """
    Train one LightGBM booster and record its per-iteration metrics.

    config: dict
        LightGBM parameters, passed to lgb.train as-is.
    train_dataset, val_dataset: lgb.Dataset
        Training and validation data; both are evaluated every
        iteration under the names 'train' and 'val'.

    Returns (booster, evals_result), where evals_result is the dict
    filled in by the record_evaluation callback.
    """
    evals_result = {}
    # Metric history is collected with the record_evaluation callback
    # instead of the deprecated `evals_result=` keyword of lgb.train.
    booster = lgb.train(config,
                        train_dataset,
                        valid_sets=[train_dataset, val_dataset],
                        valid_names=['train', 'val'],
                        feval=lgb_f1_micro,
                        callbacks=[lgb.record_evaluation(evals_result),
                                   wandb_callback()])
    return booster, evals_result

def eval_bagged_model(config, num_bags, train_dataset, val_dataset):
    """
    Train num_bags models that differ only in their seed (work in progress).

    config: mapping
        LightGBM parameters; must contain 'seed'. It is copied into a
        plain dict, so the caller's object is not modified.
    num_bags: int
        Number of models to train; model n uses seed config['seed'] + n.
    train_dataset, val_dataset: lgb.Dataset
        Passed through to train_lgbm_model.

    Returns None for now: the per-bag predictions are not yet accumulated
    into bagged_preds.
    """
    bagged_preds = np.zeros(val_dataset.num_data())
    config = dict(config) # in case you input a wandb config object
    base_seed = config['seed']
    for n in range(num_bags):
        # Fixed: `config['seed'] += n` compounded the offsets (1, 2, 4, 7, ...);
        # every bag now gets base_seed + n.
        config['seed'] = base_seed + n
        booster, evals_result = train_lgbm_model(config, train_dataset,
                                                 val_dataset)
        # TODO: Do I need to predict? Does the callback do it for me automatically?

In [30]:
def get_train_val_datasets(X, y, train_idx, val_idx,
                           feature_name='auto', categorical_feature='auto'):
    """
    Build and construct the LightGBM training and validation Datasets
    for one fold.

    X, y:
        Feature matrix and labels; both are indexed with the index arrays
        (X[train_idx], y[train_idx]), so they are expected to support
        integer-array row indexing (e.g. NumPy arrays).
    train_idx, val_idx:
        Row indices of the training and validation splits.
    feature_name, categorical_feature:
        Forwarded to the training lgb.Dataset. The 'auto' defaults are
        lgb.Dataset's own defaults, so existing four-argument calls
        behave as before.

    Returns (train_dataset, val_dataset), both already constructed so that
    methods such as num_data() can be called on them straight away.
    """
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = lgb.Dataset(X_train, label=y_train,
                                feature_name=feature_name,
                                categorical_feature=categorical_feature,
                                free_raw_data=False)
    # A validation Dataset should reference its training Dataset so both
    # share the same feature binning and feature metadata.
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset,
                              free_raw_data=False)
    train_dataset.construct()
    val_dataset.construct()
    return train_dataset, val_dataset


def train_lgbm_model(config, train_dataset, val_dataset):
    """
    Train one LightGBM booster and record its per-iteration metrics.

    config: dict
        LightGBM parameters, passed to lgb.train as-is.
    train_dataset, val_dataset: lgb.Dataset
        Training and validation data; both are evaluated every
        iteration under the names 'train' and 'val'.

    Returns (booster, evals_result), where evals_result is the dict
    filled in by the record_evaluation callback.
    """
    evals_result = {}
    # Metric history is collected with the record_evaluation callback
    # instead of the deprecated `evals_result=` keyword of lgb.train.
    booster = lgb.train(config,
                        train_dataset,
                        valid_sets=[train_dataset, val_dataset],
                        valid_names=['train', 'val'],
                        feval=lgb_f1_micro,
                        callbacks=[lgb.record_evaluation(evals_result),
                                   wandb_callback()])
    return booster, evals_result

def eval_bagged_model(config, num_bags, train_dataset, val_dataset):
    """
    Train num_bags models that differ only in their seed (work in progress).

    config: mapping
        LightGBM parameters; must contain 'seed'. It is copied into a
        plain dict, so the caller's object is not modified.
    num_bags: int
        Number of models to train; model n uses seed config['seed'] + n.
    train_dataset, val_dataset: lgb.Dataset
        Passed through to train_lgbm_model.

    Returns None for now: the per-bag predictions are not yet accumulated
    into bagged_preds.
    """
    bagged_preds = np.zeros(val_dataset.num_data())
    config = dict(config) # in case you input a wandb config object
    base_seed = config['seed']
    for n in range(num_bags):
        # Fixed: `config['seed'] += n` compounded the offsets (1, 2, 4, 7, ...);
        # every bag now gets base_seed + n.
        config['seed'] = base_seed + n
        booster, evals_result = train_lgbm_model(config, train_dataset,
                                                 val_dataset)
        # TODO: Do I need to predict? Does the callback do it for me automatically?

In [31]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from wandb.lightgbm import wandb_callback

### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DOWNLOAD_DIR = Path('../download')
DATA_DIR = Path('../data')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

##### GOOGLE COLAB ######
# DOWNLOAD_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/download')
# SUBMISSIONS_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/submissions')
# DATA_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/data')
# MODEL_DIR = Path('/content/drive/MyDrive/Work/Delivery/Current/earthquake_damage_competition/model')
########################

# Load the competition CSVs; every frame is indexed by building_id.
# NOTE: this rebinds `y` to the raw label DataFrame. The cross-validation
# cells index `y` with integer arrays, so the label-encoding cell has to be
# re-run after this one before they are executed.
X = pd.read_csv(DOWNLOAD_DIR.joinpath('train_values.csv'), index_col='building_id')
X_test = pd.read_csv(DOWNLOAD_DIR.joinpath('test_values.csv'), index_col='building_id')
y = pd.read_csv(DOWNLOAD_DIR.joinpath('train_labels.csv'), index_col='building_id')

# Column groups derived from the training frame: the object-dtype columns
# and the columns whose names start with 'has'.
categorical_columns = X.select_dtypes(include=['object']).columns
bool_columns = list(filter(lambda name: name.startswith('has'), X.columns))

In [32]:
# LightGBM parameters for this run; they are also logged as the wandb config
param = {'num_leaves': 120,
         'min_child_samples': 40,
         'learning_rate': 0.03,
         'num_boost_round': 40,
         'early_stopping_rounds': 12,
         'boosting_type': 'goss',
         'objective': 'multiclassova',
         'is_unbalance': True,
         'metric': ['multiclassova', 'multi_error'],
         'num_class': 3,
         'verbosity': -1,
         'num_threads': 8,
         'seed': 1}

run = wandb.init(project='earthquake_damage_competition',
                 config=param)

# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Per-fold outputs, keyed by fold index
all_eval_results = {}
all_boosters = {}
# Cross-validation loop
# Fixed: enumerate yields (i, (train_idx, val_idx)), so the index pair has
# to be unpacked as a nested tuple; the flat three-name target raised
# ValueError.
for i, (train_idx, val_idx) in enumerate(skf.split(X_all_ints, y)):
    train_dataset, val_dataset = get_train_val_datasets(X_all_ints, y,
                                                       train_idx, val_idx)
    # Train one model on this fold and keep its booster and metric history
    booster, evals_results = train_lgbm_model(param, train_dataset,
                                             val_dataset)
    all_eval_results[i] = evals_results
    all_boosters[i] = booster

In [33]:
# LightGBM parameters for this run; they are also logged as the wandb config
param = {'num_leaves': 120,
         'min_child_samples': 40,
         'learning_rate': 0.03,
         'num_boost_round': 40,
         'early_stopping_rounds': 12,
         'boosting_type': 'goss',
         'objective': 'multiclassova',
         'is_unbalance': True,
         'metric': ['multiclassova', 'multi_error'],
         'num_class': 3,
         'verbosity': -1,
         'num_threads': 8,
         'seed': 1}

run = wandb.init(project='earthquake_damage_competition',
                 config=param)

# Stratified 5-fold splitter; shuffling uses a fixed random_state
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Per-fold outputs, keyed by fold index (0-based)
all_eval_results = {}
all_boosters = {}
# Cross-validation loop; enumerate supplies the fold index instead of a
# hand-maintained counter
for i, (train_idx, val_idx) in enumerate(skf.split(X_all_ints, y)):
    train_dataset, val_dataset = get_train_val_datasets(X_all_ints, y,
                                                       train_idx, val_idx)
    # Train one model on this fold and keep its booster and metric history
    booster, evals_results = train_lgbm_model(param, train_dataset,
                                             val_dataset)
    all_eval_results[i] = evals_results
    all_boosters[i] = booster