# Using LightGBM as designed (not through sklearn API)

## Automatically Encode Categorical Columns

I've been encoding the geo_level columns as numeric this whole time. Can it perform better by using categorical columns?

LGBM can handle categorical features directly. No need to OHE them. But they must be ints. 

1. Load in X
2. Label Encode all the categorical features
 - All `object` dypes are categorical and need to be LabelEncoded

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb
from pathlib import Path

### USE FOR LOCAL JUPYTER NOTEBOOKS ###
DATA_DIR = Path('../download')
SUBMISSIONS_DIR = Path('../submissions')
MODEL_DIR = Path('../models')
#######################################

X = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
categorical_columns = X.select_dtypes(include='object').columns
bool_columns = [col for col in X.columns if col.startswith('has')]

X_test = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
y = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

In [53]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

t = [('label_encoder', OrdinalEncoder(dtype=int), categorical_columns)]

ct = ColumnTransformer(transformers=t, remainder='passthrough')

In [54]:
X_all_ints = ct.fit_transform(X)

In [13]:
# Note that append for pandas objects works differently to append with
# python objects e.g. python append modifes the list in-place
# pandas append returns a new object, leaving the original unmodified
not_categorical_columns = X.select_dtypes(exclude='object').columns
cols_ordered_after_ordinal_encoding = categorical_columns.append(not_categorical_columns)

In [14]:
cols_ordered_after_ordinal_encoding

Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status', 'geo_level_1_id',
       'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age',
       'area_percentage', 'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_

In [17]:
geo_cols = pd.Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'])
cat_cols_plus_geo = categorical_columns.append(geo_cols)

In [36]:
list(cat_cols_plus_geo)

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status',
 'geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id']

In [59]:
# train_data = lgb.Dataset(X_all_ints, label=y, feature_name=list(cols_ordered_after_ordinal_encoding),
#                         categorical_feature=list(cat_cols_plus_geo))

train_data = lgb.Dataset(X_all_ints, label=y)

In [60]:
validation_data = lgb.Dataset('validation.svm', reference=train_data)

In [65]:
param = {'num_leaves': 120,
#          'num_iterations': 240,
         'min_child_samples': 40,
         'learning_rate': 0.2,
         'boosting_type': 'goss',
         'objective': 'multiclass',
         'num_class': 3}

In [66]:
# LGBM seem to hate using plurals. Why???
num_round = 10
lgb.cv(param, train_data, num_round, nfold=5)

LightGBMError: Label must be in [0, 3), but found 3 in label