In [77]:
import time

import numpy as np
import pandas as pd
import polars as pl

import category_encoders as ce
from feature_engine.encoding import MeanEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import MeanMedianImputer

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import lightgbm as lgb
import shap


In [67]:
# Read the cleaned modelling table and the submission table from parquet.
dataset, sbmssn = (
    pl.read_parquet("../data/clean/dataset.parquet"),
    pl.read_parquet("../data/clean/sbmssn.parquet"),
)

# Cell output: (rows, columns) of the modelling table.
dataset.shape

(8585, 747)

In [68]:
# Move from polars to pandas: the target series, the feature frame without the
# target, and the submission rows without their identifier column.
dataset_y = dataset.get_column('target').to_pandas()
dataset_x = dataset.drop(['target']).to_pandas()
newdata = sbmssn.drop('child_id').to_pandas()

In [69]:
# Show which feature columns carry the pandas "category" dtype.
dataset_x.select_dtypes(include=["category"]).columns

Index(['child_grant', 'child_years_in_programme', 'child_observe_attentive',
       'child_observe_concentrated', 'child_observe_diligent',
       'child_observe_interested', 'child_gender', 'child_stunted',
       'child_attends', 'child_age_group',
       ...
       'pri_food_donor_2', 'pri_food_donor_3', 'pri_food_donor_4',
       'pri_food_donor_97', 'health_1', 'health_2', 'health_3', 'health_4',
       'health_5', 'health_97'],
      dtype='object', length=567)

In [70]:
def high_cardinality_features(df, cardinality=5):
    """
    Collect the names of "category"-dtype columns with many distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only columns whose dtype is "category" are considered.
    cardinality : int, default 5
        Threshold; a column is reported when its ``nunique()`` is strictly greater.

    Returns
    -------
    list
        Matching column names, in the order they appear in ``df``.
    """
    selected = []
    for name in df.select_dtypes(include="category").columns:
        # nunique() is compared with a strict ">" so a column sitting exactly
        # at the threshold is not reported.
        if df[name].nunique() > cardinality:
            selected.append(name)
    return selected

In [71]:
# Target proportions of the three partitions (they sum to 1).
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Step 1: keep `train_ratio` of the rows for training and set the rest aside.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    dataset_x,
    dataset_y,
    test_size=1 - train_ratio,
    random_state=0,
)

# Step 2: cut the held-out rows into validation and test. The test share is
# expressed relative to the held-out part, not to the full data set.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=test_ratio/(test_ratio + validation_ratio),
    random_state=0,
)

In [72]:
# Define the categorical features
num_features = dataset_x.select_dtypes(exclude="category").columns
cat_features = list(dataset_x.select_dtypes(include="category").columns.values)

In [73]:
# Preprocessing transformers. Each one is fitted on the training split only and
# then re-used unchanged on the validation and test splits.
mmi = MeanMedianImputer(imputation_method='median')
ci = CategoricalImputer(variables=cat_features)
me = MeanEncoder(unseen='encode')

# 1) Median imputation (no variable list is passed, so the imputer chooses the
#    columns itself).
x_train = mmi.fit_transform(x_train)
x_val, x_test = mmi.transform(x_val), mmi.transform(x_test)

# 2) Imputation of the categorical columns listed in `cat_features`.
x_train = ci.fit_transform(x_train)
x_val, x_test = ci.transform(x_val), ci.transform(x_test)

# 3) Mean (target) encoding; the target is only supplied when fitting.
x_train = me.fit_transform(x_train, y_train)
x_val, x_test = me.transform(x_val), me.transform(x_test)

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc

In [74]:
# Model-based feature selection: an extra-trees regressor is fitted on the
# training split and SelectFromModel uses it to decide which columns to keep.
fsmethod = ExtraTreesRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    random_state=0,
)
fselect = SelectFromModel(fsmethod)

# Fit on train only, then apply the same column selection to the other splits.
x_train = fselect.fit_transform(x_train, y_train)
x_val, x_test = fselect.transform(x_val), fselect.transform(x_test)

In [78]:
# Wrap the splits in LightGBM datasets; the validation set points back to the
# training set through `reference`.
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_val = lgb.Dataset(x_val, label=y_val, reference=lgb_train)

In [79]:
# Baseline LightGBM regressor, evaluated on the validation set while training.
params = {
    # task and evaluation metric
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    # tree growth
    'learning_rate': 0.01,
    'num_leaves': 40,
    # fractions of 1: no column or row subsampling
    'feature_fraction': 1,
    'bagging_fraction': 1,
    # categorical-split settings
    'cat_l2': 10,
    'cat_smooth': 10,
    # logging and reproducibility
    'verbosity': -1,
    'seed': 0
}

# Early-stopping callback with a patience of 5 rounds, reporting silenced.
early_stop = lgb.early_stopping(stopping_rounds=5, verbose=False)

bst = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=lgb_val,
    callbacks=[early_stop],
)

In [83]:
# Predict on the held-out test split and report its RMSE.
y_test_pred = bst.predict(x_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`:
# that keyword is deprecated in recent scikit-learn releases, while this form
# works on every version. Arguments follow the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, y_test_pred))

11.315196621667116

In [None]:
# explain final model
explainer = shap.TreeExplainer(bst)

# `bst` was trained on the imputed, mean-encoded and feature-selected matrix,
# so the raw feature frame must pass through the same fitted transformers (in
# the order they were fitted) before SHAP values can be computed for it.
dataset_x_model = fselect.transform(
    me.transform(ci.transform(mmi.transform(dataset_x)))
)
shap_values = explainer.shap_values(dataset_x_model)

In [None]:
# Build the submission file: one prediction per child plus the names of the
# top-ranked SHAP features for that child.
TOP_K = 15  # number of feature_<rank> columns written to the submission

# Prepare the submission rows with the transformers fitted on the training
# split, in the order they were fitted (impute -> encode -> select).
# NOTE(review): assumes `newdata` has the same columns as `dataset_x` — confirm.
newdata_encoded = me.transform(ci.transform(mmi.transform(newdata)))
# Select columns with the selector's boolean mask so the result stays a
# DataFrame and the surviving column names are available for the report.
df_to_predict = newdata_encoded.loc[:, fselect.get_support()]

# The booster was trained on a plain array, so it is given plain arrays here.
sbmssn_pred = bst.predict(df_to_predict.to_numpy())
shap_values = explainer.shap_values(df_to_predict.to_numpy())

# One list per rank: ft['feature_1'][i] is the top-ranked feature of row i.
ft = {f'feature_{rank}': [] for rank in range(1, TOP_K + 1)}

for sv in shap_values:
    # Descending by signed SHAP value, i.e. the largest positive contributions
    # come first. NOTE(review): switch to np.abs(sv) if magnitude is wanted.
    arr = np.argsort(sv)[::-1][:TOP_K]
    for ind, a in enumerate(arr):
        place = f'feature_{ind+1}'
        ft[place].append(df_to_predict.columns[a])

final_sub = sbmssn.select('child_id')
final_sub = final_sub.with_columns(pl.Series(sbmssn_pred).alias('target'))

ft = pl.DataFrame(ft)
final_sub = pl.concat([final_sub, ft], how='horizontal')

# Timestamped file name so successive submissions do not overwrite each other.
fname = '../submission/' + time.strftime("%Y%m%d-%H%M%S") + '.csv'
final_sub.write_csv(fname)