In [114]:
import time

import numpy as np
import polars as pl
import pandas as pd

import category_encoders as ce
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import MeanEncoder
from feature_engine.selection import SelectBySingleFeaturePerformance

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesRegressor

import lightgbm as lgb
import shap


In [115]:
# Read the two parquet inputs (modelling table first, then the sbmssn table).
dataset, sbmssn = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        "../data/clean/dataset.parquet",
        "../data/clean/sbmssn.parquet",
    )
)

# (rows, columns) of the modelling table
dataset.shape

(8585, 290)

In [116]:
# Convert the polars frames to pandas:
#   dataset_y - the 'target' column of `dataset`
#   dataset_x - every other column of `dataset`
#   newdata   - `sbmssn` without its 'child_id' column
dataset_y = dataset['target'].to_pandas()
dataset_x = dataset.drop('target').to_pandas()
newdata = sbmssn.drop(['child_id']).to_pandas()

In [117]:
# List the feature columns that are stored with the pandas "category" dtype.
dataset_x.select_dtypes(include="category").columns

Index(['child_grant', 'child_years_in_programme', 'child_observe_attentive',
       'child_observe_concentrated', 'child_observe_diligent',
       'child_observe_interested', 'child_gender', 'child_stunted',
       'child_age_group', 'id_mn_best',
       ...
       'phase_natemis', 'language_child', 'language_assessment',
       'facility_type', 'sef_ind', 'language_match', 'elp_ind', 'pre_covid',
       'quintile_used', 'ses_cat'],
      dtype='object', length=192)

In [118]:
def high_cardinality_features(df, cardinality=5):
    """
    Find the categorical columns of `df` that have many distinct values.

    Only columns with the pandas "category" dtype are inspected. A column is
    reported when its number of unique values is strictly greater than
    `cardinality`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    cardinality : int, default 5
        Threshold; columns with more unique values than this are returned.

    Returns
    -------
    list
        Matching column names, in the order they appear in `df`.
    """
    selected = []
    for name in df.select_dtypes(include="category").columns:
        if df[name].nunique() > cardinality:
            selected.append(name)
    return selected

In [119]:
# Three-way split of the data: 75% train / 15% validation / 10% test.
train_ratio, validation_ratio, test_ratio = 0.75, 0.15, 0.10

# Step 1: separate the training rows from everything else (the holdout).
holdout_share = 1 - train_ratio
x_train, x_holdout, y_train, y_holdout = train_test_split(
    dataset_x, dataset_y, test_size=holdout_share, random_state=0
)

# Step 2: divide the holdout into validation and test. The test share is
# expressed relative to the holdout, not to the full dataset.
test_share = test_ratio / (test_ratio + validation_ratio)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=test_share, random_state=0
)

In [120]:
# Partition the feature columns by dtype: "category" columns go into
# `cat_features` (a plain list), all remaining columns into `num_features`
# (kept as a pandas Index).
categorical_columns = dataset_x.select_dtypes(include="category").columns
cat_features = categorical_columns.tolist()
num_features = dataset_x.select_dtypes(exclude="category").columns

In [121]:
# Number of categorical feature columns.
len(cat_features)

192

In [122]:
# Feature selection: each feature is scored on its own with a LightGBM
# regressor (5-fold CV, explained variance) and kept only if it reaches the
# 0.005 threshold.
single_feature_model = lgb.LGBMRegressor(random_state=0)
sfp = SelectBySingleFeaturePerformance(
    estimator=single_feature_model,
    scoring='explained_variance',
    threshold=0.005,
    cv=5,
)

# The selector is fitted on the training split only; validation and test are
# merely reduced to the same columns.
x_train = sfp.fit_transform(x_train, y_train)
x_val, x_test = (sfp.transform(split) for split in (x_val, x_test))

In [125]:
# Show the dtype of every column in the training frame.
x_train.dtypes

data_year                           float64
child_date                          float64
child_age                           float64
child_enrolment_date                float64
child_months_enrolment              float64
                                     ...   
child_years_in_programme_ordinal    float64
pra_free_play_ordinal               float64
pra_free_play_outdoor_ordinal       float64
pra_engaged_ordinal                 float64
pri_attendance_ordinal              float64
Length: 287, dtype: object

In [126]:
# Wrap the train and validation splits in LightGBM Dataset objects; the
# validation set is built with the training set as its reference.
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_val = lgb.Dataset(x_val, label=y_val, reference=lgb_train)

In [128]:
# Baseline LightGBM regression model, monitored on the validation set.
params = {
    # task definition
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    # tree growth
    'learning_rate': 0.005,
    'num_leaves': 32,
    # no column / row subsampling
    'feature_fraction': 1,
    'bagging_fraction': 1,
    # categorical-split regularisation
    'cat_l2': 10,
    'cat_smooth': 10,
    # logging and reproducibility
    'verbosity': -1,
    'seed': 0
}

# Stop once the validation metric has not improved for 5 consecutive rounds.
early_stop = lgb.early_stopping(stopping_rounds=5, verbose=False)

bst = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_val],
    callbacks=[early_stop],
)

In [129]:
# Ask the booster for its current iteration number; because training uses an
# early-stopping callback this can be lower than the requested num_boost_round.
bst.current_iteration()

1381

In [130]:
# Score the held-out test split: predict with the trained booster and report
# the root-mean-squared error.
y_test_pred = bst.predict(x_test)

# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6, so the square root is taken explicitly; this works on
# every scikit-learn version. Arguments follow the documented
# (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, y_test_pred))

9.50782344212471

In [None]:
# explain final model
explainer = shap.TreeExplainer(bst)

# The booster was trained on the columns kept by `sfp`, so the frame handed
# to the explainer must go through the same selector first; passing the raw
# `dataset_x` would give the model a different set of columns than it was
# fitted on.
shap_values = explainer.shap_values(sfp.transform(dataset_x))

In [None]:
# Build the submission file: one row per child with the predicted target and
# the names of the TOP_K features with the largest SHAP values for that row.
TOP_K = 15

# NOTE(review): `df_to_predict` and `sbmssn_pred` were not defined anywhere in
# the visible notebook. They are reconstructed here from `newdata` using the
# same feature selector and booster as the training data - confirm this
# matches the intended pipeline.
df_to_predict = sfp.transform(newdata)
sbmssn_pred = bst.predict(df_to_predict)

# One SHAP vector per row of the submission frame.
shap_values = explainer.shap_values(df_to_predict)

# ft['feature_k'][i] is the name of the feature with the k-th largest SHAP
# value for row i.
ft = {f'feature_{rank}': [] for rank in range(1, TOP_K + 1)}

for sv in shap_values:
    # argsort is ascending, so reverse it and keep the first TOP_K indices
    top_idx = np.argsort(sv)[::-1][:TOP_K]
    for rank, col_idx in enumerate(top_idx, start=1):
        ft[f'feature_{rank}'].append(df_to_predict.columns[col_idx])

# Assemble child_id | target | feature_1 ... feature_15
final_sub = sbmssn.select('child_id')
final_sub = final_sub.with_columns(pl.Series(sbmssn_pred).alias('target'))

ft = pl.DataFrame(ft)
final_sub = pl.concat([final_sub, ft], how='horizontal')

# A timestamped file name keeps earlier submissions from being overwritten.
fname = '../submission/' + time.strftime("%Y%m%d-%H%M%S") + '.csv'
final_sub.write_csv(fname)