In [1]:
import numpy as np
import pandas as pd  
from pandas.plotting import scatter_matrix
import seaborn as sns 
from seaborn import load_dataset
import statsmodels.api as sm
import matplotlib.pyplot as plt

#load imbalanced-learn library
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

#Import model library
import lightgbm as lgb

#Load sklearn libraries
from sklearn import metrics
from sklearn.metrics import get_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import  StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

#Load my custom transformers
from modules.preprocess import Drop_Columns
from modules.preprocess import InteractionsTransformer
from modules.preprocess import OrderFeatures
from modules.preprocess import Impute_Missing
from modules.preprocess import Predict_Missing
from modules.preprocess import drop_rows


# Column names for the two data files; passed as `names=` below, so every row
# of the files (including the first) is read as data.
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'educational num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital gain',
    'capital loss',
    'hours per week',
    'country',
    'income',
]

#Load the dataset
train = pd.read_csv('data/adult.data', sep=",", names=columns)
test = pd.read_csv('data/adult.test', sep=",", names=columns)

#Drop missing (rows whose workclass/occupation/country equals ' ?')
train = drop_rows(train, ['workclass', 'occupation', 'country'], drop_val =' ?')
test = drop_rows(test, ['workclass', 'occupation', 'country'], drop_val =' ?')

#preprocess: strip surrounding spaces and dots so the labels of both files
#match the keys of the mapping below
train['income'] = train['income'].str.strip(' .')
test['income'] = test['income'].str.strip(' .')

#Define and encode labels
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object,
# triggers a pandas warning, and does not update the DataFrame under
# Copy-on-Write.
label_codes = {'<=50K': 0, '>50K': 1}
train['income'] = train['income'].replace(label_codes)
test['income'] = test['income'].replace(label_codes)
y_train=train['income']
y_test=test['income']

#Define features and labels, the column education is already covered with the ordinal feature education.num
X_train = train.drop(['income'], axis='columns')
X_test = test.drop(['income'], axis='columns')

#Define model
model = lgb.LGBMClassifier(random_state=0)#class_weight={1:3, 0:1}

# Scale the numeric columns, ordinal-encode gender and one-hot encode the
# remaining categorical columns; any other column is passed through untouched.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2 - switch when the environment is upgraded.
transformer = make_column_transformer(
    (StandardScaler(), ['age', 'educational num', 'capital gain', 'capital loss', 'hours per week']), #dont forget fnlwgt if in dataset!
    (OrdinalEncoder(), ['gender']),
    (OneHotEncoder(handle_unknown='ignore', sparse=False), ['workclass', 'marital', 'occupation', 'relationship']),
    remainder='passthrough')


#Compose pipeline
# The commented-out steps are kept on purpose: they are the switches used to
# reproduce the experiments described in the results table of this notebook.
pipe = imbpipeline(steps = [
        ['orderfeatures', OrderFeatures()],
        ['drop_columns', Drop_Columns(['education', 'fnlwgt', 'race', 'country'])],
        #('impute_missing', Impute_Missing(missing_value=' ?', pred_columns=['workclass', 'occupation', 'country'], strategy = 'most_frequent' , label_missing=False)),
        #['predict_missing', Predict_Missing(pred_columns=['workclass', 'occupation', 'country'], print_cross_val_score=False, label_missing = False)],
        ['transformer', transformer],
        #['interactions', InteractionsTransformer(use_cache=False)],  
        ['smote', SMOTE(random_state=0)],
        ['model', model]
]
)

#Calculate scores
skf = StratifiedKFold(n_splits=5, random_state=0,  shuffle=True)

#Apply pipeline on data
pipe.fit(X_train, y_train)
y_test_predicted = pipe.predict(X_test)
y_train_predicted = pipe.predict(X_train)

# Use the public metrics.f1_score rather than the private
# get_scorer('f1')._score_func attribute; it is the same function and is
# called with the same default arguments.
print("Cross val train f1:", cross_val_score(pipe, X_train, y_train, cv=skf, scoring='f1').mean())
print("Overall train f1: ", metrics.f1_score(y_train, y_train_predicted))
print("Overall test f1: ", metrics.f1_score(y_test, y_test_predicted))

Cross val train f1: 0.7234510027260497
Overall train f1:  0.7448500406987666
Overall test f1:  0.7214786488209051


# Optimization of lightgbm model

Here, I briefly discuss the approaches I tried to improve the f1 score of the lightgbm model. To reproduce the results, you just have to uncomment the corresponding transformers and functions in the cell above. For each step, I describe which transformers were used.

Note: While optimizing the model, only the cross-validation score on the training set was considered. The score on the test set was calculated at the very end of the project!

<table>
<thead>
  <tr>
    <th></th>
    <th>How?</th>
    <th>Results</th>
    <th>Conclusion</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td>Base model</td>
    <td>Model<br>lgb.LGBMClassifier(random_state=0)<br><br>Transformers<br>(StandardScaler(), ['age', ..]),<br>(OrdinalEncoder(), ['gender']),<br>(OneHotEncoder(handle_unknown='ignore', sparse=False), ['workclass', ..]  <br>OrderFeatures()<br>Drop_Columns(['education', 'fnlwgt'])<br><br><br>(have a look at the code above to see all parameters of the transformers)</td>
    <td><br><b>Cross val train f1: 0.71776</b><br>(Overall train f1:  0.74101)<br>(Overall test f1:  0.71205)</td>
    <td>I already dropped the column education in the base model because it is redundant.<br>Education_num contains the same information but is in an ordered numeric format.<br><br>Additionally, I dropped the column fnlwgt because it does not contain <br>relevant information for improving the f1 score. <br>However, one can think about weighting the observations with the fnlwgt values. <br>I don't do this here.<br>The goal of this ML project is just to train a model with a high f1 score.</td>
  </tr>
  <tr>
    <td>Drop missing values</td>
    <td>I wrote a function to do this and applied it to both data sets:<br><br>drop_rows(train, ['workclass', 'occupation', 'country'], drop_val =' ?')<br>drop_rows(test,  ['workclass', 'occupation', 'country'], drop_val =' ?')</td>
    <td><b>Cross val train f1: 0.71884</b><br>(Overall train f1:  0.745140)<br>(Overall test f1:  0.71326)<br></td>
    <td>Better score when dropping rows with missing values.</td>
  </tr>
  <tr>
    <td>Impute missing</td>
    <td>I created a custom transformer to do this:<br><br>Impute_Missing(... strategy = 'most_frequent' , label_missing = False)<br><br>Of course the fitting is only done on the training data! We never wanna have data leakage!</td>
    <td><b>Cross val train f1: 0.71569</b><br>(Overall train f1:  0.73912)<br>(Overall test f1:  0.70599)<br><br></td>
    <td>Imputing with most frequent value is worse than dropping rows.</td>
  </tr>
  <tr>
    <td>Impute missing and <br>label imputed features</td>
    <td>I created a custom transformer to do this:<br><br>Impute_Missing(... strategy = 'most_frequent' , label_missing = True)<br><br>Of course the fitting is only done on the training data! We never wanna have data leakage!</td>
    <td><b>Cross val train f1: 0.71570</b><br>(Overall train f1:  0.73902)<br>(Overall test f1:  0.709732)<br></td>
    <td>Adding label to imputed values helps but score is still worse.</td>
  </tr>
  <tr>
    <td>Predict missing values<br>by ML</td>
    <td>I created a custom transformer to do this. Of course I fitted the model only to the training data!  <br>We never want data leakage!<br><br>Predict_Missing(pred_columns=['workclass', 'occupation', 'country'], .., label_missing = False)<br><br>Of course the fitting is only done on the training data! We never wanna have data leakage!</td>
    <td><b>Cross val train f1: 0.71552</b><br>(Overall train f1:  0.73953)<br>(Overall test f1:  0.70722)<br></td>
    <td>Predicting values with ML is worse than dropping rows.</td>
  </tr>
  <tr>
    <td>Predict missing values<br>by ML and label <br>predicted values</td>
    <td>I created a custom transformer to do this:<br><br>Predict_Missing(pred_columns=['workclass', 'occupation', 'country'], .., label_missing = True)<br><br>Of course the fitting is only done on the training data! We never wanna have data leakage!</td>
    <td><b>Cross val train f1: 0.71795</b><br>(Overall train f1:  0.74126)<br>(Overall test f1:  0.70733)<br></td>
    <td>Adding label to predicted values helps but score is still worse.<br><br>=&gt; we will drop rows with missing values for subsequent steps!</td>
  </tr>
  <tr>
    <td>Oversampling <br>with class weights</td>
    <td>lgb.LGBMClassifier(.., class_weight={1:3, 0:1})<br>(see code above for all parameters)</td>
    <td><b>Cross val train f1: 0.72066</b><br>(Overall train f1:  0.74377)<br>(Overall test f1:  0.71726)<br></td>
    <td>Using&nbsp;&nbsp;class weights significantly improves the score.<br><br><br></td>
  </tr>
  <tr>
    <td>Oversampling <br>with SMOTE</td>
    <td>SMOTE(random_state=0)<br><br>Of course the oversampling is only done on the training data and within the CV steps! <br>We never wanna have data leakage!<br>For this purpose I used imbpipeline <br>from <a href="https://imbalanced-learn.org/stable/">imbalanced-learn</a><br></td>
    <td><b>Cross val train f1: 0.72175</b><br>(Overall train f1:  0.74162)<br>(Overall test f1:  0.71813)<br><br><br></td>
    <td>Oversampling the minor class significantly improves the score.<br>SMOTE leads to a slightly better score than the class weights.<br>I will thus use SMOTE to handle the imbalanced targets for the next steps.<br></td>
  </tr>
  <tr>
    <td>Drop columns with low <br>permutation importance</td>
    <td>According to the calculated feature importance <br>race and country contain the least relevant information.<br>(Note: other columns were tested as well, but only the last two are documented here)<br><br>Drop_Columns(['education', 'fnlwgt', 'race', 'country'])</td>
    <td>Drop Country:<br><b>Cross val train f1: 0.72477</b><br>Overall train f1:  0.74273<br>Overall test f1:  0.71930<br><br><br>Drop Race:<br><b>Cross val train f1: 0.72335</b><br>Overall train f1:  0.74155<br>Overall test f1:  0.71976<br><br>Drop Country and Race<br><b>Cross val train f1: 0.72345</b><br>Overall train f1:  0.74485<br>Overall test f1:  0.72147<br><br></td>
    <td>Race and Country were determined to have the lowest feature importance <br>(see <a href="Feature_Importance.ipynb">Feature_Importance.ipynb</a>).<br><br>When dropping these features I observed a better f1 score. <br><br><br>Thus, I excluded only these features in the subsequent steps.<br><br>Note:<br>I also tested dropping other features with low permutation importance <br>but with the other ones I could not increase the f1 score <br>(fnlwgt and education were already excluded in the base model!).<br></td>
  </tr>
  <tr>
    <td>Add interactions</td>
    <td>I made a custom transformer to do this. Of course I fitted the model only to the training data!  <br>We never want data leakage!<br><br>InteractionsTransformer()<br>Of course the fitting is only done on the training data! We never wanna have data leakage!</td>
    <td><b>Cross val train f1: 0.72691</b><br>(Overall train f1:  0.75141)<br>(Overall test f1:  0.72104)<br></td>
    <td>Decision tree based models can learn interactions themselves through recursive splitting. <br>However, I still could increase the f1 score by creating manual features. <br>I will not use it for subsequent steps. </td>
  </tr>
  <tr>
    <td>Hyperparameter tuning</td>
    <td>Because not all penalties work with all solvers I had to do several rounds of grid search.<br><br>Furthermore, it's better to start with a larger parameter space.<br>In this way I can then narrow down to the optimal values.<br><br>Round 1<br>'model__num_leaves':[10,31, 100],<br>'model__min_child_samples':[5, 20, 40],<br>'model__max_depth':[2, 10, 100],<br>'model__learning_rate':[0.05, 0.1, 0.4],  <br>'model__reg_alpha':[0, 0.01, 0.1],<br>'model__reg_lambda': [0, 0.01, 0.1]<br><br><br>Round 2<br>'model__num_leaves':[80,100, 120],<br>'model__min_child_samples':[10, 15, 20],<br>'model__max_depth':[2, 5, 7],<br>'model__learning_rate':[0.2, 0.3, 0.4],  <br>'model__reg_alpha':[0.02, 0.03, 0.04],<br>'model__reg_lambda': [0, 0.05, 0.1]<br></td>
    <td><b>Cross val train f1: 0.72795</b><br>(Overall train f1:  0.75872)<br>(Overall test f1:  0.71947)</td>
    <td>By tuning the hyperparameters I could again slightly improve the score.<br><br><br>Note: Due to limited computational power and time I could not check too many parameters.</td>
  </tr>
</tbody>
</table>

# Hyperparameter tuning

In [2]:
# Targets: the 'income' column of the frames prepared in the first cell.
y_train = train['income']
y_test = test['income']

# Features: every column except the target. 'education' is still present here
# and is removed inside the pipeline (it duplicates 'educational num').
X_train = train.drop(columns=['income'])
X_test = test.drop(columns=['income'])

# Untuned estimator; the grid below overrides its hyperparameters per candidate.
model = lgb.LGBMClassifier(random_state=0)  # class_weight={1:3, 0:1}

# Column groups handled by the column transformer.
scaled_columns = ['age', 'educational num', 'capital gain', 'capital loss', 'hours per week']  # dont forget fnlwgt if in dataset!
ordinal_columns = ['gender']
one_hot_columns = ['workclass', 'marital', 'occupation', 'relationship']

transformer = make_column_transformer(
    (StandardScaler(), scaled_columns),
    (OrdinalEncoder(), ordinal_columns),
    (OneHotEncoder(handle_unknown='ignore', sparse=False), one_hot_columns),
    remainder='passthrough',
)

# Pipeline steps in execution order. Commented-out entries are the optional
# steps that can be switched back on to repeat the earlier experiments.
pipeline_steps = [
    ['orderfeatures', OrderFeatures()],
    ['drop_columns', Drop_Columns(['education', 'fnlwgt', 'race', 'country'])],
    #('impute_missing', Impute_Missing(missing_value=' ?', pred_columns=['workclass', 'occupation', 'country'], strategy = 'most_frequent' , label_missing=False)),
    #['predict_missing', Predict_Missing(pred_columns=['workclass', 'occupation', 'country'], print_cross_val_score=False, label_missing = False)],
    ['transformer', transformer],
    #['interactions', InteractionsTransformer(use_cache=False)],
    ['smote', SMOTE(random_state=0)],
    ['model', model],
]
pipe = imbpipeline(steps=pipeline_steps)

# Two stratified, shuffled folds are used for the search.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

# Search space; keys address parameters of the pipeline's 'model' step.
param_grid = {
    'model__num_leaves': [80, 100, 120],
    'model__min_child_samples': [10, 15, 20],
    'model__max_depth': [2, 5, 7],
    'model__learning_rate': [0.2, 0.3, 0.4],  # 10**(np.linspace(-2, 0, 3))
    'model__reg_alpha': [0.02, 0.03, 0.04],
    'model__reg_lambda': [0, 0.05, 0.1],
    #'model__subsample': [0.8, 1],
    #'model__colsample_bytree': [1, 0.9]
    #'model__subsample': [0.8]
}

# `pipe` is re-bound to the search object; with refit=True the best candidate
# is refitted on the full training data, so `pipe.predict` uses that model.
pipe = GridSearchCV(pipe, param_grid=param_grid, cv=skf, refit=True, scoring='f1')

# Run the search, then predict with the refitted best pipeline.
pipe.fit(X_train, y_train)
y_test_predicted = pipe.predict(X_test)
y_train_predicted = pipe.predict(X_train)

# Report the best mean cross-validated f1, its parameters and the fitted pipeline.
print(pipe.best_score_)
print(pipe.best_params_)
print(pipe.best_estimator_)



0.7212035252940138
{'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__min_child_samples': 10, 'model__num_leaves': 70, 'model__reg_alpha': 0.03, 'model__reg_lambda': 0.02}
Pipeline(steps=[('orderfeatures', OrderFeatures()),
                ('drop_columns',
                 Drop_Columns(columns=['education', 'fnlwgt', 'race',
                                       'country'])),
                ('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'educational num',
                                                   'capital gain',
                                                   'capital loss',
                                                   'hours per week']),
                                                 ('ordinalencoder',
                                

# Final model

In [2]:
# Targets: the 'income' column of the frames prepared in the first cell.
y_train=train['income']
y_test=test['income']

#Define features and labels, the column education is already covered with the ordinal feature education.num
X_train = train.drop(['income'], axis='columns')
X_test = test.drop(['income'], axis='columns')

#Define model (hyperparameters fixed to the values chosen for the final model)
model = lgb.LGBMClassifier(random_state=0, learning_rate = 0.3, max_depth = 5, min_child_samples= 15, num_leaves= 80, reg_alpha= 0.03, reg_lambda = 0)

# Scale the numeric columns, ordinal-encode gender and one-hot encode the
# remaining categorical columns; any other column is passed through untouched.
transformer = make_column_transformer(
    (StandardScaler(), ['age', 'educational num', 'capital gain', 'capital loss', 'hours per week']), #dont forget fnlwgt if in dataset!
    (OrdinalEncoder(), ['gender']),
    (OneHotEncoder(handle_unknown='ignore', sparse=False), ['workclass', 'marital', 'occupation', 'relationship']),
    remainder='passthrough')


#Compose pipeline
# The commented-out steps are optional experiment switches and are kept on purpose.
pipe = imbpipeline(steps = [
        ['orderfeatures', OrderFeatures()],
        ['drop_columns', Drop_Columns(['education', 'fnlwgt', 'race', 'country'])],
        #('impute_missing', Impute_Missing(missing_value=' ?', pred_columns=['workclass', 'occupation', 'country'], strategy = 'most_frequent' , label_missing=False)),
        #['predict_missing', Predict_Missing(pred_columns=['workclass', 'occupation', 'country'], print_cross_val_score=False, label_missing = False)],
        ['transformer', transformer],
        #['interactions', InteractionsTransformer(use_cache=False)],  
        ['smote', SMOTE(random_state=0)],
        ['model', model]
]
)

#Calculate scores
skf = StratifiedKFold(n_splits=5, random_state=0,  shuffle=True)

#Apply pipeline on data
pipe.fit(X_train, y_train)
y_test_predicted = pipe.predict(X_test)
y_train_predicted = pipe.predict(X_train)

# Use the public metrics.f1_score rather than the private
# get_scorer('f1')._score_func attribute; it is the same function and is
# called with the same default arguments.
print("Cross val train f1:", cross_val_score(pipe, X_train, y_train, cv=skf, scoring='f1').mean())
print("Overall train f1: ", metrics.f1_score(y_train, y_train_predicted))
print("Overall test f1: ", metrics.f1_score(y_test, y_test_predicted))

Cross val train f1: 0.7279547575038213
Overall train f1:  0.7587253414264036
Overall test f1:  0.7194761843625627
