In [22]:
import os
import sys

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

if not os.path.abspath(os.pardir) in sys.path:
    sys.path.append(os.path.abspath(os.pardir))

from src.data_reader import read_data

In [33]:
# Directories, relative to the notebook location. They are joined to file
# names by plain string concatenation, so the trailing slash is required.
DATA_PATH = '../data/'
MODELS_PATH = '../models/'

# Input files read from DATA_PATH and the experiment log written to MODELS_PATH.
TRAIN_FNAME = 'census_income_learn.csv'
TEST_FNAME = 'census_income_test.csv'
METADATA_FNAME = 'census_income_metadata.txt'
MODELS_LOG_FNAME = 'experiments_log.csv'

# Hex colour palette (not referenced in the cells visible here).
COLORS = ['#4DC9C3', '#221C35', '#FCCD20', '#20C3EF', '#00B257', '#FF7700']

# Name of the label column; it is excluded from the feature list and label-encoded.
TARGET_NAME = 'income'

# Passed as n_jobs to the scikit-learn estimators that accept it.
N_JOBS = 6
# When True the CatBoost models are created with task_type='GPU', otherwise 'CPU'.
GPU = True

In [26]:
class FeaturesProcessor():
    """Encode the categorical columns of a DataFrame and label-encode its target.

    Encoders are fitted lazily, one per column, the first time a column is
    seen, and are kept in ``self.encoders`` so that later frames (validation,
    test) are transformed with exactly the same mapping.

    Parameters
    ----------
    cat_encoding : str
        ``'OHE'`` for one-hot encoding or ``'LE'`` for label encoding of the
        categorical feature columns. Any other value falls back to ``'OHE'``.
    encoders : dict or None
        Optional mapping ``column name -> fitted encoder`` to reuse. When
        ``None`` a new mapping is created on the first transform.
    """

    def __init__(self, cat_encoding = 'OHE', encoders = None):

        self.cat_encoding = cat_encoding
        self.encoders = encoders


    def _make_encoder(self, df, col, mode = None):
        """Fit an encoder for ``df[col]`` and store it in ``self.encoders``.

        ``mode`` is the already validated encoding mode; when omitted,
        ``self.cat_encoding`` is used.
        """
        if mode is None:
            mode = self.cat_encoding

        if self.encoders is None:
            self.encoders = {}

        if mode == 'OHE':
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            # OneHotEncoder works on 2-D input.
            encoder.fit(df[[col]])
        else:
            encoder = LabelEncoder()
            # LabelEncoder expects 1-D input; a 2-D frame triggers a
            # DataConversionWarning on every call.
            encoder.fit(df[col])

        self.encoders[col] = encoder

    def transform_cat_features(self, df, target_name = None):
        """Return an encoded copy of ``df`` together with the encoders used.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to encode. Object/string columns are encoded with the
            configured mode, the target column is label-encoded, and every
            other column is copied through unchanged.
        target_name : str or None
            Name of the target column. Defaults to the notebook-level
            ``TARGET_NAME`` constant.

        Returns
        -------
        (pandas.DataFrame, dict)
            The encoded frame (same index as ``df``, columns in the order of
            ``df.columns``) and the ``column name -> encoder`` mapping.
        """
        if target_name is None:
            target_name = TARGET_NAME

        mode = self.cat_encoding

        if mode not in ['OHE', 'LE']:
            print(f'Unsupported mode {mode} defaulting to One Hot Encoding')
            mode = 'OHE'

        if self.encoders is None:
            print('Encoders not found, creating new encoders')
            self.encoders = {}
        else:
            print('Encoders were found, using existing encoders')

        # Collect the encoded pieces and concatenate once at the end instead
        # of growing a DataFrame inside the loop.
        pieces = []

        for col in df.columns:
            column = df[col]

            if col == target_name:
                # Fit the target encoder only once so that train and test
                # labels always share the same class -> integer mapping.
                if col not in self.encoders:
                    encoder = LabelEncoder()
                    encoder.fit(column)
                    self.encoders[col] = encoder

                encoded_target = self.encoders[col].transform(column)
                pieces.append(pd.Series(encoded_target, index=df.index, name=col))

            elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                # Decide from the frame being transformed, not from a global.
                if col not in self.encoders:
                    self._make_encoder(df, col, mode)

                encoder = self.encoders[col]

                if mode == 'OHE':
                    encoded_features = encoder.transform(df[[col]])
                    feat_names = [f"{col}_{cat}" for cat in encoder.categories_[0]]
                else:
                    encoded_features = encoder.transform(column)
                    feat_names = [col]

                # Keep df's index so the pieces align row by row.
                pieces.append(pd.DataFrame(encoded_features, columns=feat_names, index=df.index))

            else:
                # Numerical feature: passed through untouched.
                pieces.append(column)

        if pieces:
            encoded_df = pd.concat(pieces, axis=1)
        else:
            encoded_df = pd.DataFrame(index=df.index)

        return encoded_df, self.encoders

In [27]:
# Both splits are described by the same metadata file.
train_df = read_data(f'{DATA_PATH}{TRAIN_FNAME}', f'{DATA_PATH}{METADATA_FNAME}')
test_df = read_data(f'{DATA_PATH}{TEST_FNAME}', f'{DATA_PATH}{METADATA_FNAME}')
test_df.shape, train_df.shape

Read input data file ../data/census_income_learn.csv


  df = df.applymap(lambda x:x.lower().strip() if (isinstance(x, str)) else (x))


Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns


  df = df.applymap(lambda x:np.nan if (isinstance(x, str) and (('not in universe' in x) or ('?' in x) or (x == 'na')


Read input data file ../data/census_income_test.csv


  df = df.applymap(lambda x:x.lower().strip() if (isinstance(x, str)) else (x))


Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns


  df = df.applymap(lambda x:np.nan if (isinstance(x, str) and (('not in universe' in x) or ('?' in x) or (x == 'na')


((99762, 42), (199523, 42))

In [28]:
fprocessor = FeaturesProcessor(cat_encoding='OHE')

# The train frame is transformed first, so the categorical feature encoders
# are created from it and then reused when the test frame is transformed.
encoded_train_df, encoders = fprocessor.transform_cat_features(df=train_df)
encoded_test_df, encoders = fprocessor.transform_cat_features(df=test_df)

# Hold out 10% of the encoded training rows as a validation set.
encoded_train_df, encoded_val_df = train_test_split(
    encoded_train_df,
    test_size=0.1,
    random_state=42,
)


encoded_train_df.shape, encoded_val_df.shape, encoded_test_df.shape

Encoders not found, creating new encoders


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Encoders were found, using existing encoders


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


((179570, 405), (19953, 405), (99762, 405))

In [52]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(y_true, y_pred):
    """Score predictions against the ground-truth labels.

    Both arguments are forwarded unchanged, in (y_true, y_pred) order, to the
    scikit-learn metric functions with their default settings.

    Returns a list ``[f1, precision, recall, confusion_matrix_as_str]``.
    """
    scorers = (f1_score, precision_score, recall_score)
    results = [scorer(y_true, y_pred) for scorer in scorers]

    # The confusion matrix is stringified so it fits in a single log cell.
    results.append(str(confusion_matrix(y_true, y_pred)))

    return results


def get_model_string(model):
    """Render a model as ``ClassName(param1=value1,param2=value2,...)``.

    The parameters come from ``model.get_params()`` and are listed in the
    order that mapping yields them, separated by commas without spaces.
    """
    rendered_params = [
        f"{name}={value}" for name, value in model.get_params().items()
    ]
    class_name = model.__class__.__name__
    return "{}({})".format(class_name, ",".join(rendered_params))

def log_model(exp_name, fname, metrics, col_names = ['expriment_name', 'f1-score', 'precision', 'recall', 'conf_matrix']):
    """Append one experiment row to the CSV log at ``fname``.

    Parameters
    ----------
    exp_name : str
        Experiment identifier, stored in the first column.
    fname : str
        Path of the CSV log. The file and its parent directory are created
        when they do not exist yet.
    metrics : sequence
        Values for the remaining columns, in column order.
    col_names : sequence of str
        Header used only when a new log is started; an existing log keeps
        its own header. The default header is kept exactly as it was
        (including its spelling) so that existing logs stay compatible.
    """
    new_row = [[exp_name, *metrics]]

    # Make sure the target directory exists, otherwise to_csv raises.
    log_dir = os.path.dirname(fname)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # A zero-byte file has no header to reuse (read_csv would raise), so it
    # is treated like a missing log.
    if os.path.isfile(fname) and os.path.getsize(fname) > 0:
        existing_logs = pd.read_csv(fname)
        updated_logs = pd.concat(
            [existing_logs, pd.DataFrame(new_row, columns=existing_logs.columns)],
            ignore_index=True,
        )
    else:
        updated_logs = pd.DataFrame(new_row, columns=list(col_names))

    updated_logs.to_csv(fname, index=False)



In [53]:
#feature names
feat_names = [col for col in encoded_train_df.columns if (col != TARGET_NAME)]



models = [ LogisticRegression(C=1.0, class_weight='balanced', max_iter=200, n_jobs = N_JOBS), #strong regularization, class balancing
          LogisticRegression(C=1.0, class_weight='balanced', max_iter=500, n_jobs = N_JOBS), 
          LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, n_jobs = N_JOBS), 
          LogisticRegression(C=0.1, class_weight='balanced', max_iter=200, n_jobs = N_JOBS), #weak regularization, class balancing
          LogisticRegression(C=0.1, class_weight='balanced', max_iter=500, n_jobs = N_JOBS), 
          LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, n_jobs = N_JOBS), 

          DecisionTreeClassifier(max_depth=4, class_weight='balanced'),
          DecisionTreeClassifier(max_depth=7, class_weight='balanced'),
          DecisionTreeClassifier(max_depth=10, class_weight='balanced'),
          DecisionTreeClassifier(max_depth=13, class_weight='balanced'),

          RandomForestClassifier(n_estimators=500, max_depth=3, max_features=0.3, n_jobs=N_JOBS), #try not very deep trees
          RandomForestClassifier(n_estimators=1000, max_depth=3, max_features=0.3, n_jobs=N_JOBS),
          RandomForestClassifier(n_estimators=1500, max_depth=3, max_features=0.3, n_jobs=N_JOBS),
          RandomForestClassifier(n_estimators=500, max_depth=6, n_jobs=N_JOBS), #try deeper trees
          RandomForestClassifier(n_estimators=1000, max_depth=6, n_jobs=N_JOBS),
          RandomForestClassifier(n_estimators=1500, max_depth=6, n_jobs=N_JOBS),
    
          CatBoostClassifier(iterations = 500, task_type = 'GPU' if GPU else 'CPU'),
          CatBoostClassifier(iterations = 1000, task_type = 'GPU' if GPU else 'CPU'),
          CatBoostClassifier(iterations = 1500, task_type = 'GPU' if GPU else 'CPU'),
          CatBoostClassifier(iterations = 2000, task_type = 'GPU' if GPU else 'CPU'),
         
         
         ]

exp_name_base = 'baselines_{}'
for model in models:

    model.fit(encoded_train_df[feat_names], encoded_train_df[TARGET_NAME])

    metrics = evaluate_model(model.predict(encoded_val_df[feat_names]), encoded_val_df[TARGET_NAME])
    model_name = get_model_string(model)
    
    log_model(exp_name_base.format(model_name),
             MODELS_PATH + MODELS_LOG_FNAME,
             metrics)

    print(model_name)
    print(metrics)
    print()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=200,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.36731891520768967, 0.8756137479541735, 0.23240660295395307, '[[15197   152]\n [ 3534  1070]]']



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=500,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.3863072011351543, 0.8911620294599017, 0.24660326086956522, '[[15404   133]\n [ 3327  1089]]']



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=1000,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.39985052316890884, 0.8756137479541735, 0.25907990314769974, '[[15671   152]\n [ 3060  1070]]']



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.1,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=200,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.3467007509186771, 0.8878887070376432, 0.21540599563232082, '[[14779   137]\n [ 3952  1085]]']



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.1,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=500,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.38637174455940065, 0.886252045826514, 0.24703467153284672, '[[15430   139]\n [ 3301  1083]]']



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.1,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=1000,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=None,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.39626099706744866, 0.8846153846153846, 0.2553141237600378, '[[15578   141]\n [ 3153  1081]]']

DecisionTreeClassifier(ccp_alpha=0.0,class_weight=balanced,criterion=gini,max_depth=4,max_features=None,max_leaf_nodes=None,min_impurity_decrease=0.0,min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0,monotonic_cst=None,random_state=None,splitter=best)
[0.34113816095518684, 0.900163666121113, 0.21044576238760282, '[[14604   122]\n [ 4127  1100]]']

DecisionTreeClassifier(ccp_alpha=0.0,class_weight=balanced,criterion=gini,max_depth=7,max_features=None,max_leaf_nodes=None,min_impurity_decrease=0.0,min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0,monotonic_cst=None,random_state=None,splitter=best)
[0.4104716227018385, 0.840