In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pandas as pd
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Make the project's src/ directory (a sibling of the notebook folder) importable.
# The path is built once, so the membership test and the appended entry are
# guaranteed to be the same string on every platform; previously the two were
# assembled differently and only matched where '/' is the native separator.
_src_dir = os.path.abspath(os.path.join(os.pardir, 'src'))
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from data_reader import read_data
from features_processor import FeaturesProcessor
from ml_experiments import *
from config import *

### Read Data and prepare features

In [6]:
# Both splits are described by the same metadata file, so build its path once.
metadata_path = DATA_PATH + METADATA_FNAME

# Load each split through the project reader, dropping the ignored columns.
train_df = read_data(DATA_PATH + TRAIN_FNAME, metadata_path, drop_cols=IGNORE_FEATURES)
test_df = read_data(DATA_PATH + TEST_FNAME, metadata_path, drop_cols=IGNORE_FEATURES)

# Displayed as (test shape, train shape).
test_df.shape, train_df.shape

Read input data file ../data/census_income_learn.csv
Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns
Read input data file ../data/census_income_test.csv
Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns


((99762, 41), (199523, 41))

In [7]:
# A single processor instance encodes both splits. Per the cell output, the
# first call creates the encoders and the second call reuses them, so the
# train frame must be transformed before the test frame.
fprocessor = FeaturesProcessor('OHE')

encoded_full_train_df, encoders = fprocessor.transform_features(train_df)
encoded_test_df, encoders = fprocessor.transform_features(test_df)

# Hold out 10% of the encoded training rows for validation (fixed seed).
# NOTE(review): the encoders are created on the full training frame before
# this split, so the validation rows presumably contribute to the scaling
# statistics — confirm whether that is acceptable for the reported metrics.
encoded_train_df, encoded_val_df = train_test_split(
    encoded_full_train_df,
    test_size=0.1,
    random_state=42,
)

encoded_train_df.shape, encoded_val_df.shape, encoded_test_df.shape

Scaling Numerical Features
Encoders not found, creating new encoders
Scaling Numerical Features
Encoders were found, using existing encoders


((179570, 404), (19953, 404), (99762, 404))

### Run Baseline Model Search

In [8]:
# Feature columns: every encoded column except the target.
feat_names = [col for col in encoded_train_df.columns if col != TARGET_NAME]

# Shared settings for every candidate model.
baseline_seed = 32
catboost_task_type = 'GPU' if GPU else 'CPU'

# Candidate baselines. The comprehensions produce the same models, in the same
# order, as listing every constructor call by hand.
#
# In scikit-learn, C is the INVERSE of the regularization strength:
# C=1.0 is the weaker penalty and C=0.1 the stronger one.
models = [
    LogisticRegression(C=c, class_weight='balanced', max_iter=max_iter,
                       n_jobs=N_JOBS, random_state=baseline_seed)
    for c in (1.0, 0.1)
    for max_iter in (200, 500, 1000)
]

models += [
    DecisionTreeClassifier(max_depth=depth, class_weight='balanced',
                           random_state=baseline_seed)
    for depth in (4, 7, 10, 13)
]

# Shallow forests, each split considering 30% of the features.
models += [
    RandomForestClassifier(n_estimators=n_trees, max_depth=3, max_features=0.3,
                           class_weight='balanced', n_jobs=N_JOBS,
                           random_state=baseline_seed)
    for n_trees in (500, 1000, 1500)
]

# Deeper forests with the default max_features.
models += [
    RandomForestClassifier(n_estimators=n_trees, max_depth=6, n_jobs=N_JOBS,
                           class_weight='balanced', random_state=baseline_seed)
    for n_trees in (500, 1000, 1500)
]

models += [
    CatBoostClassifier(iterations=n_iter, task_type=catboost_task_type,
                       silent=True, random_state=baseline_seed)
    for n_iter in (500, 1000, 1500, 2000)
]

exp_name_base = 'baselines'

# Folder to store artifacts; exist_ok makes the cell safe to re-run.
exp_dir = MODELS_PATH + exp_name_base
os.makedirs(exp_dir, exist_ok=True)

for model_idx, model in enumerate(models):

    model.fit(encoded_train_df[feat_names], encoded_train_df[TARGET_NAME])

    preds = model.predict(encoded_val_df[feat_names])
    # NOTE(review): in the printed confusion matrices the second ROW holds
    # ~19% of the validation rows while the second COLUMN holds ~6%, which
    # looks as if predictions and labels may be swapped somewhere — confirm
    # that evaluate_model expects (preds, y_true) in this order.
    metrics = evaluate_model(preds, encoded_val_df[TARGET_NAME])
    model_name = get_model_string(model)

    log_model(exp_name_base, model_name,
              MODELS_PATH + MODELS_LOG_FNAME,
              metrics)

    # The model string is cut to 100 characters to keep file names short, but
    # the varying hyper-parameter (e.g. max_iter) appears after that point in
    # the printed model strings, so the truncated name alone made several
    # models share one file and overwrite each other. The list index keeps
    # every pickle distinct.
    pickle_fname = f'{model_idx:02d}_{model_name[:100]}.pkl'
    save_model_pickle(model, os.path.join(exp_dir, pickle_fname))
    print(model_name)
    print(metrics)
    print()


LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=200,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=500,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=1000,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

Log