In [1]:
import hashlib
import os
import sys
import warnings

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

if not os.path.abspath(os.pardir) in sys.path:
    sys.path.append(os.path.abspath(os.pardir))

from src.data_reader import read_data
from src.features_processor import FeaturesProcessor
from src.ml_experiments import *

In [2]:
# --- Folders (relative to the notebook). The trailing slash is required because
# --- paths are built elsewhere by plain string concatenation.
DATA_PATH, MODELS_PATH = '../data/', '../models/'

# --- File names inside DATA_PATH / MODELS_PATH
TRAIN_FNAME, TEST_FNAME = 'census_income_learn.csv', 'census_income_test.csv'
METADATA_FNAME = 'census_income_metadata.txt'
MODELS_LOG_FNAME = 'experiments_log.csv'

# --- Modelling: label column and columns dropped when the data is read
TARGET_NAME = 'income'
IGNORE_FEATURES = ['instance_weight']

# --- Compute resources: parallel workers for scikit-learn, GPU switch for CatBoost
N_JOBS = 6
GPU = True

# --- Colour palette (hex codes)
COLORS = [
    '#4DC9C3',
    '#221C35',
    '#FCCD20',
    '#20C3EF',
    '#00B257',
    '#FF7700',
]

In [3]:
# Read the train and test files (in that order) with the shared metadata file,
# dropping the ignored columns at read time.
train_df, test_df = (
    read_data(DATA_PATH + data_fname, DATA_PATH + METADATA_FNAME, drop_cols=IGNORE_FEATURES)
    for data_fname in (TRAIN_FNAME, TEST_FNAME)
)

# Displayed as (test shape, train shape)
test_df.shape, train_df.shape

Read input data file ../data/census_income_learn.csv
Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns
Read input data file ../data/census_income_test.csv
Read metadata file ../data/census_income_metadata.txt
Couldnt parse line 
Mapped columns


((99762, 41), (199523, 41))

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from src.constants import *

# NOTE(review): this definition shadows the FeaturesProcessor imported from
# src.features_processor in the first cell; keep a single definition once the
# two are reconciled.
class FeaturesProcessor():
    """Turn a raw data frame into a numeric, model-ready data frame.

    Object (string) columns are encoded with one-hot or label encoders, numeric
    columns are optionally scaled by their maximum, and the target column is
    label encoded. Fitted encoders and scaling maxima are kept on the instance
    so that a later frame (validation / test) is transformed consistently.

    Parameters
    ----------
    cat_encoding : 'OHE' (one-hot) or 'LE' (label); anything else falls back to 'OHE'.
    encoders : optional dict {column name: fitted encoder} to reuse.
    age_buckets, work_weeks_buckets, investment_buckets : lists of (low, high)
        bounds used by `make_calculated_features`.
    marital_status_mapping : dict mapping `marital_stat` values to 0/1.
    working_h_per_week : hours per week used to estimate the yearly salary.
    scaling_values : optional dict {column name: max value} used for scaling.
    """

    # Defaults are kept immutable at class level and copied per instance, so no
    # state is shared between instances through default arguments.
    DEFAULT_AGE_BUCKETS = ((18, 25), (25, 35), (35, 45), (45, 55), (55, 65), (65, 120))
    DEFAULT_WORK_WEEKS_BUCKETS = ((0, 10), (10, 30), (30, 45), (45, 52))
    DEFAULT_INVESTMENT_BUCKETS = ((0, 2000), (2000, 10000), (10000, 100000), (100000, 10000000))
    DEFAULT_MARITAL_STATUS_MAPPING = (
        ('widowed', 0), ('divorced', 0), ('never married', 0), ('separated', 0),
        ('married-spouse absent', 0),
        ('married-civilian spouse present', 1), ('married-a f spouse present', 1),
    )
    SUPPORTED_ENCODINGS = ('OHE', 'LE')

    def __init__(self, cat_encoding = 'OHE', encoders = None,
                 age_buckets = None,
                 work_weeks_buckets = None,
                 investment_buckets = None,
                 marital_status_mapping = None,
                 working_h_per_week = 40,
                 scaling_values = None
                ):

        self.cat_encoding = cat_encoding
        self.encoders = encoders
        self.age_buckets = list(self.DEFAULT_AGE_BUCKETS) if age_buckets is None else age_buckets
        self.work_weeks_buckets = (list(self.DEFAULT_WORK_WEEKS_BUCKETS)
                                   if work_weeks_buckets is None else work_weeks_buckets)
        self.investment_buckets = (list(self.DEFAULT_INVESTMENT_BUCKETS)
                                   if investment_buckets is None else investment_buckets)
        self.marital_status_mapping = (dict(self.DEFAULT_MARITAL_STATUS_MAPPING)
                                       if marital_status_mapping is None else marital_status_mapping)
        self.working_h_per_week = working_h_per_week
        # A caller-supplied dict is kept by reference (it is filled in place);
        # otherwise every instance gets its own fresh dict.
        self.scaling_values = {} if scaling_values is None else scaling_values

    def _encoding_mode(self):
        """Return the effective categorical encoding ('OHE' for unsupported values)."""
        if self.cat_encoding in self.SUPPORTED_ENCODINGS:
            return self.cat_encoding
        return 'OHE'

    def _make_encoder(self, df, col):
        """Fit a new encoder for column `col` of `df` and store it in `self.encoders`."""
        if self.encoders is None:
            self.encoders = {}

        # Use the effective mode so the encoder type always matches what
        # transform_features expects, even after a fallback to OHE.
        if self._encoding_mode() == 'OHE':
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoder.fit(df[[col]])
        else:
            encoder = LabelEncoder()
            encoder.fit(df[col])  # LabelEncoder works on 1-D input

        self.encoders[col] = encoder

    @staticmethod
    def _add_bucket_flags(df, values, buckets, prefix):
        """Add one 0/1 column `{prefix}_{low}_{high}` to `df` per bucket.

        Buckets are half-open [low, high); the last bucket also includes its
        upper bound so that the largest allowed value (e.g. 52 weeks worked)
        is not left without a bucket.
        """
        last_pos = len(buckets) - 1
        for pos, bucket in enumerate(buckets):
            low, high = bucket[0], bucket[1]
            below_upper = (values <= high) if pos == last_pos else (values < high)
            df[f'{prefix}_{low}_{high}'] = ((values >= low) & below_upper).astype(int)

    def make_calculated_features(self, df):
        """Return a copy of `df` with the engineered feature columns added.

        The input frame is left untouched. Raises ValueError when a
        `marital_stat` value has no entry in `marital_status_mapping`.
        """
        df = df.copy()

        # discretize age and weeks worked
        self._add_bucket_flags(df, df.age, self.age_buckets, 'age_bucket')
        self._add_bucket_flags(df, df.weeks_worked_in_year, self.work_weeks_buckets, 'ww_bucket')

        # calculate overall income from investments
        df['investments_income'] = (df.capital_gains + df.dividends_from_stocks - df.capital_losses).astype(int)

        # discrete buckets for it, under their own prefix so they cannot be
        # confused (or collide) with the weeks-worked buckets
        self._add_bucket_flags(df, df.investments_income, self.investment_buckets, 'investment_bucket')

        # simplify marital status
        marital_codes = df.marital_stat.map(self.marital_status_mapping)
        if marital_codes.isna().any():
            unknown = sorted(set(df.marital_stat[marital_codes.isna()].astype(str)))
            raise ValueError(f'marital_stat values without a mapping: {unknown}')
        df['marital_status_simple'] = marital_codes.astype(int)

        # look for different type of employment and simplify
        worker_class = df.class_of_worker.fillna('')
        for new_col, keyword in (('work_govm', 'government'),
                                 ('work_pvt', 'private'),
                                 ('work_self', 'self-employed')):
            df[new_col] = worker_class.str.contains(keyword, regex=False).astype(int)

        # calculate yearly salary
        df['tot_salary'] = (df.wage_per_hour*df.weeks_worked_in_year*self.working_h_per_week).astype(int)

        return df

    def _encode_target(self, column):
        """Label-encode the target column, fitting the encoder only once."""
        # Reusing the stored encoder keeps the label -> integer mapping identical
        # between the frames transformed by this instance.
        if column.name not in self.encoders:
            encoder = LabelEncoder()
            encoder.fit(column)
            self.encoders[column.name] = encoder
        encoder = self.encoders[column.name]
        return pd.Series(encoder.transform(column), index=column.index, name=column.name)

    def _encode_categorical(self, df, col, mode):
        """Encode one object column and return the result as a DataFrame."""
        if col not in self.encoders:
            self._make_encoder(df, col)
        encoder = self.encoders[col]

        if mode == 'OHE':
            encoded_features = encoder.transform(df[[col]])
            feat_names = [f"{col}_{cat}" for cat in encoder.categories_[0]]
        else:
            encoded_features = encoder.transform(df[col])
            feat_names = [col]

        # Keep the source index so the block lines up with the other columns.
        return pd.DataFrame(encoded_features, columns=feat_names, index=df.index)

    def _scale_numeric(self, column):
        """Clip a numeric column to [0, max] and divide by max (stored per column)."""
        name = column.name
        if name not in self.scaling_values:
            self.scaling_values[name] = column.max()
        max_value = self.scaling_values[name]

        if max_value > 0:
            return column.clip(0, max_value) / max_value
        # A non-positive (or missing) maximum cannot be used as a divisor;
        # emit zeros instead of NaN/inf.
        return pd.Series(0.0, index=column.index, name=name)

    def transform_features(self, df, make_calculated = False, scale = True, target_name = None):
        """Encode / scale every column of `df`.

        Parameters
        ----------
        df : input data frame.
        make_calculated : when True, add the engineered features first.
        scale : when True numeric columns are scaled by their maximum; when
            False they are passed through unchanged.
        target_name : name of the label column; defaults to the module-level
            TARGET_NAME.

        Returns
        -------
        (encoded_df, encoders) : the transformed frame (same index as `df`,
        columns in the order of the source columns) and the dict of fitted
        encoders.
        """
        if target_name is None:
            target_name = TARGET_NAME

        #if want to make calculated features
        if make_calculated:
            df = self.make_calculated_features(df)

        if scale:
            print('Scaling Numerical Features')
            if  len(self.scaling_values) < 1:
                print('Missing max values to scale in 0..1 range, will calculate from current data')

        if self.cat_encoding not in self.SUPPORTED_ENCODINGS:
            print(f'Unsupported mode {self.cat_encoding} defaulting to One Hot Encoding')
        mode = self._encoding_mode()

        if self.encoders is None:
            print('Encoders not found, creating new encoders')
            self.encoders = {}
        else:
            print('Encoders were found, using existing encoders')

        # Collect the per-column results and concatenate once at the end
        # (concatenating inside the loop is quadratic in the number of columns).
        pieces = []
        for col in df.columns:
            column = df[col]
            if col == target_name:
                pieces.append(self._encode_target(column))
            elif column.dtype == 'O':
                pieces.append(self._encode_categorical(df, col, mode))
            elif scale:
                pieces.append(self._scale_numeric(column))
            else:
                pieces.append(column)

        if pieces:
            encoded_df = pd.concat(pieces, axis=1)
        else:
            encoded_df = pd.DataFrame(index=df.index)

        return encoded_df, self.encoders

In [5]:
# A single processor instance handles both frames, so whatever it fits on the
# training frame (encoders, scaling maxima) is reused for the test frame.
fprocessor = FeaturesProcessor('OHE')

encoded_train_df, encoders = fprocessor.transform_features(train_df)
encoded_test_df, encoders = fprocessor.transform_features(test_df)

# Hold out 10% of the encoded training rows for validation (fixed seed).
encoded_train_df, encoded_val_df = train_test_split(
    encoded_train_df,
    test_size=0.1,
    random_state=42,
)

(encoded_train_df.shape, encoded_val_df.shape, encoded_test_df.shape)

Scaling Numerical Features
Missing max values to scale in 0..1 range, will calculate from current data
Encoders not found, creating new encoders
Scaling Numerical Features
Encoders were found, using existing encoders


((179570, 404), (19953, 404), (99762, 404))

In [11]:
#feature names
feat_names = [col for col in encoded_train_df.columns if (col != TARGET_NAME)]

RANDOM_STATE = 32
catboost_task_type = 'GPU' if GPU else 'CPU'

# Baseline grid, built family by family (order: logistic regression, decision
# trees, shallow random forests, deeper random forests, CatBoost).
models = (
    # scikit-learn's C is the INVERSE regularization strength:
    # C=1.0 is the weaker penalty, C=0.1 the stronger one. Class balancing in both.
    # The recorded output shows identical metrics for max_iter 200/500/1000 at
    # C=1.0, so the max_iter sweep may be redundant.
    [LogisticRegression(C=c, class_weight='balanced', max_iter=max_iter,
                        n_jobs=N_JOBS, random_state=RANDOM_STATE)
     for c in (1.0, 0.1)
     for max_iter in (200, 500, 1000)]

    + [DecisionTreeClassifier(max_depth=depth, class_weight='balanced', random_state=RANDOM_STATE)
       for depth in (4, 7, 10, 13)]

    # try not very deep trees
    + [RandomForestClassifier(n_estimators=n_trees, max_depth=3, max_features=0.3,
                              n_jobs=N_JOBS, random_state=RANDOM_STATE)
       for n_trees in (500, 1000, 1500)]

    # try deeper trees
    + [RandomForestClassifier(n_estimators=n_trees, max_depth=6,
                              n_jobs=N_JOBS, random_state=RANDOM_STATE)
       for n_trees in (500, 1000, 1500)]

    + [CatBoostClassifier(iterations=n_iter, task_type=catboost_task_type,
                          silent=True, random_state=RANDOM_STATE)
       for n_iter in (500, 1000, 1500, 2000)]
)

exp_name_base = 'baselines'

#folder to store artifacts (parents are created too; no error if it already exists)
artifacts_dir = os.path.join(MODELS_PATH, exp_name_base)
os.makedirs(artifacts_dir, exist_ok=True)

for model in models:

    model.fit(encoded_train_df[feat_names], encoded_train_df[TARGET_NAME])

    preds = model.predict(encoded_val_df[feat_names])
    # NOTE(review): evaluate_model is called as (predictions, ground truth);
    # confirm this matches its signature in src.ml_experiments, otherwise
    # asymmetric metrics (precision/recall, confusion matrix) come out swapped.
    metrics = evaluate_model(preds, encoded_val_df[TARGET_NAME])
    model_name = get_model_string(model)

    log_model(exp_name_base, model_name,
             MODELS_PATH + MODELS_LOG_FNAME,
             metrics)

    # The readable name is truncated to 100 characters, and parameters such as
    # max_iter / n_estimators appear after that point, so truncation alone maps
    # several models to the same file. A short digest of the full name keeps
    # every artifact distinct (and stable across runs).
    name_digest = hashlib.sha1(model_name.encode('utf-8')).hexdigest()[:8]
    save_model_pickle(model, os.path.join(artifacts_dir, f'{model_name[:100]}_{name_digest}.pkl'))
    print(model_name)
    print(metrics)
    print()


LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=200,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=500,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

LogisticRegression(C=1.0,class_weight=balanced,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=1000,multi_class=deprecated,n_jobs=6,penalty=l2,random_state=32,solver=lbfgs,tol=0.0001,verbose=0,warm_start=False)
[0.4274240940254652, 0.8927986906710311, 0.2809683234612413, '[[15939   131]\n [ 2792  1091]]']

Log

In [12]:
# NOTE(review): scratch cell that only prints 1 and does not use anything from
# the experiments above; looks like a leftover check, so consider deleting it.
print(1)

1
