In [206]:
import numpy as np

from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import (roc_auc_score, 
                             accuracy_score)

from run_test import ( set_seed, 
                       na_imputer,
                       RealNormalizer,
                       data_processing )

from data_processor import get_low_variance_objects

from sklearn.linear_model import LogisticRegression, Ridge, Lasso

In [214]:
import pandas as pd

# Locations of the pre-split credit data set.
TRAIN_PATH = '../../../data_main/credit/TRAIN.csv'
VAL_PATH = '../../../data_main/credit/VAL.csv'

# Label column handed to every model in this notebook.
TARGET = 'SeriousDlqin2yrs'

# Presumably leftover index columns from earlier CSV exports -- they are not features.
index_artifacts = ['Unnamed: 0', 'Unnamed: 0.1']

# Append `.sample(n)` to either read for a quicker experiment on a subset.
train = pd.read_csv(TRAIN_PATH)
val = pd.read_csv(VAL_PATH)

for frame in (val, train):
    frame.drop(index_artifacts, axis=1, inplace=True)

# Median imputation: both splits are filled with medians taken from the training split.
train.fillna(train.median(), inplace=True)
val.fillna(train.median(), inplace=True)

In [215]:
class RealNormalizer:
    """Z-score scaler for selected DataFrame columns.

    ``fit_transform`` learns a (mean, std) pair per column and standardises
    the columns; ``transform`` re-applies the learned pairs to another frame.
    Both methods modify the passed DataFrame in place and also return it.
    """

    def __init__(self):
        # column name -> (mean, std) learned by the most recent fit_transform
        self.mapper = dict()

    def fit_transform(self, data, columns_to_normalize):
        """Learn per-column statistics from ``data`` and standardise it.

        Args:
            data: DataFrame holding the columns to scale (modified in place).
            columns_to_normalize: iterable of column names to standardise.

        Returns:
            The same ``data`` object with the scalable columns replaced by
            ``(x - mean) / std`` as floats. Columns whose standard deviation
            is zero or undefined (NaN) are left untouched and not recorded.
        """
        # Start from a clean slate so a re-fit never keeps statistics of
        # columns that belonged to a previous fit.
        self.mapper = dict()

        for col in columns_to_normalize:
            mean, std = data[col].mean(), data[col].std()
            # `not std > 0` rejects both a zero std (constant column) and a
            # NaN std (e.g. a single-row frame), which would otherwise turn
            # the whole column into NaN.
            if not std > 0:
                continue
            self.mapper[col] = (mean, std)
            data[col] = ((data[col] - mean) / std).astype(float)

        return data

    def transform(self, data):
        """Standardise ``data`` with the statistics learned at fit time.

        Args:
            data: DataFrame containing every fitted column (modified in place).

        Returns:
            The same ``data`` object with each fitted column rescaled.
        """
        for col in self.mapper:
            mean, std = self.mapper[col]
            data[col] = ((data[col] - mean) / std).astype(float)

        return data

In [216]:
class MinMax:
    """Min-max scaler for selected DataFrame columns.

    ``fit_transform`` learns a (min, range) pair per column and rescales the
    fitted data to [0, 1]; ``transform`` re-applies the learned pairs, so
    values outside the fitted range map outside [0, 1]. Both methods modify
    the passed DataFrame in place and also return it.
    """

    def __init__(self):
        # column name -> (min, max - min) learned by the most recent fit_transform
        self.mapper = dict()

    def fit_transform(self, data, columns_to_normalize):
        """Learn per-column min/range from ``data`` and rescale it.

        Args:
            data: DataFrame holding the columns to scale (modified in place).
            columns_to_normalize: iterable of column names to rescale.

        Returns:
            The same ``data`` object with the scalable columns replaced by
            ``(x - min) / (max - min)`` as floats. Columns whose range is zero
            or undefined (NaN) are left untouched and not recorded.
        """
        # Start from a clean slate so a re-fit never keeps statistics of
        # columns that belonged to a previous fit.
        self.mapper = dict()

        for col in columns_to_normalize:
            min_ = data[col].min()
            diff = data[col].max() - min_
            # A constant column has diff == 0 and would become 0/0 = NaN;
            # an all-NaN column has a NaN diff. Skip both, mirroring the
            # zero-std guard in RealNormalizer.
            if not diff > 0:
                continue
            self.mapper[col] = (min_, diff)
            data[col] = ((data[col] - min_) / diff).astype(float)

        return data

    def transform(self, data):
        """Rescale ``data`` with the min/range learned at fit time.

        Args:
            data: DataFrame containing every fitted column (modified in place).

        Returns:
            The same ``data`` object with each fitted column rescaled.
        """
        for col in self.mapper:
            min_, diff = self.mapper[col]
            data[col] = ((data[col] - min_) / diff).astype(float)

        return data

In [222]:
# Baseline: L2-regularised logistic regression on the frames as they stand at this point.
X_train, y_train = train.drop(TARGET, axis=1), train[TARGET]
X_val, y_val = val.drop(TARGET, axis=1), val[TARGET]

model = LogisticRegression(C=1, penalty='l2', solver='lbfgs', random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

# The second predict_proba column is the probability of the second class (label 1 for a 0/1 target).
preds = model.predict_proba(X_val)
roc_auc_score(y_val, preds[:, 1])

0.6801420784201471

In [223]:
# Group the predictor columns (target excluded) with the project helper.
# NOTE(review): `th=100` is presumably a unique-value cutoff -- confirm in data_processor.
features_only = train.drop(TARGET, axis=1)
low_var_objects, high_var_objects, low_var_real = get_low_variance_objects(features_only, th=100)

Low variance items all: 7, Objects all: 0
Low variance objects: 0, High var objects: 0


In [224]:
# Columns treated as categorical: low-variance object columns plus low-variance numeric ones.
categoical = low_var_objects.union(low_var_real)
# Everything that is NOT a plain real-valued feature.
low_var_n_objects = categoical.union(high_var_objects)
# Remaining columns are the real-valued features. Sorting makes the order
# reproducible across kernel restarts (set iteration order is hash-dependent),
# and excluding TARGET through the set difference cannot raise if it is absent.
real_features = sorted(set(train.columns).difference(low_var_n_objects, [TARGET]))

In [225]:
# Inspect the columns collected in `categoical` (low_var_objects united with low_var_real).
categoical

{'NumberOfDependents',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'age'}

In [226]:
# One-hot encode the categorical columns. `categoical` is a set, and
# get_dummies selects the columns by indexing the frame with whatever is
# passed as `columns`; recent pandas versions reject a set as an indexer,
# so hand over a sorted list instead (this also fixes the encoding order).
dummy_columns = sorted(categoical)
train = pd.get_dummies(train, columns=dummy_columns)
val = pd.get_dummies(val, columns=dummy_columns)

# Give both splits the same columns: a level seen in only one split becomes
# an all-zero dummy column in the other.
train, val = train.align(val, join='outer', axis=1, fill_value=0)

print(train.shape)
print(val.shape)

(89957, 230)
(30102, 230)


In [227]:
# Standardise the real-valued features: mean/std are learned from the training
# split only and then re-applied unchanged to the validation split.
normalizer = RealNormalizer()
train = normalizer.fit_transform(train, real_features)
val = normalizer.transform(val)

In [228]:
# Inspect the real-valued feature columns after normalisation.
train[real_features]

Unnamed: 0,DebtRatio,MonthlyIncome,RevolvingUtilizationOfUnsecuredLines
0,-0.209829,0.169660,-0.024511
1,-0.210242,-0.247245,-0.023656
2,-0.210264,-0.218982,-0.024995
3,-0.210088,-0.189696,-0.026987
4,3.252453,-0.068206,-0.026573
...,...,...,...
89952,2.295492,-0.068206,-0.024037
89953,-0.210071,-0.196091,-0.026214
89954,-0.209881,-0.056440,-0.026599
89955,-0.210316,-0.048000,-0.027941


In [229]:
# Peek at the encoded training frame (real features plus one-hot dummy columns).
train.head()

Unnamed: 0,DebtRatio,MonthlyIncome,NumberOfDependents_0.0,NumberOfDependents_1.0,NumberOfDependents_10.0,NumberOfDependents_13.0,NumberOfDependents_2.0,NumberOfDependents_3.0,NumberOfDependents_4.0,NumberOfDependents_5.0,...,age_90,age_91,age_92,age_93,age_94,age_95,age_96,age_97,age_98,age_99
0,-0.209829,0.16966,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.210242,-0.247245,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.210264,-0.218982,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.210088,-0.189696,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.252453,-0.068206,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# train.drop(real_features, axis=1, inplace=True)
# val.drop(real_features, axis=1, inplace=True)

In [230]:
# Same logistic regression, now on the one-hot encoded and normalised features.
X_train, y_train = train.drop(TARGET, axis=1), train[TARGET]
X_val, y_val = val.drop(TARGET, axis=1), val[TARGET]

# max_iter=50 caps the lbfgs iterations below scikit-learn's default.
model = LogisticRegression(max_iter=50, penalty='l2', solver='lbfgs', random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

# The second predict_proba column is the probability of the second class (label 1 for a 0/1 target).
preds = model.predict_proba(X_val)
roc_auc_score(y_val, preds[:, 1])

0.8178740562219329

In [135]:
import autogluon as ag
from autogluon import TabularPrediction as task

In [231]:
# Restrict AutoGluon to its 'LR' model family with default settings.
hyperparameters = {'LR': {}}

fit_kwargs = dict(
    train_data=train,
    output_directory="a_gluon_tests",
    label=TARGET,
    eval_metric='roc_auc',
    hyperparameters=hyperparameters,
)
# Left disabled by the author: num_bagging_folds=None, num_bagging_sets=None
predictor = task.fit(**fit_kwargs)

Beginning AutoGluon training ...
AutoGluon will save models to a_gluon_tests/
AutoGluon Version:  0.0.12
Train Data Rows:    89957
Train Data Columns: 230
Preprocessing data ...
Here are the 2 unique label values in your data:  [1, 0]
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Train Data Class Count: 2
Feature Generator processed 89957 data points with 224 features
Original Features (raw dtypes):
	float64 features: 3
	uint8 features: 221
Original Features (inferred dtypes):
	float features: 3
	int features: 221
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 3
	int features: 221
Final Features:
	float features: 3
	int features: 221
	Data preprocessing and feature engineering runti

In [232]:
# Score the fitted predictor on the validation frame; the recorded output reports roc_auc.
performance = predictor.evaluate(val)

Predictive performance on given dataset: roc_auc = 0.8341854608820453


In [249]:
# NOTE(review): exploratory call into a private AutoGluon method. The recorded run fails with
# "TypeError: ... missing 1 required positional argument: 'feature_types_dict'" -- fix or remove.
predictor.feature_types._get_feature_type(train.loc[0,:])

TypeError: _get_feature_type() missing 1 required positional argument: 'feature_types_dict'

In [180]:
# Recompute the validation ROC AUC by hand from AutoGluon's predicted probabilities.
preds = predictor.predict_proba(val)
roc_auc_score(y_true=val[TARGET], y_score=preds)

0.8334359875421953

In [187]:
# Show the real-valued columns after AutoGluon's own feature transformation.
predictor.transform_features(train)[real_features]

Unnamed: 0,DebtRatio,MonthlyIncome,RevolvingUtilizationOfUnsecuredLines
0,2.615568e-06,0.003031,0.000026
1,3.969896e-07,0.000864,0.000033
2,2.772414e-07,0.001011,0.000023
3,1.223471e-06,0.001163,0.000007
4,1.859929e-02,0.001795,0.000011
...,...,...,...
89952,1.345924e-02,0.001795,0.000030
89953,1.316911e-06,0.001130,0.000013
89954,2.334071e-06,0.001856,0.000010
89955,0.000000e+00,0.001900,0.000000


In [185]:
# Show the same columns straight from `train`, for comparison with the transformed view.
# NOTE(review): the recorded values differ from the z-scored display earlier in the notebook and
# the execution counts are out of order -- re-run top to bottom before trusting this output.
train[real_features]

Unnamed: 0,DebtRatio,MonthlyIncome,RevolvingUtilizationOfUnsecuredLines
0,2.615568e-06,0.003031,0.000026
1,3.969896e-07,0.000864,0.000033
2,2.772414e-07,0.001011,0.000023
3,1.223471e-06,0.001163,0.000007
4,1.859929e-02,0.001795,0.000011
...,...,...,...
89952,1.345924e-02,0.001795,0.000030
89953,1.316911e-06,0.001130,0.000013
89954,2.334071e-06,0.001856,0.000010
89955,0.000000e+00,0.001900,0.000000


In [196]:
# Find which input columns AutoGluon's feature transformation did not keep.
cols_init = train.columns
cols_gl = predictor.transform_features(train).columns
# Index.difference returns the missing labels in a deterministic (sorted)
# order, unlike list(set(...)) whose order depends on string hashing.
cols_dropped = list(cols_init.difference(cols_gl))
# Column sums show how many rows actually use each dropped column.
train[cols_dropped].sum()

NumberOfTime30-59DaysPastDueNotWorse_13       0
NumberOfOpenCreditLinesAndLoans_53            0
NumberOfTimes90DaysLate_17                    0
NumberOfOpenCreditLinesAndLoans_51            0
SeriousDlqin2yrs                           6037
NumberOfTimes90DaysLate_12                    0
dtype: int64

In [190]:
# Query the predictor's info() report (no output was recorded for this cell).
predictor.info()

In [197]:
# Keep the label out of the columns scheduled for removal. Filtering (instead
# of list.remove) makes the cell safe to re-run: it no longer raises
# ValueError once TARGET has already been taken out.
cols_dropped = [col for col in cols_dropped if col != TARGET]

In [199]:
# Peek at the training frame again.
# NOTE(review): the recorded output shows un-normalised values (e.g. MonthlyIncome 9120.0), so this
# cell was apparently run against a different kernel state than the normalised display above.
train.head()

Unnamed: 0,DebtRatio,MonthlyIncome,NumberOfDependents_0.0,NumberOfDependents_1.0,NumberOfDependents_10.0,NumberOfDependents_13.0,NumberOfDependents_2.0,NumberOfDependents_3.0,NumberOfDependents_4.0,NumberOfDependents_5.0,...,age_90,age_91,age_92,age_93,age_94,age_95,age_96,age_97,age_98,age_99
0,0.802982,9120.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.121876,2600.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.085113,3042.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.375607,3500.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5710.0,5400.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
# Remove the columns listed in `cols_dropped` from both splits before refitting.
for frame in (train, val):
    frame.drop(cols_dropped, axis=1, inplace=True)

model = LogisticRegression(solver='lbfgs', penalty='l2', random_state=0, n_jobs=-1)

X_train, y_train = train.drop(TARGET, axis=1), train[TARGET]
model.fit(X_train, y_train)

# The second predict_proba column is the probability of the second class (label 1 for a 0/1 target).
X_val = val.drop(TARGET, axis=1)
preds = model.predict_proba(X_val)
roc_auc_score(val[TARGET], preds[:, 1])

0.8175466321232132

In [218]:
# Print AutoGluon's summary of the fit (per-model validation score and timings) and return it as a dict.
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer
0              LinearModel    0.86819       0.435163  52.335602                0.435163          52.335602            0       True
1  weighted_ensemble_k0_l1    0.86819       0.437831  52.339645                0.002668           0.004043            1       True
Number of models trained: 2
Types of models trained:
{'LinearModel', 'WeightedEnsembleModel'}
Bagging used: False 
Stack-ensembling used: False 
Hyperparameter-tuning used: False 
User-specified hyperparameters:
{'default': {'LR': [{}]}}
Plot summary of models saved to file: a_gluon_tests/SummaryOfModels.html
*** End of fit() summary ***


{'model_types': {'LinearModel': 'LinearModel',
  'weighted_ensemble_k0_l1': 'WeightedEnsembleModel'},
 'model_performance': {'LinearModel': 0.8681899861145144,
  'weighted_ensemble_k0_l1': 0.8681899861145144},
 'model_best': 'weighted_ensemble_k0_l1',
 'model_paths': {'LinearModel': 'a_gluon_tests/models/LinearModel/',
  'weighted_ensemble_k0_l1': 'a_gluon_tests/models/weighted_ensemble_k0_l1/'},
 'model_fit_times': {'LinearModel': 52.335602045059204,
  'weighted_ensemble_k0_l1': 0.0040433406829833984},
 'model_pred_times': {'LinearModel': 0.4351630210876465,
  'weighted_ensemble_k0_l1': 0.002668142318725586},
 'num_bagging_folds': 0,
 'stack_ensemble_levels': 0,
 'feature_prune': False,
 'hyperparameter_tune': False,
 'hyperparameters_userspecified': {'default': {'LR': [{}]}},
 'num_classes': 2,
 'model_hyperparams': {'LinearModel': {'C': 1,
   'vectorizer_dict_size': 75000,
   'proc.ngram_range': (1, 5),
   'proc.skew_threshold': 0.99,
   'proc.impute_strategy': 'median',
   'penalty