In [7]:
# 1) Business Question
# 2) Data Collection
# 3) Preprocessing
# 4) Model(s) Creation
# 5) Model Evaluation and Comparison
# 6) Conclusion and Future Improvements
#https://www.kaggle.com/c/home-credit-default-risk/data?select=installments_payments.csv
import pandas as pd
import numpy as np
import math
from datetime import datetime
import scipy
from scipy import stats
#Graph
import seaborn as sns
import matplotlib.pyplot as plt
#sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from  sklearn.impute import SimpleImputer   
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_rows', 10)

In [8]:
# #Correlation preset - part 1
# df = pd.read_csv("./data/application_train.csv" )
# df_corr = pd.DataFrame( df.corr()['TARGET'])
# df_corr['MissingValue'] = [df[list(df_corr.index)[x]].isna().sum() for x in range(len(pd.DataFrame(df_corr)['TARGET']))]
# #df_corr = df_corr[df_corr['MissingValue']>30000].sort_values(by=['TARGET'])
# df_corr['Abs_correlation'] = abs(df_corr['TARGET'])
# df_corr.sort_values(by=['Abs_correlation'],ascending=False,inplace=True)
# print(df_corr)
# print(len(list(df_corr.index)))
# "','".join(list(df_corr.index))

In [9]:
# #Correlation preset - part 2

# pd.set_option('display.max_rows', None)

# #Keep certain percent
# df_corr_Work = df_corr[(df_corr['TARGET']>=-0.04)&(df_corr['TARGET']<=0.04)]
# print('shape: ',df_corr_Work.shape)
# print(df_corr_Work)
# print(list(df_corr_Work.index))


In [113]:
##                                      (Drop columns with missing data & 4% correlation)
# SMOTE                                 (Imbalanced data: score 15 points more)
# Outliner   >>> (remove 3% standard deviation)
# Mean on numbers; Median on labels     (Median performs little worse)  diff show outliners
# drop_rate = 1                         (Drop nothing to performs little better))
# Standardization                       (MinMaxScaler performs little worse))
# Regression                            (Score 68)
# Other Models: XGboost

#Load DataFrame
df = pd.read_csv("./data/application_train.csv" ) #  index_col=0  ID which is used for mapping other excel files goes to index column

print('original shape: ' , df.shape)


#Step 1:    Drop columns with missing values  #---------------------Data Preprocessing---------------------#

#drop ID
df.drop(columns=['SK_ID_CURR'], inplace=True)

#Step 2:    Fill in missing values            #---------------------Data Preprocessing---------------------#

#   >>>categorical columns<<<   
drop_rate = 1 #0.3 mean the data with 70% more  
drop_col_lst_1 = []
for col in df.columns:
    if (df[col].isna().sum() / df.shape[0] <= drop_rate )&(type(df[col][0])==str):
        df[col].fillna(  df[col].mode()[0]  ,inplace=True)  #   >>>categorical columns<<<   Minor missing values
    elif (df[col].isna().sum() / df.shape[0] > drop_rate )&(type(df[col][0])==str):
        drop_col_lst_1.append(col)                          #   >>>categorical columns<<<   Huge missing values
df.drop(columns=drop_col_lst_1 , inplace=True)

#   >>>Numerical columns<<<   
drop_col_lst_2 = []
from sklearn.impute import SimpleImputer
for col in df.columns:
    if (df[col].isna().sum() / df.shape[0] > drop_rate )&((type(df[col][0])==np.int64)|(type(df[col][0])==np.float64)):
        drop_col_lst_2.append(col) 
print('Columns that is dropped due to missing values: \n',drop_col_lst_1,drop_col_lst_2)
df.drop(columns=drop_col_lst_2 , inplace=True)
#   Replace missing values with mean
df[list(df.select_dtypes(include=["int64","float64"]).columns)]     =       pd.DataFrame(   SimpleImputer(missing_values=np.NaN, strategy='mean').fit_transform(  df[list(df.select_dtypes(include=["int64","float64"]).columns)].values),index = df.index,columns=  list(df.select_dtypes(include=["int64","float64"]).columns) )   


#Step 0:    fix unbalanced dataset (SMOTE)    #---------------------Data Preprocessing---------------------#
print('df shape before outliners:' , df.shape)
df = df[(np.abs( df[df.drop(columns='TARGET').select_dtypes(include=[np.number]).columns].apply(stats.zscore))       < float(3)).all(axis=1)]
print('df shape after outliners:' , df.shape)

df.drop(df.query('TARGET == 0').sample(((df.query('TARGET == 0').shape[0])-3*(df.query('TARGET == 1').shape[0]))).index,inplace=True)  #Downscale the data with '0'
print(df.TARGET.value_counts())
#Step 3:    Standardization & Labelling       #---------------------Data Preprocessing---------------------#

X = df.drop(columns=['TARGET'])
y = df[['TARGET']]

# Standardization (0 mean, 1 stdev)
# from sklearn.preprocessing import StandardScaler
# X[list(X.select_dtypes(include=["int64","float64"]).columns)] = pd.DataFrame(   StandardScaler().fit_transform(X[list(X.select_dtypes(include=["int64","float64"]).columns)].values), index = X.index, columns = list(X[list(X.select_dtypes(include=["int64","float64"]).columns)].columns))   
#https://stackoverflow.com/questions/35723472/how-to-use-sklearn-fit-transform-with-pandas-and-return-dataframe-instead-of-num

#MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
X[list(X.select_dtypes(include=["int64","float64"]).columns)] = pd.DataFrame(   MinMaxScaler(feature_range=(0, 1)).fit_transform(X[list(X.select_dtypes(include=["int64","float64"]).columns)].values), index = X.index, columns = list(X[list(X.select_dtypes(include=["int64","float64"]).columns)].columns)) 

X= pd.get_dummies(X, drop_first=True)       #One Hot Encoder  #drop_first=True -> reduce one column


#Step 4:    fix unbalanced dataset (SMOTE)    #---------------------Data Preprocessing---------------------#
#df.drop(df.query('TARGET == 0').sample((282686-24825)).index,inplace=True)  #Sample size of '0' is 282686; Sample size of '1' is 24825; 

#Oversampling the data     ## https://towardsdatascience.com/5-smote-techniques-for-oversampling-your-imbalance-data-b8155bdbe2b5

print('X.shape = ' , X.shape)
y.head()
X.head()

original shape:  (307511, 122)
Columns that is dropped due to missing values: 
 [] []
df shape before outliners: (307511, 121)
df shape after outliners: (151404, 121)
0.0    37443
1.0    12481
Name: TARGET, dtype: int64
X.shape =  (49924, 225)


Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
8,0.0,0.109707,0.554188,0.466236,0.543662,0.396196,0.29112,1.0,0.522134,0.511741,...,0,0,0,0,0,0,1,0,0,0
9,0.0,0.138426,0.204705,0.267475,0.225352,0.419288,0.61243,0.040503,0.071098,0.445324,...,0,0,0,0,0,0,1,0,0,0
13,0.5,0.253303,0.496676,0.395085,0.408451,0.351938,0.634288,0.037867,0.958628,0.317632,...,0,0,0,0,0,0,1,0,0,0
21,0.5,0.224584,0.618422,0.520917,0.552113,0.539125,0.592683,0.041461,0.852078,0.680561,...,0,0,0,0,0,0,1,0,0,0
22,0.5,0.540494,0.257314,0.447131,0.253521,0.441701,0.802077,0.034528,0.992665,0.650132,...,0,0,0,0,0,0,1,0,0,0


In [63]:
#Step 4:    Build regression model       #---------------------Modelling---------------------#

# Import module to split dataset------------------------------------
from sklearn.model_selection import train_test_split

# Split data set into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)       #


# print('Sample size before SMOTE method: ' , X_train.shape)
# X_train,y_train = SMOTE(random_state = 42).fit_resample(X_train,y_train)
# print('Sample size after SMOTE method: ' , X_train.shape)

# Import module for fitting------------------------------------
from sklearn.linear_model import LogisticRegression

# Create instance (i.e. object) of LogisticRegression
logmodel = LogisticRegression()

# Fit the model using the training data
logmodel.fit(X_train, y_train)
#print('coef :  ' , logmodel.coef_)
#print('intercept :  ', logmodel.intercept_)

#Predicting the target for test data
y_pred = logmodel.predict(X_test)

#Classification Metrics
from sklearn.metrics import confusion_matrix
print('confusion_matrix:    \n',    confusion_matrix(y_test, y_pred))
print('''
True positive      False positive
False Negative      True negative
''')
from sklearn.metrics import classification_report
print('classification_report:   \n',    classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('accuracy_score:  \n',    accuracy_score(y_test, y_pred) , ' ; with data size of ' , X_train.shape)     #Same as logmodel.score(X_test,y_test)
#--Other models--

confusion_matrix:    
 [[7114  375]
 [1886  610]]

True positive      False positive
False Negative      True negative

classification_report:   
               precision    recall  f1-score   support

         0.0       0.79      0.95      0.86      7489
         1.0       0.62      0.24      0.35      2496

    accuracy                           0.77      9985
   macro avg       0.70      0.60      0.61      9985
weighted avg       0.75      0.77      0.73      9985

accuracy_score:  
 0.7735603405107662  ; with data size of  (39939, 225)


In [None]:
0.684692849949647
#Step 5:    Build Forest model with PCA  #---------------------Modelling---------------------# 
# XGboost (no need scaling, no need worry missing data)
# parameter >  give class
#Outliner
#SMOTE

In [90]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_classes=3,
                           n_informative=4, weights=[0.2, 0.3, 0.5],
                           random_state=0)
                        
clf = BalancedRandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)  
print(pd.DataFrame(X))
print('feature_importances_' , clf.feature_importances_)  
print(clf.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
X.shape

            0         1         2         3         4         5         6  \
0   -3.028981 -0.413400 -1.540603 -1.940079 -0.240909  0.408641 -0.198666   
1   -0.357759 -0.446519  0.265239  0.260461  0.615010 -0.399651  0.147452   
2    0.911451  0.438051  1.633826  1.451226  0.240125 -0.245653 -2.585751   
3    2.389625  0.333127  0.616318  2.045917 -0.424432 -1.531849 -2.172485   
4    1.472375  1.214850 -0.112170  1.236217  1.885997 -1.226767 -2.179582   
..        ...       ...       ...       ...       ...       ...       ...   
995 -1.773791  1.548493 -0.071203 -0.135549 -0.945316  0.613645  0.820349   
996 -0.510020  0.171239  0.121757  0.626236 -0.170996  0.006801 -0.644402   
997 -2.816729 -0.763501 -0.212769 -1.413166  2.395144 -1.306546  1.027126   
998  0.182859 -0.339780  0.488050  0.252066  1.106527 -1.226481 -2.388367   
999 -0.763217  2.125145 -1.275344  0.010466  0.756689  1.608059 -1.204031   

            7         8         9        10        11        12        13  

(1000, 20)

In [106]:
#Step 4:    Split data                    #---------------------Modelling---------------------#

# Import module to split dataset------------------------------------
from sklearn.model_selection import train_test_split

# Split data set into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 , stratify=y)       #

print(y_train['TARGET'].value_counts())
print(y_test['TARGET'].value_counts())



# print('Sample size before SMOTE method: ' , X_train.shape)
# X_train,y_train = SMOTE(random_state = 42).fit_resample(X_train,y_train)
# print('Sample size after SMOTE method: ' , X_train.shape)

#Step 5:    Build            model       #---------------------Modelling---------------------#  RamdonForest

# # Import module for fitting------------------------------------
# from sklearn.ensemble import RandomForestClassifier

# rnd_clf = RandomForestClassifier(n_estimators=2000, n_jobs=-1, random_state=42)  #, max_leaf_nodes=16
# rnd_clf.fit(X_train, y_train)

# y_pred = rnd_clf.predict(X_test)
#Step 5:    Build            model       #---------------------Modelling---------------------#  Multiclass classification with under-sampling

# print(type(y_train))
# print('Training target statistics: {}'.format(Counter(y_train)))
# print('Testing target statistics: {}'.format(Counter(y_test)))

# # Create a pipeline
# pipeline = make_pipeline(NearMiss(version=2),
#                          LinearSVC(random_state=42))
# pipeline.fit(X_train, y_train)

#Step 5:    Build regression model       #---------------------Modelling---------------------#  RamdonForest > unlimitted depth and leaf
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    #DecisionTreeClassifier(max_depth=2), n_estimators=200,learning_rate=0.05, random_state=42)
    DecisionTreeClassifier(), n_estimators=200,learning_rate=0.05, random_state=42)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

# # #5.1 - Hyperparameter Tuning (Max Depth)#---------------------Modelling---------------------#  Find Max depth
# # parameters = {'max_depth' : list(range(1,11))}
# # from sklearn.model_selection import GridSearchCV
# # search = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,cv=10)
# # search.fit(X_train,y_train)
# # print('search.best_score_:' , search.best_score_)
# # print('search.best_params_:', search.best_params_)

# # #5.2 - Hyperparameter Tuning (Max Depth)#---------------------Modelling---------------------#  List all score of diff depth
# # # List of values to try for max_depth:
# # max_depth_range = list(range(1, 11))
# # # List to store the accuracy for each value of max_depth:
# # accuracy = []
# # for depth in max_depth_range:
    
# #     clf = DecisionTreeClassifier(max_depth = depth, random_state=0)
# #     clf.fit(X_train, y_train)
# #     score = clf.score(X_test, y_test)
# #     accuracy.append(score)

#Step 6:    Classification Metrics        #---------------------Score---------------------#
from sklearn.metrics import confusion_matrix
print('confusion_matrix:    \n',    confusion_matrix(y_test, y_pred))
print('''
True positive      False positive
False Negative      True negative
''')
from sklearn.metrics import classification_report
print('classification_report:   \n',    classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('accuracy_score:  \n',    accuracy_score(y_test, y_pred) , ' ; with data size of ' , X_train.shape)     #Same as logmodel.score(X_test,y_test)
#--Other models--

0.0    111138
1.0      9985
Name: TARGET, dtype: int64
0.0    27785
1.0     2496
Name: TARGET, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Training target statistics: Counter({'TARGET': 1})
Testing target statistics: Counter({'TARGET': 1})
confusion_matrix:    
 [[25306  2479]
 [ 2089   407]]

True positive      False positive
False Negative      True negative

classification_report:   
               precision    recall  f1-score   support

         0.0       0.92      0.91      0.92     27785
         1.0       0.14      0.16      0.15      2496

    accuracy                           0.85     30281
   macro avg       0.53      0.54      0.53     30281
weighted avg       0.86      0.85      0.85     30281

accuracy_score:  
 0.8491463293814603  ; with data size of  (121123, 227)


In [99]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [91]:
#https://imbalanced-learn.org/stable/auto_examples/applications/plot_multi_class_under_sampling.html
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Create a folder to fetch the dataset
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 25, 1: 50, 2: 50},
                      random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

Automatically created module for IPython interactive environment
Training target statistics: Counter({1: 38, 2: 38, 0: 17})
Testing target statistics: Counter({1: 12, 2: 12, 0: 8})
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       1.00      0.83      1.00      0.91      0.91      0.82        12
          2       0.86      1.00      0.90      0.92      0.95      0.91        12

avg / total       0.95      0.94      0.96      0.94      0.95      0.90        32



In [112]:
# from sklearn.model_selection import StratifiedKFold
# from lightgbm import LGBMClassifier
# folds = StratifiedKFold(n_splits= 10, shuffle=True, random_state=2020)
# sub_preds = np.zeros(df_test.shape[0])
# feats = [f for f in df_train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

# for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[feats], df_train['TARGET'])):
#     train_x, train_y = df_train[feats].iloc[train_idx], df_train['TARGET'].iloc[train_idx]
#     valid_x, valid_y = df_train[feats].iloc[valid_idx], df_train['TARGET'].iloc[valid_idx]

#     lgb = LGBMClassifier(nthread=4, n_estimators=12000, learning_rate=0.02, num_leaves=31,
#         colsample_bytree=0.85,subsample=0.9, max_depth=8, reg_alpha=0.0415, reg_lambda=0.073,
#         min_split_gain=0.022, min_child_weight=39.32, silent=-1, verbose=-1)

#     lgb.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
#         eval_metric= 'auc', verbose= 200, early_stopping_rounds= 100)

#     sub_preds += lgb.predict_proba(df_test[feats], num_iteration=lgb.best_iteration_)[:, 1] / folds.n_splits

#     del lgb, train_x, train_y, valid_x, valid_y
#     gc.collect()

# df_test['TARGET'] = sub_preds

#Step 4:    Split data                    #---------------------Modelling---------------------#

# Import module to split dataset------------------------------------
from sklearn.model_selection import train_test_split

# Split data set into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 , stratify=y)       #

#Step 5:   LGBMClassifier                    #---------------------Modelling---------------------#

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
folds = StratifiedKFold(n_splits= 10, shuffle=True, random_state=2020)
#sub_preds = np.zeros(df_test.shape[0])


lgb = LGBMClassifier(nthread=4, n_estimators=12000, learning_rate=0.02, num_leaves=31,
        colsample_bytree=0.85,subsample=0.9, max_depth=8, reg_alpha=0.0415, reg_lambda=0.073,
        min_split_gain=0.022, min_child_weight=39.32, silent=-1, verbose=-1)

lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], 
        eval_metric= 'auc', verbose= 200, early_stopping_rounds= 100)

print( lgb.predict_proba(X, num_iteration=lgb.best_iteration_)[:, 1] / folds.n_splits  )





LightGBMError: Do not support special JSON characters in feature name.

In [122]:

# Import module to split dataset------------------------------------
from sklearn.model_selection import train_test_split

# Split data set into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 , stratify=y)       #

import re
X_train = X_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

opt_parameters = {'colsample_bytree': 0.9234, 'min_child_samples': 399, 'min_child_weight': 0.1, 'num_leaves': 13, 'reg_alpha': 2, 'reg_lambda': 5, 'subsample': 0.855,'imbalanced':True}

clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000, colsample_bytree= 0.9234, min_child_samples= 399, min_child_weight= 0.1, num_leaves=13, reg_alpha= 2, reg_lambda= 5,subsample= 0.855,imbalanced=True)
#clf_final = lgb.LGBMClassifier(**clf.get_params())
#set optimal parameters
#clf_final.set_params(**opt_parameters)

#Train the final model with learning rate decay
#clf_final.fit(X_train, y_train, **fit_params, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])
#clf_final.fit(X_train, y_train)
clf.fit(X_train,y_train)

X_test = X_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

y_pred = clf.predict(X_test,y_test)


from sklearn.metrics import confusion_matrix
print('confusion_matrix:    \n',    confusion_matrix(y_test, y_pred))
print('''
True positive      False positive
False Negative      True negative
''')
from sklearn.metrics import classification_report
print('classification_report:   \n',    classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('accuracy_score:  \n',    accuracy_score(y_test, y_pred) , ' ; with data size of ' , X_train.shape)     #Same as logmodel.score(X_test,y_test)
#--Other models--

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().