In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.tree import export_graphviz
from IPython.display import Image

In [3]:
import pydotplus

In [4]:
import graphviz

In [2]:
df=pd.read_pickle("C:/Users/sai/Downloads/gmsc_clean.pkl")
df=df.sample(frac=1,random_state=4).reset_index(drop=True)
df.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberRealEstateLoansOrLines,NumberOfDependents
0,132394,0,1.0,76,0.31934,2000.0,6,0,0.0
1,93855,0,0.266782,44,0.154811,3668.0,7,0,2.0
2,106376,0,0.479971,74,0.28883,10500.0,11,1,0.0
3,7391,0,0.460477,42,0.204466,6000.0,10,0,1.0
4,84921,0,0.392186,39,0.355366,4583.0,5,1,1.0


In [3]:
df.shape

(150000, 9)

In [4]:
df.size

1350000

In [5]:
df.isnull().sum()

ID                                      0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfDependents                      0
dtype: int64

In [6]:
df.duplicated().value_counts()

False    150000
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 9 columns):
ID                                      150000 non-null int64
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           150000 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfDependents                      150000 non-null float64
dtypes: float64(4), int64(5)
memory usage: 10.3 MB


In [8]:
df['SeriousDlqin2yrs'].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

In [9]:
df['SeriousDlqin2yrs'].value_counts(normalize=True)

0    0.93316
1    0.06684
Name: SeriousDlqin2yrs, dtype: float64

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [11]:
y=df['SeriousDlqin2yrs']
x=df.drop(['ID','SeriousDlqin2yrs'],axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [12]:
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,roc_curve

In [13]:
rfc=RandomForestClassifier(n_estimators=100,random_state=4)
rfc.fit(x_train,y_train)

y_train_pred=rfc.predict(x_train)
y_train_prob=rfc.predict_proba(x_train)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_train,y_train_pred))
print('train - accuracy score : ','\n', accuracy_score(y_train,y_train_pred))
print('train - AUC : ', roc_auc_score(y_train,y_train_prob))

train - confusion matrix :  
 [[98011     0]
 [   10  6979]]
train - accuracy score :  
 0.9999047619047619
train - AUC :  0.9999999576641643


In [14]:
y_test_pred=rfc.predict(x_test)
y_test_prob=rfc.predict_proba(x_test)[:,1]
print('test - confusion matrix : ','\n',confusion_matrix(y_test,y_test_pred))
print('test - accuracy score : ','\n', accuracy_score(y_test,y_test_pred))
print('test - AUC : ', roc_auc_score(y_test,y_test_prob))

test - confusion matrix :  
 [[41872    91]
 [ 3006    31]]
test - accuracy score :  
 0.9311777777777778
test - AUC :  0.7257853283437655


# tuning random forest

In [15]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

rfc=RandomForestClassifier()

param_dist={'n_estimators':sp_randint(25,250),'max_features':sp_randint(1,7),'max_depth':sp_randint(2,15),
           'min_samples_leaf':sp_randint(1,20),'min_samples_split':sp_randint(2,50),'criterion':['gini','entropy']}

r_search=RandomizedSearchCV(estimator=rfc, param_distributions=param_dist, scoring='roc_auc', n_iter=10,cv=3,n_jobs=-1,random_state=4)

print(r_search.fit(x,y))
print(r_search.best_params_)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231EB1E5548>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231EB1E5248>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231EA01EFC8>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231EB1E57C8>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231EB1E5108>},
                   random_state=4, scoring='roc_auc')
{'criterion': 'entropy', 'max_depth': 9, 'max_features': 2, 'min_samples_leaf': 9, 'min_samples_split': 38, 'n_estimators': 203}


In [20]:
rfc=RandomForestClassifier(**r_search.best_params_,random_state=4)
rfc.fit(x_train,y_train)

y_train_pred=rfc.predict(x_train)
y_train_prob=rfc.predict_proba(x_train)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_train,y_train_pred))
print('train - accuracy score : ','\n', accuracy_score(y_train,y_train_pred))
print('train - AUC : ', roc_auc_score(y_train,y_train_prob))

train - confusion matrix :  
 [[98011     0]
 [ 6989     0]]
train - accuracy score :  
 0.9334380952380953
train - AUC :  0.8137380477377394


In [21]:
y_test_pred=rfc.predict(x_test)
y_test_prob=rfc.predict_proba(x_test)[:,1]
print('test - confusion matrix : ','\n',confusion_matrix(y_test,y_test_pred))
print('test - accuracy score : ','\n', accuracy_score(y_test,y_test_pred))
print('test - AUC : ', roc_auc_score(y_test,y_test_prob))

test - confusion matrix :  
 [[41963     0]
 [ 3037     0]]
test - accuracy score :  
 0.9325111111111111
test - AUC :  0.760016748373222
