In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pickle
import matplotlib
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler



In [14]:
DF = pd.read_csv('Datasets/credit_score_br.csv')

In [15]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Applicant_ID          25128 non-null  int64 
 1   Applicant_Gender      25128 non-null  object
 2   Owned_Car             25128 non-null  int64 
 3   Owned_Realty          25128 non-null  int64 
 4   Total_Children        25128 non-null  int64 
 5   Total_Income          25128 non-null  int64 
 6   Income_Type           25128 non-null  object
 7   Education_Type        25128 non-null  object
 8   Family_Status         25128 non-null  object
 9   Housing_Type          25128 non-null  object
 10  Owned_Mobile_Phone    25128 non-null  int64 
 11  Owned_Work_Phone      25128 non-null  int64 
 12  Owned_Phone           25128 non-null  int64 
 13  Owned_Email           25128 non-null  int64 
 14  Job_Title             25128 non-null  object
 15  Total_Family_Members  25128 non-null

In [16]:
DF.describe()

Unnamed: 0,Applicant_ID,Owned_Car,Owned_Realty,Total_Children,Total_Income,Owned_Mobile_Phone,Owned_Work_Phone,Owned_Phone,Owned_Email,Total_Family_Members,Applicant_Age,Years_of_Working,Total_Bad_Debt,Total_Good_Debt,Status
count,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0,25128.0
mean,5078835.0,0.418378,0.654927,0.509472,194836.5,1.0,0.273758,0.292741,0.100684,2.291309,40.995503,7.690903,0.333095,21.057545,0.995185
std,41943.78,0.493303,0.475402,0.762937,104521.1,0.0,0.445895,0.45503,0.300916,0.928871,9.552832,6.415007,1.569448,14.735347,0.069227
min,5008806.0,0.0,0.0,0.0,27000.0,1.0,0.0,0.0,0.0,1.0,21.0,1.0,0.0,1.0,0.0
25%,5042226.0,0.0,0.0,0.0,135000.0,1.0,0.0,0.0,0.0,2.0,33.0,3.0,0.0,9.0,1.0
50%,5079004.0,0.0,1.0,0.0,180000.0,1.0,0.0,0.0,0.0,2.0,40.0,6.0,0.0,18.0,1.0
75%,5115603.0,1.0,1.0,1.0,225000.0,1.0,1.0,1.0,0.0,3.0,48.0,10.0,0.0,31.0,1.0
max,5150487.0,1.0,1.0,5.0,1575000.0,1.0,1.0,1.0,1.0,7.0,68.0,44.0,49.0,61.0,1.0


In [17]:
X = DF[['Total_Income','Total_Family_Members','Years_of_Working','Total_Bad_Debt','Total_Good_Debt']]

y = DF['Status']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Resampling

In [19]:
ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
x_ros, y_ros = ros.fit_resample(X_train, y_train)

print('Original dataset shape', y_train.shape)
print('Resample dataset shape', y_ros.shape)

Original dataset shape (20102,)
Resample dataset shape (39998,)


In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X_train, y_train)

In [21]:
forest = RandomForestClassifier(max_depth=2, random_state=0)
forest.fit(x_smote, y_smote)
pred_forest = forest.predict(X_test)
accuracy_score(y_test,pred_forest)

0.9745324313569439

In [22]:
x_smote

Unnamed: 0,Total_Income,Total_Family_Members,Years_of_Working,Total_Bad_Debt,Total_Good_Debt
0,405000,2,2,0,7
1,405000,2,6,0,1
2,121500,2,1,6,22
3,135000,3,4,0,9
4,121500,1,34,0,9
...,...,...,...,...,...
39993,585000,2,3,20,16
39994,225000,1,5,7,7
39995,72000,2,1,18,13
39996,225000,1,8,2,2


## PICKLE

In [23]:
import pickle

In [24]:
pickle.dump(forest, open('model.pkl', 'wb'))