In [1]:

# Data wrangling
import pandas as pd
import numpy as np
import missingno
from collections import Counter

# Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning models
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix

# Model evaluation
from sklearn.model_selection import cross_val_score

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(r"C:\Users\AYODEJI OYINBO & CO\Desktop\Credit score\train.csv")

In [3]:
test = pd.read_csv(r"C:\Users\AYODEJI OYINBO & CO\Desktop\Credit score\test.csv")

In [4]:
sample_submission = pd.read_csv(r"C:\Users\AYODEJI OYINBO & CO\Desktop\Credit score\sample_submission.csv")

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            708 non-null    int64  
 1   checking_balance      427 non-null    float64
 2   months_loan_duration  708 non-null    int64  
 3   credit_history        708 non-null    object 
 4   purpose               708 non-null    object 
 5   amount                708 non-null    int64  
 6   savings_balance       579 non-null    float64
 7   employment_length     661 non-null    object 
 8   installment_rate      708 non-null    int64  
 9   personal_status       498 non-null    object 
 10  other_debtors         708 non-null    object 
 11  residence_history     621 non-null    object 
 12  property              708 non-null    object 
 13  age                   708 non-null    int64  
 14  installment_plan      708 non-null    object 
 15  housing               7

In [6]:
train.describe()

Unnamed: 0.1,Unnamed: 0,checking_balance,months_loan_duration,amount,savings_balance,installment_rate,age,existing_credits,default,dependents,telephone,Id
count,708.0,427.0,708.0,708.0,579.0,708.0,708.0,708.0,708.0,708.0,300.0,708.0
mean,489.059322,103.133489,20.772599,3237.413842,789.058722,2.966102,35.877119,1.423729,0.298023,1.153955,2344898000.0,489.059322
std,288.583071,220.425812,11.957515,2740.753878,2997.126937,1.116411,11.556534,0.595696,0.457713,0.361161,3043729.0,288.583071
min,0.0,-50.0,4.0,250.0,0.0,1.0,19.0,1.0,0.0,1.0,2340000000.0,0.0
25%,236.5,-21.0,12.0,1374.0,32.0,2.0,27.0,1.0,0.0,1.0,2341990000.0,236.5
50%,482.5,26.0,18.0,2309.0,65.0,3.0,33.0,1.0,0.0,1.0,2344832000.0,482.5
75%,738.25,130.5,24.0,3960.5,205.0,4.0,42.0,2.0,1.0,1.0,2347731000.0,738.25
max,998.0,999.0,72.0,18424.0,19970.0,4.0,75.0,4.0,1.0,2.0,2349986000.0,998.0


In [7]:
sample_submission.head()

Unnamed: 0,Id,default
0,2,0
1,4,1
2,6,1
3,13,1
4,18,1


In [8]:
train.isnull().sum().sort_values(ascending = False)

telephone               408
checking_balance        281
personal_status         210
savings_balance         129
residence_history        87
employment_length        47
Unnamed: 0                0
housing                   0
gender                    0
job                       0
foreign_worker            0
dependents                0
default                   0
existing_credits          0
property                  0
installment_plan          0
age                       0
other_debtors             0
installment_rate          0
amount                    0
purpose                   0
credit_history            0
months_loan_duration      0
Id                        0
dtype: int64

In [9]:
train.head()

Unnamed: 0.1,Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,...,installment_plan,housing,existing_credits,default,dependents,telephone,foreign_worker,job,gender,Id
0,0,-43.0,6,critical,radio/tv,1169,,13 years,4,single,...,none,own,2,0,1,2349340000.0,yes,skilled employee,male,0
1,1,75.0,48,repaid,radio/tv,5951,89.0,2 years,2,,...,none,own,1,1,1,,yes,skilled employee,female,1
2,3,-32.0,42,repaid,furniture,7882,9.0,5 years,2,single,...,none,for free,1,0,2,,yes,skilled employee,male,3
3,5,,36,repaid,education,9055,,2 years,2,single,...,none,for free,1,0,2,2345788000.0,yes,unskilled resident,male,5
4,7,169.0,36,repaid,car (used),6948,57.0,2 years,2,single,...,none,rent,1,0,1,2349962000.0,yes,mangement self-employed,male,7


In [10]:
train.columns

Index(['Unnamed: 0', 'checking_balance', 'months_loan_duration',
       'credit_history', 'purpose', 'amount', 'savings_balance',
       'employment_length', 'installment_rate', 'personal_status',
       'other_debtors', 'residence_history', 'property', 'age',
       'installment_plan', 'housing', 'existing_credits', 'default',
       'dependents', 'telephone', 'foreign_worker', 'job', 'gender', 'Id'],
      dtype='object')

In [11]:
train['employment_length'].unique()

array(['13 years', '2 years', '5 years', '4 years', nan, '6 months',
       '5 months', '3 years', '1 years', '17 years', '3 months',
       '9 years', '4 months', '10 months', '11 years', '7 months',
       '19 years', '7 years', '14 years', '1 months', '18 years',
       '16 years', '15 years', '6 years', '8 years', '12 years',
       '2 months', '0 months', '9 months', '10 years', '8 months',
       '11 months'], dtype=object)

In [12]:
'13 years'.split()

['13', 'years']

In [13]:
def treat_emp_length(x):
    x = str(x).split()
    if x[0]== 'nan':
        return 0
    else:
        if x[1] == 'years':
            return int(x[0])*12
        else: 
            return int(x[0])
        

In [14]:
train['treated_emp_length'] = train['employment_length'].apply(treat_emp_length)

In [15]:
train['treated_emp_length']

0      156
1       24
2       60
3       24
4       24
      ... 
703      0
704    156
705    204
706    156
707     24
Name: treated_emp_length, Length: 708, dtype: int64

In [16]:
train['credit_history'].value_counts()


repaid                    373
critical                  209
delayed                    66
fully repaid this bank     34
fully repaid               26
Name: credit_history, dtype: int64

In [17]:
def merge_cred_hist(x):
    if x == "fully repaid this bank" or x == "fully repaid":
        return "fully repaid"
    else:
        return x

In [18]:
train['credit_history_new'] = train['credit_history'].apply(merge_cred_hist)

In [19]:
train['credit_history_new'].value_counts()

repaid          373
critical        209
delayed          66
fully repaid     60
Name: credit_history_new, dtype: int64

In [20]:
train['purpose'].value_counts()

radio/tv               199
car (new)              177
furniture              114
business                75
car (used)              72
education               37
repairs                 16
domestic appliances      8
others                   7
retraining               3
Name: purpose, dtype: int64

In [21]:
def merge_purpose(x):
    if x == "repairs" or x == "domestic appliances" or x == "others" or x == "retraining":
        return "others"
    else:
        return x

In [22]:
train['merge_purpose'] = train['purpose'].apply(merge_purpose)

In [23]:
train['merge_purpose'].value_counts()

radio/tv      199
car (new)     177
furniture     114
business       75
car (used)     72
education      37
others         34
Name: merge_purpose, dtype: int64

In [24]:
df_obj = train.select_dtypes(include=['object'])

cols = df_obj.columns

for i in cols:
      print(train[i].value_counts())

repaid                    373
critical                  209
delayed                    66
fully repaid this bank     34
fully repaid               26
Name: credit_history, dtype: int64
radio/tv               199
car (new)              177
furniture              114
business                75
car (used)              72
education               37
repairs                 16
domestic appliances      8
others                   7
retraining               3
Name: purpose, dtype: int64
1 years      86
2 years      80
3 years      74
7 years      41
5 years      35
4 years      27
6 years      26
13 years     20
11 years     18
19 years     17
14 years     15
6 months     15
10 years     15
8 years      15
7 months     14
18 years     14
12 years     14
4 months     14
9 years      13
3 months     13
15 years     11
2 months     11
16 years     10
9 months     10
8 months     10
0 months      9
17 years      9
1 months      7
10 months     7
5 months      6
11 months     5
Name: employment_leng

In [25]:
def treat_res_hist(x):
    x = str(x).split()
    if x[0]== 'nan':
        return 0
    else:
        if x[1] == 'years':
            return int(x[0])*12
        else: 
            return int(x[0])
       

In [26]:
 train['treated_residence_history'] = train['residence_history'].apply(treat_res_hist)

In [27]:
 train['treated_residence_history'].value_counts()

0      114
24      31
8       28
12      25
36      24
48      23
1       20
9       19
156     19
120     18
96      18
180     18
11      18
216     18
3       18
2       18
132     17
192     16
168     16
264     15
276     15
5       15
4       15
288     14
60      14
10      14
7       14
6       14
144     13
204     13
240     13
84      13
108     12
72      12
228     12
252     12
Name: treated_residence_history, dtype: int64

In [28]:
train.columns

Index(['Unnamed: 0', 'checking_balance', 'months_loan_duration',
       'credit_history', 'purpose', 'amount', 'savings_balance',
       'employment_length', 'installment_rate', 'personal_status',
       'other_debtors', 'residence_history', 'property', 'age',
       'installment_plan', 'housing', 'existing_credits', 'default',
       'dependents', 'telephone', 'foreign_worker', 'job', 'gender', 'Id',
       'treated_emp_length', 'credit_history_new', 'merge_purpose',
       'treated_residence_history'],
      dtype='object')

In [29]:
train['telephone'].isnull().sum()

408

In [30]:
def check_tele (x): 
    if x > 0:
        return 0
    else:
        return 1

In [31]:
train['tele_check'] = train['telephone'].apply(check_tele)

In [32]:
train[['tele_check','telephone']]

Unnamed: 0,tele_check,telephone
0,0,2.349340e+09
1,1,
2,1,
3,0,2.345788e+09
4,0,2.349962e+09
...,...,...
703,0,2.341421e+09
704,1,
705,0,2.341832e+09
706,1,


In [33]:
train.drop(['Unnamed: 0', 'credit_history', 'purpose', 
       'employment_length', 'residence_history', 'telephone', 'Id'], axis = 1, inplace = True)

In [34]:
train.columns

Index(['checking_balance', 'months_loan_duration', 'amount', 'savings_balance',
       'installment_rate', 'personal_status', 'other_debtors', 'property',
       'age', 'installment_plan', 'housing', 'existing_credits', 'default',
       'dependents', 'foreign_worker', 'job', 'gender', 'treated_emp_length',
       'credit_history_new', 'merge_purpose', 'treated_residence_history',
       'tele_check'],
      dtype='object')

In [35]:
train.isnull().sum()

checking_balance             281
months_loan_duration           0
amount                         0
savings_balance              129
installment_rate               0
personal_status              210
other_debtors                  0
property                       0
age                            0
installment_plan               0
housing                        0
existing_credits               0
default                        0
dependents                     0
foreign_worker                 0
job                            0
gender                         0
treated_emp_length             0
credit_history_new             0
merge_purpose                  0
treated_residence_history      0
tele_check                     0
dtype: int64

In [36]:
train['checking_balance'].describe()

count    427.000000
mean     103.133489
std      220.425812
min      -50.000000
25%      -21.000000
50%       26.000000
75%      130.500000
max      999.000000
Name: checking_balance, dtype: float64

In [37]:
train.shape

(708, 22)

In [38]:
train['checking_balance'].fillna(-9999, inplace = True)

In [39]:
train[train['savings_balance'] == 0]

Unnamed: 0,checking_balance,months_loan_duration,amount,savings_balance,installment_rate,personal_status,other_debtors,property,age,installment_plan,...,default,dependents,foreign_worker,job,gender,treated_emp_length,credit_history_new,merge_purpose,treated_residence_history,tele_check
248,199.0,13,882,0.0,4,single,guarantor,real estate,23,none,...,0,1,yes,skilled employee,male,1,critical,radio/tv,276,1
266,-9999.0,18,3850,0.0,3,single,none,other,27,none,...,0,1,yes,skilled employee,male,60,critical,car (used),0,1
422,-41.0,18,2039,0.0,1,,none,real estate,20,bank,...,1,1,yes,skilled employee,female,36,repaid,furniture,108,1
436,-9999.0,36,3349,0.0,4,,none,other,28,none,...,1,1,yes,mangement self-employed,female,12,repaid,furniture,5,0
516,-9999.0,18,1505,0.0,4,single,none,unknown/none,32,none,...,0,1,yes,mangement self-employed,male,24,repaid,radio/tv,3,0
591,-36.0,21,1602,0.0,4,married,none,other,30,none,...,0,1,yes,skilled employee,male,144,critical,car (new),12,0
597,-39.0,12,1082,0.0,4,single,none,other,48,bank,...,1,1,yes,skilled employee,male,36,fully repaid,car (new),132,1


In [40]:
train['savings_balance'].fillna(-9999, inplace = True)

In [41]:
train['personal_status'].value_counts()

single      401
married      64
divorced     33
Name: personal_status, dtype: int64

In [42]:
train['personal_status'].fillna("missing", inplace = True)

In [43]:
train.isnull().sum()

checking_balance             0
months_loan_duration         0
amount                       0
savings_balance              0
installment_rate             0
personal_status              0
other_debtors                0
property                     0
age                          0
installment_plan             0
housing                      0
existing_credits             0
default                      0
dependents                   0
foreign_worker               0
job                          0
gender                       0
treated_emp_length           0
credit_history_new           0
merge_purpose                0
treated_residence_history    0
tele_check                   0
dtype: int64

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
df_obj = train.select_dtypes(include = ['object'])
drop_enc = OneHotEncoder(drop = 'first') .fit(df_obj)
#drop_enc.transform(df_obj).toarray()

In [46]:
drop_enc = drop_enc.transform(df_obj)
#pd.DataFrame(drop_enc.toarray(), columns = drop_enc.categories_)

In [47]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [48]:
from category_encoders.one_hot import OneHotEncoder

In [49]:
cols_encoding = train.select_dtypes(include='object').columns
ohe = OneHotEncoder(cols=cols_encoding)

In [50]:
X = train.drop('default', axis =1)
y = train['default']

In [51]:
encoded = ohe.fit_transform(X)

In [52]:
encoded

Unnamed: 0,checking_balance,months_loan_duration,amount,savings_balance,installment_rate,personal_status_1,personal_status_2,personal_status_3,personal_status_4,other_debtors_1,...,credit_history_new_4,merge_purpose_1,merge_purpose_2,merge_purpose_3,merge_purpose_4,merge_purpose_5,merge_purpose_6,merge_purpose_7,treated_residence_history,tele_check
0,-43.0,6,1169,-9999.0,4,1,0,0,0,1,...,0,1,0,0,0,0,0,0,72,0
1,75.0,48,5951,89.0,2,0,1,0,0,1,...,0,1,0,0,0,0,0,0,5,1
2,-32.0,42,7882,9.0,2,1,0,0,0,0,...,0,0,1,0,0,0,0,0,156,1
3,-9999.0,36,9055,-9999.0,2,1,0,0,0,1,...,0,0,0,1,0,0,0,0,96,0
4,169.0,36,6948,57.0,2,1,0,0,0,1,...,0,0,0,0,1,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,-12.0,24,6579,21.0,4,1,0,0,0,1,...,0,0,0,0,1,0,0,0,10,0
704,97.0,24,1743,40.0,4,1,0,0,0,1,...,0,1,0,0,0,0,0,0,4,1
705,-9999.0,12,2390,-9999.0,4,1,0,0,0,1,...,0,0,0,0,0,1,0,0,12,0
706,-9999.0,12,804,44.0,4,1,0,0,0,1,...,0,1,0,0,0,0,0,0,192,1


In [53]:
encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   checking_balance           708 non-null    float64
 1   months_loan_duration       708 non-null    int64  
 2   amount                     708 non-null    int64  
 3   savings_balance            708 non-null    float64
 4   installment_rate           708 non-null    int64  
 5   personal_status_1          708 non-null    int64  
 6   personal_status_2          708 non-null    int64  
 7   personal_status_3          708 non-null    int64  
 8   personal_status_4          708 non-null    int64  
 9   other_debtors_1            708 non-null    int64  
 10  other_debtors_2            708 non-null    int64  
 11  other_debtors_3            708 non-null    int64  
 12  property_1                 708 non-null    int64  
 13  property_2                 708 non-null    int64  

In [54]:
#test processing

In [55]:
def treat_emp_length(x):
    x = str(x).split()
    if x[0]== 'nan':
        return 0
    else:
        if x[1] == 'years':
            return int(x[0])*12
        else: 
            return int(x[0])

In [56]:
test['treated_emp_length'] = test['employment_length'].apply(treat_emp_length)

In [57]:
test['credit_history_new'] = test['credit_history'].apply(merge_cred_hist)

In [58]:
test['merge_purpose'] = test['purpose'].apply(merge_purpose)

In [59]:
 test['treated_residence_history'] = test['residence_history'].apply(treat_res_hist)

In [60]:
test['tele_check'] = test['telephone'].apply(check_tele)

In [61]:
test.drop(['Unnamed: 0', 'credit_history', 'purpose', 
       'employment_length', 'residence_history', 'telephone', 'Id'], axis = 1, inplace = True)

In [62]:
test['checking_balance'].fillna(-9999, inplace = True)

In [63]:
test['savings_balance'].fillna(-9999, inplace = True)

In [64]:
test['personal_status'].fillna("missing", inplace = True)

In [65]:
encoded_test = ohe.transform(test)

In [66]:
encoded_test

Unnamed: 0,checking_balance,months_loan_duration,amount,savings_balance,installment_rate,personal_status_1,personal_status_2,personal_status_3,personal_status_4,other_debtors_1,...,credit_history_new_4,merge_purpose_1,merge_purpose_2,merge_purpose_3,merge_purpose_4,merge_purpose_5,merge_purpose_6,merge_purpose_7,treated_residence_history,tele_check
0,-9999.0,12,2096,24.0,2,1,0,0,0,1,...,0,0,0,1,0,0,0,0,48,1
1,-23.0,24,4870,43.0,3,1,0,0,0,1,...,1,0,0,0,0,1,0,0,156,1
2,-9999.0,24,2835,761.0,3,1,0,0,0,1,...,0,0,1,0,0,0,0,0,144,1
3,-36.0,24,1199,32.0,4,1,0,0,0,1,...,0,0,0,0,0,1,0,0,288,1
4,83.0,24,12579,16.0,4,0,1,0,0,1,...,0,0,0,0,1,0,0,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,-37.0,18,1936,-9999.0,2,0,0,0,1,1,...,0,1,0,0,0,0,0,0,252,1
288,-18.0,36,3959,75.0,4,1,0,0,0,1,...,0,0,1,0,0,0,0,0,36,0
289,-9999.0,12,1736,48.0,3,0,1,0,0,1,...,0,0,1,0,0,0,0,0,240,1
290,-30.0,30,3857,20.0,4,0,0,1,0,1,...,0,0,0,0,1,0,0,0,60,0


In [67]:
#Model Building

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(encoded, y, test_size=0.2, random_state=42)

In [70]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
model = RandomForestClassifier()

In [72]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [73]:
model.score(X_train, y_train)

1.0

In [74]:
predict = model.predict(X_test)

In [75]:
from sklearn.metrics import classification_report

In [76]:
print (classification_report(predict, y_test))

              precision    recall  f1-score   support

           0       0.91      0.79      0.85       122
           1       0.30      0.55      0.39        20

    accuracy                           0.75       142
   macro avg       0.61      0.67      0.62       142
weighted avg       0.83      0.75      0.78       142



In [77]:
model.fit(encoded, y)

RandomForestClassifier()

In [78]:
model.score(encoded,y)

1.0

In [79]:
predict = model.predict(encoded_test)

In [80]:
predict

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1], dtype=int64)