#### Import libraries

In [1]:

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



#### Fetch Training data file

In [2]:
# Load the raw training set into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cs-training.csv')

#### Explore data

In [3]:
# List the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'SeriousDlqin2yrs',
       'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [4]:
# 'Unnamed: 0' is discarded here; it is not used as a feature anywhere below.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
# (rows, columns) of the frame at this point.
df.shape

(150000, 11)

In [7]:
# Count missing values per column.
df.isnull().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [8]:
# Fill missing MonthlyIncome values with the column median.
income_median = df['MonthlyIncome'].median()
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(value=income_median)

In [9]:
# Fill missing NumberOfDependents values with the column median.
dependents_median = df['NumberOfDependents'].median()
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(value=dependents_median)

In [10]:
# To remove outliers
# To remove outliers: flag every row that breaches any cut-off, then drop
# all flagged rows from `df` in one in-place pass.
outlier_rows = (
    (df['age'] < 18)
    | (df['MonthlyIncome'] > 1000000)
    | (df["DebtRatio"] > 23000)
    | (df['RevolvingUtilizationOfUnsecuredLines'] > 15000)
)
df.drop(index=df[outlier_rows].index, inplace=True)

In [11]:
# The plotting code below is wrapped in a string literal, so nothing is drawn;
# the cell merely evaluates to (and echoes) that string.
"""
plt.figure(figsize=(10,10))
plt.scatter(x=df['RevolvingUtilizationOfUnsecuredLines'],y=df['MonthlyIncome'])
plt.xlabel('RevolvingUtilizationOfUnsecuredLines')
plt.ylabel('MonthlyIncome')

plt.show()
"""

"\nplt.figure(figsize=(10,10))\nplt.scatter(x=df['RevolvingUtilizationOfUnsecuredLines'],y=df['MonthlyIncome'])\nplt.xlabel('RevolvingUtilizationOfUnsecuredLines')\nplt.ylabel('MonthlyIncome')\n\nplt.show()\n"

In [12]:
# Work on a copy so the cleaned frame `df` is left as it is.
df1 = df.copy()

past_due_cols = [
    'NumberOfTimes90DaysLate',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
]

# Collapse the three past-due columns into a single summed feature and
# discard the originals.
# NOTE(review): the inputs look like event counts, so 'TotalPastDueDays'
# is presumably a count rather than a number of days — confirm.
df1['TotalPastDueDays'] = (
    df1[past_due_cols[0]] + df1[past_due_cols[1]] + df1[past_due_cols[2]]
)
df1 = df1.drop(columns=past_due_cols)

In [13]:
# Feature columns fed to the model, in this order.
feature_cols = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberRealEstateLoansOrLines',
    'NumberOfDependents',
    'TotalPastDueDays',
]
predictor_var = df1[feature_cols]
response_var = df1['SeriousDlqin2yrs']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
predictor_train, predictor_test, response_train, response_test = train_test_split(
    predictor_var,
    response_var,
    test_size=0.3,
    random_state=0,
)

In [14]:
# Fit a logistic regression on the training split, allowing up to 2000 solver iterations.
log_regression = LogisticRegression(max_iter=2000)
log_regression.fit(X=predictor_train, y=response_train)

In [15]:
# Predict class labels for the held-out split.
response_pred = log_regression.predict(X=predictor_test)

In [16]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
holdout_confusion = metrics.confusion_matrix(y_true=response_test, y_pred=response_pred)
print(holdout_confusion)

[[42104    42]
 [ 2805    36]]


In [17]:
# Fraction of held-out rows whose predicted label matches the true label.
holdout_accuracy = metrics.accuracy_score(y_true=response_test, y_pred=response_pred)
print(holdout_accuracy)

0.9367150510147376


### With Training & Testing File

In [18]:
# Load the test file and give it the same feature layout as the training frame.
test_df = pd.read_csv('cs-test.csv')

past_due_cols = [
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTime30-59DaysPastDueNotWorse',
]
test_df['TotalPastDueDays'] = (
    test_df['NumberOfTimes90DaysLate']
    + test_df['NumberOfTime30-59DaysPastDueNotWorse']
    + test_df['NumberOfTime60-89DaysPastDueNotWorse']
)

# Drop the summed source columns together with the index and target columns.
test_df = test_df.drop(columns=past_due_cols + ['Unnamed: 0', 'SeriousDlqin2yrs'])

# Median-impute the two columns that contain missing values.
# NOTE(review): the medians come from the test file itself, not from the
# training data — confirm this is intended.
for col in ('NumberOfDependents', 'MonthlyIncome'):
    test_df[col] = test_df[col].fillna(test_df[col].median())

test_df.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfDependents                      0
TotalPastDueDays                        0
dtype: int64

In [19]:
# Refit on the complete training data, then label the rows of the test file.
log1 = LogisticRegression(max_iter=2000).fit(predictor_var, response_var)

# Select the test columns in the same order the model was trained on.
response_pred = log1.predict(test_df[predictor_var.columns])

In [20]:
# Load the file whose 'Probability' column is used below as reference labels.
# NOTE(review): 'sampleEntry.csv' looks like a sample-submission file rather
# than ground truth — confirm before trusting metrics computed against it.
prob = pd.read_csv(filepath_or_buffer='sampleEntry.csv')

def encoding(x):
    """Turn a probability into a class label: 1 when x >= 0.5, otherwise 0."""
    return 1 if x >= 0.5 else 0
# Threshold every probability to a 0/1 label; `prob` ends up as a plain list
# and `probability` as the Series built from it.
prob = [encoding(p) for p in prob['Probability'].tolist()]
probability = pd.Series(prob)


In [21]:
# Confusion matrix of the test-file predictions against the thresholded
# 'Probability' labels (rows: reference label, columns: predicted label).
# NOTE(review): confirm these reference labels are real ground truth.
reference_confusion = metrics.confusion_matrix(y_true=probability, y_pred=response_pred)
print(reference_confusion)

[[99712    86]
 [ 1577   128]]


In [22]:
# Share of test-file predictions that agree with the thresholded 'Probability' labels.
# NOTE(review): confirm these reference labels are real ground truth.
reference_accuracy = metrics.accuracy_score(y_true=probability, y_pred=response_pred)
print(reference_accuracy)

0.9836162477956316
