#### IMport libraries

In [1]:

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [2]:
import statsmodels.api as sm

#### Fetch Training data file

In [3]:
df=pd.read_csv('cs-training.csv')

#### Explore data

In [4]:
df.columns

Index(['Unnamed: 0', 'SeriousDlqin2yrs',
       'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [5]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [7]:
df.shape

(150000, 11)

In [8]:
df.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [9]:
x=df['MonthlyIncome'].median()
df['MonthlyIncome']=df['MonthlyIncome'].fillna(x)

In [10]:
x=df['NumberOfDependents'].median()
df['NumberOfDependents']=df['NumberOfDependents'].fillna(x)

In [11]:
# To remove outliers
df.drop(df[df['age']<18].index,inplace=True)
df.drop(df[df['MonthlyIncome']>1000000].index,inplace=True)
df.drop(df[df["DebtRatio"]>23000].index,inplace=True)
df.drop(df[df['RevolvingUtilizationOfUnsecuredLines']>15000].index,inplace=True)

In [12]:
"""
plt.figure(figsize=(10,10))
plt.scatter(x=df['RevolvingUtilizationOfUnsecuredLines'],y=df['MonthlyIncome'])
plt.xlabel('RevolvingUtilizationOfUnsecuredLines')
plt.ylabel('MonthlyIncome')

plt.show()
"""

"\nplt.figure(figsize=(10,10))\nplt.scatter(x=df['RevolvingUtilizationOfUnsecuredLines'],y=df['MonthlyIncome'])\nplt.xlabel('RevolvingUtilizationOfUnsecuredLines')\nplt.ylabel('MonthlyIncome')\n\nplt.show()\n"

In [13]:
df1=df.copy()
df1['TotalPastDueDays']=df1['NumberOfTimes90DaysLate']+df1['NumberOfTime30-59DaysPastDueNotWorse']+df1['NumberOfTime60-89DaysPastDueNotWorse']
df1.drop(['NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTime30-59DaysPastDueNotWorse'],axis=1,inplace=True)

In [14]:
predictor_var=df1[['RevolvingUtilizationOfUnsecuredLines','age','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents','TotalPastDueDays']]
#predictor_var=df1[['RevolvingUtilizationOfUnsecuredLines','age','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents','TotalPastDueDays']]
#predictor_var=df1[['age','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents','TotalPastDueDays']]
#predictor_var=df1[['age','NumberOfDependents','TotalPastDueDays']]
#predictor_var=df1[['NumberOfDependents','TotalPastDueDays']]


response_var=df1['SeriousDlqin2yrs']



In [15]:
predictor_train,predictor_test,response_train,response_test = train_test_split(predictor_var,response_var,test_size=0.3,random_state=0)

## Logistic Regression

In [16]:
log_regression=LogisticRegression(max_iter=2000)
log_regression.fit(predictor_train,response_train)

In [17]:
response_pred=log_regression.predict(predictor_test)

In [18]:
print(metrics.confusion_matrix(response_test,response_pred))

[[42104    42]
 [ 2805    36]]


In [19]:
print(metrics.accuracy_score(response_test,response_pred))

0.9367150510147376


### With Training & Testing File

In [20]:
test_df=pd.read_csv('cs-test.csv')
test_df['TotalPastDueDays']=test_df['NumberOfTimes90DaysLate']+test_df['NumberOfTime30-59DaysPastDueNotWorse']+test_df['NumberOfTime60-89DaysPastDueNotWorse']
test_df.drop(['NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTime30-59DaysPastDueNotWorse'],axis=1,inplace=True)
test_df.drop(['Unnamed: 0','SeriousDlqin2yrs'],axis=1,inplace=True)
test_df['NumberOfDependents']=test_df['NumberOfDependents'].fillna(test_df['NumberOfDependents'].median())
test_df['MonthlyIncome']=test_df['MonthlyIncome'].fillna(test_df['MonthlyIncome'].median())

test_df.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfDependents                      0
TotalPastDueDays                        0
dtype: int64

In [21]:
log1=LogisticRegression(max_iter=2000)
log1.fit(predictor_var,response_var)
response_pred=log1.predict(test_df[['RevolvingUtilizationOfUnsecuredLines','age','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents','TotalPastDueDays']])

In [22]:
prob=pd.read_csv('sampleEntry.csv')

def  encoding(x):
    if x>=0.5:
        return 1
    return 0
prob=(prob['Probability']).tolist()
prob=[encoding(x) for x in prob]
probability=pd.Series(prob)


In [23]:
print(metrics.confusion_matrix(probability,response_pred))

[[99712    86]
 [ 1577   128]]


In [24]:
print(metrics.accuracy_score(probability,response_pred))

0.9836162477956316


In [25]:
print(log1.class_weight)

None


## Weights of Parameters

In [26]:
odds_ratios = np.exp(log_regression.coef_[0])
feature_odds_ratios = pd.DataFrame({
    'Feature': predictor_train.columns,
    'Odds Ratio': odds_ratios
})
print(feature_odds_ratios)


                                Feature  Odds Ratio
0  RevolvingUtilizationOfUnsecuredLines    0.999899
1                                   age    0.971577
2                             DebtRatio    0.999953
3                         MonthlyIncome    0.999957
4       NumberOfOpenCreditLinesAndLoans    0.996730
5          NumberRealEstateLoansOrLines    1.062642
6                    NumberOfDependents    1.126925
7                      TotalPastDueDays    1.011752


In [27]:
odds_ratios = np.exp(log1.coef_[0])
feature_odds_ratios = pd.DataFrame({
    'Feature': predictor_train.columns,
    'Odds Ratio': odds_ratios
})
print(feature_odds_ratios)



                                Feature  Odds Ratio
0  RevolvingUtilizationOfUnsecuredLines    0.999974
1                                   age    0.970819
2                             DebtRatio    0.999962
3                         MonthlyIncome    0.999961
4       NumberOfOpenCreditLinesAndLoans    0.997905
5          NumberRealEstateLoansOrLines    1.065553
6                    NumberOfDependents    1.123342
7                      TotalPastDueDays    1.011337


## Linear Regression


In [28]:

predictor_var=sm.add_constant(predictor_var)
model=sm.OLS(response_var,predictor_var).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       SeriousDlqin2yrs   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     508.7
Date:                Wed, 30 Oct 2024   Prob (F-statistic):               0.00
Time:                        07:17:19   Log-Likelihood:                -2727.3
No. Observations:              149955   AIC:                             5473.
Df Residuals:                  149946   BIC:                             5562.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


## Neural Network

In [31]:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [35]:
# Build the neural network model
model = Sequential()
model.add(Dense(64, input_shape=(predictor_train.shape[1],), activation='relu'))  # Input layer
model.add(Dense(32, activation='relu'))  # Hidden layer
model.add(Dense(16, activation='relu'))  # Hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification


history = model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


test_loss, test_accuracy = model.evaluate(predictor_test, response_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


# Plot accuracy
plt.plot(history['accuracy'], label='Train Accuracy')
plt.plot(history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot loss
plt.plot(history['loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


AttributeError: 'NoneType' object has no attribute 'evaluate'