## Import Libraries

In [1]:
import numpy as np
from numpy import count_nonzero, median, mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Plotly
# import plotly.express as px
# import plotly.offline as py
# import plotly.graph_objs as go

import sweetviz

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
# import researchpy as rp

import datetime
from datetime import datetime, timedelta

# import eli5
# from IPython.display import display

#import os
#import zipfile
import scipy.stats
from collections import Counter

import sklearn
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
# from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso, Ridge
# from sklearn.model_selection import cross_val_score, train_test_split
# from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score
# from sklearn.metrics import plot_confusion_matrix, plot_roc_curve

# from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge
# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor, plot_tree
# from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
# from sklearn.naive_bayes import GaussianNB, MultinomialNB

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
sns.set_style('dark')
sns.set(font_scale=1.2)

plt.rc('axes', titlesize=9)
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings('ignore')

# Use Feature-Engine library
#import feature_engine
#from feature_engine import imputation as mdi
#from feature_engine.outlier_removers import Winsorizer
#from feature_engine import categorical_encoders as ce
#from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser
#from feature_engine.discretisation import ArbitraryDiscretiser, DecisionTreeDiscretiser
#from feature_engine.encoding import OrdinalEncoder

pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.2f}'.format)

random.seed(0)
np.random.seed(0)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


## Exploratory Data Analysis

In [2]:
df = pd.read_csv("DATA_3.01_CREDIT.csv")

In [3]:
df.head()

Unnamed: 0,Income,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.89,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.03,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.59,514,4,71,11,Male,No,No,Asian,580
3,148.92,681,3,36,11,Female,No,No,Asian,964
4,55.88,357,2,68,16,Male,No,Yes,Caucasian,331


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     300 non-null    float64
 1   Rating     300 non-null    int64  
 2   Cards      300 non-null    int64  
 3   Age        300 non-null    int64  
 4   Education  300 non-null    int64  
 5   Gender     300 non-null    object 
 6   Student    300 non-null    object 
 7   Married    300 non-null    object 
 8   Ethnicity  300 non-null    object 
 9   Balance    300 non-null    int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 23.6+ KB


In [5]:
df.describe()

Unnamed: 0,Income,Rating,Cards,Age,Education,Balance
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,44.05,348.12,3.03,54.98,13.39,502.69
std,33.86,150.87,1.35,17.22,3.08,466.99
min,10.35,93.0,1.0,24.0,5.0,0.0
25%,21.03,235.0,2.0,41.0,11.0,15.75
50%,33.12,339.0,3.0,55.0,14.0,433.5
75%,55.98,433.0,4.0,69.0,16.0,857.75
max,186.63,949.0,8.0,91.0,20.0,1809.0


### Correlation

In [6]:
df.corr()

Unnamed: 0,Income,Rating,Cards,Age,Education,Balance
Income,1.0,0.77,0.03,0.12,-0.07,0.43
Rating,0.77,1.0,0.1,0.04,-0.1,0.86
Cards,0.03,0.1,1.0,0.05,0.02,0.12
Age,0.12,0.04,0.05,1.0,-0.05,-0.05
Education,-0.07,-0.1,0.02,-0.05,1.0,-0.07
Balance,0.43,0.86,0.12,-0.05,-0.07,1.0


In [7]:
df.corr()["Balance"].sort_values()

Education   -0.07
Age         -0.05
Cards        0.12
Income       0.43
Rating       0.86
Balance      1.00
Name: Balance, dtype: float64

In [8]:
df.Gender.value_counts()

Female    168
 Male     132
Name: Gender, dtype: int64

In [9]:
df.Student.value_counts()

No     268
Yes     32
Name: Student, dtype: int64

In [10]:
df.Married.value_counts()

Yes    183
No     117
Name: Married, dtype: int64

In [11]:
df.Ethnicity.value_counts()

Caucasian           141
Asian                81
African American     78
Name: Ethnicity, dtype: int64

### Treat Missing Values

In [12]:
df.isnull().sum()

Income       0
Rating       0
Cards        0
Age          0
Education    0
Gender       0
Student      0
Married      0
Ethnicity    0
Balance      0
dtype: int64

In [13]:
dfGender = pd.get_dummies(df.Gender, prefix="Gender", drop_first=True)

In [14]:
dfGender

Unnamed: 0,Gender_Female
0,0
1,1
2,0
3,1
4,0
...,...
295,1
296,1
297,0
298,1


In [15]:
dfStudent = pd.get_dummies(df.Student, prefix="Student", drop_first=True)

In [16]:
dfStudent

Unnamed: 0,Student_Yes
0,0
1,1
2,0
3,0
4,0
...,...
295,0
296,0
297,0
298,0


In [17]:
dfMarried = pd.get_dummies(df.Married, prefix="Married", drop_first=True)

In [18]:
dfMarried

Unnamed: 0,Married_Yes
0,1
1,1
2,0
3,0
4,1
...,...
295,1
296,1
297,1
298,0


In [19]:
dfEthnicity = pd.get_dummies(df.Ethnicity, prefix="Ethnicity", drop_first=False)

In [20]:
dfEthnicity

Unnamed: 0,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,0,0,1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1
...,...,...,...
295,0,0,1
296,0,0,1
297,0,1,0
298,1,0,0


### Drop unwanted features

In [21]:
df.columns

Index(['Income', 'Rating', 'Cards', 'Age', 'Education', 'Gender', 'Student', 'Married', 'Ethnicity', 'Balance'], dtype='object')

In [22]:
df.drop(['Gender', 'Student', 'Married', 'Ethnicity'], axis=1, inplace=True)

In [23]:
df.head(1)

Unnamed: 0,Income,Rating,Cards,Age,Education,Balance
0,14.89,283,2,34,11,333


In [24]:
df2 = pd.concat([dfGender,dfStudent,dfMarried, dfEthnicity, df], axis=1)

In [25]:
df2

Unnamed: 0,Gender_Female,Student_Yes,Married_Yes,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian,Income,Rating,Cards,Age,Education,Balance
0,0,0,1,0,0,1,14.89,283,2,34,11,333
1,1,1,1,0,1,0,106.03,483,3,82,15,903
2,0,0,0,0,1,0,104.59,514,4,71,11,580
3,1,0,0,0,1,0,148.92,681,3,36,11,964
4,0,0,1,0,0,1,55.88,357,2,68,16,331
...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,0,1,0,0,1,27.27,149,5,67,10,0
296,1,0,1,0,0,1,65.90,370,1,49,17,293
297,0,0,1,0,1,0,55.05,321,3,74,17,188
298,1,0,0,1,0,0,20.79,204,1,70,18,0


## Regression Analysis

### Linear Regression (StatsModel)

In [26]:
df2.columns

Index(['Gender_Female', 'Student_Yes', 'Married_Yes', 'Ethnicity_African American', 'Ethnicity_Asian', 'Ethnicity_Caucasian', 'Income', 'Rating', 'Cards', 'Age', 'Education', 'Balance'], dtype='object')

In [27]:
y = df2[['Rating']]
X = df2[['Gender_Female', 'Student_Yes', 'Married_Yes', 'Ethnicity_African American', 'Ethnicity_Asian', 'Ethnicity_Caucasian', 'Income', 'Balance', 'Cards', 'Age', 'Education']]

In [28]:
X = sm.add_constant(X)

In [29]:
model = sm.OLS(y,X).fit()

In [30]:
model.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.974
Model:,OLS,Adj. R-squared:,0.973
Method:,Least Squares,F-statistic:,1067.0
Date:,"Thu, 08 Dec 2022",Prob (F-statistic):,1.17e-221
Time:,18:36:37,Log-Likelihood:,-1384.7
No. Observations:,300,AIC:,2791.0
Df Residuals:,289,BIC:,2832.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,104.2413,6.945,15.009,0.000,90.571,117.911
Gender_Female,1.7704,2.918,0.607,0.544,-3.973,7.513
Student_Yes,-98.8048,4.960,-19.921,0.000,-108.567,-89.043
Married_Yes,3.1769,3.006,1.057,0.291,-2.739,9.092
Ethnicity_African American,36.6401,3.319,11.041,0.000,30.108,43.172
Ethnicity_Asian,32.2118,3.235,9.958,0.000,25.845,38.579
Ethnicity_Caucasian,35.3895,2.865,12.351,0.000,29.750,41.029
Income,2.0947,0.048,43.533,0.000,2.000,2.189
Balance,0.2314,0.004,63.189,0.000,0.224,0.239

0,1,2,3
Omnibus:,42.495,Durbin-Watson:,1.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56.375
Skew:,-0.987,Prob(JB):,5.73e-13
Kurtosis:,3.784,Cond. No.,5.61e+18


-------

In [31]:
df3 = pd.read_csv("DATA_3.02_HR2.csv")

In [32]:
df3.head()

Unnamed: 0,S,LPE,NP,ANH,TIC,Newborn,left
0,0.38,0.53,2,157,3,0,1
1,0.8,0.86,5,262,6,0,1
2,0.11,0.88,7,272,4,0,1
3,0.72,0.87,5,223,5,0,1
4,0.37,0.52,2,159,3,0,1


In [33]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   S        12000 non-null  float64
 1   LPE      12000 non-null  float64
 2   NP       12000 non-null  int64  
 3   ANH      12000 non-null  int64  
 4   TIC      12000 non-null  int64  
 5   Newborn  12000 non-null  int64  
 6   left     12000 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 656.4 KB


In [34]:
df3.describe()

Unnamed: 0,S,LPE,NP,ANH,TIC,Newborn,left
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,0.63,0.72,3.8,200.44,3.23,0.15,0.17
std,0.24,0.17,1.16,48.74,1.06,0.36,0.37
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0
25%,0.48,0.57,3.0,157.0,2.0,0.0,0.0
50%,0.66,0.72,4.0,199.5,3.0,0.0,0.0
75%,0.82,0.86,5.0,243.0,4.0,0.0,0.0
max,1.0,1.0,7.0,310.0,6.0,1.0,1.0


In [35]:
df3.columns

Index(['S', 'LPE', 'NP', 'ANH', 'TIC', 'Newborn', 'left'], dtype='object')

In [36]:
y = df3[['left']]
X = df3[['S', 'LPE', 'NP', 'ANH', 'TIC']]

## Logistic Regression (StatsModel)

In [37]:
X = sm.add_constant(X)

In [38]:
model = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.364214
         Iterations 7


In [39]:
model.summary()

0,1,2,3
Dep. Variable:,left,No. Observations:,12000.0
Model:,Logit,Df Residuals:,11994.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 08 Dec 2022",Pseudo R-squ.:,0.1916
Time:,18:36:37,Log-Likelihood:,-4370.6
converged:,True,LL-Null:,-5406.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.3623,0.160,-8.493,0.000,-1.677,-1.048
S,-3.7908,0.119,-31.751,0.000,-4.025,-3.557
LPE,0.4880,0.178,2.745,0.006,0.140,0.837
NP,-0.3517,0.026,-13.488,0.000,-0.403,-0.301
ANH,0.0039,0.001,6.385,0.000,0.003,0.005
TIC,0.5955,0.027,22.424,0.000,0.543,0.648


In [40]:
----------

SyntaxError: invalid syntax (1776373828.py, line 1)

### Logistic Regression (SKLearn)

In [None]:
df.shape

In [None]:
X = df.iloc[:,:4]
y = df.iloc[:,4]

In [None]:
Counter(y)

In [None]:
X.values, y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=0, stratify=y)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
Counter(y_train), Counter(y_test)

In [None]:
lr = LogisticRegression(random_state=0)

In [None]:
lr.fit(X_train,y_train)

In [None]:
lr.coef_

In [None]:
lr.intercept_

In [None]:
y_pred = lr.predict(X_test)

In [None]:
y_pred

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
cm = confusion_matrix(y_test,y_pred)
cm

In [None]:
plot_confusion_matrix(estimator=lr, X=X_test, y_true=y_test, cmap='YlGnBu')
plt.show()

In [None]:
plot_roc_curve(estimator=lr, X=X_test, y=y_test)
plt.show()

#### Python code done by Dennis Lam