In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import DictVectorizer

In [126]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv

--2022-10-05 15:49:16--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73250 (72K) [text/plain]
Saving to: 'AER_credit_card_data.csv.3'

     0K .......... .......... .......... .......... .......... 69% 1.23M 0s
    50K .......... .......... .                               100%  671K=0.07s

2022-10-05 15:49:17 (998 KB/s) - 'AER_credit_card_data.csv.3' saved [73250/73250]



In [127]:
credit_card = pd.read_csv('credit_card.csv', verbose=True)

Tokenization took: 2.02 ms
Type conversion took: 1.98 ms
Parser memory cleanup took: 0.00 ms


In [128]:
credit_card.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [129]:
credit_card.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


In [130]:
credit_card.card.value_counts()

yes    1023
no      296
Name: card, dtype: int64

In [131]:
for index in range(len(credit_card['card'])):
    if credit_card['card'][index] == 'yes':
        credit_card['card'][index] = 1
    else:
        credit_card['card'][index] = 0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_card['card'][index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_card['card'][index] = 0


In [132]:
credit_card.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [133]:
dummies = pd.get_dummies(credit_card)
dummies.drop(['card_0', 'owner_no', 'selfemp_no'], axis=1, inplace=True)

dummies.rename(columns={'card_1': 'card',
                        'owner_yes': 'owner',
                        'selfemp_yes': 'selfemp'},
               inplace=True)

dummies.head()


  dummies = pd.get_dummies(credit_card)


Unnamed: 0,reports,age,income,share,expenditure,dependents,months,majorcards,active,card,owner,selfemp
0,0,37.66667,4.52,0.03327,124.9833,3,54,1,12,1,1,0
1,0,33.25,2.42,0.005217,9.854167,3,34,1,13,1,0,0
2,0,33.66667,4.5,0.004156,15.0,4,58,1,5,1,1,0
3,0,30.5,2.54,0.065214,137.8692,0,25,1,7,1,0,0
4,0,32.16667,9.7867,0.067051,546.5033,2,64,1,5,1,1,0


In [134]:
X = dummies.drop('card', axis=1)
y = dummies['card']


<h3> Splitting the dataset into train, validation and test sets </h3>

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                     random_state=22)


In [136]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  random_state=22)

<h3> Evaluating feature importance of numerical variables </h3>

In [137]:
# roc_auc_score(X_train['reports'], y_train)

<h3> Training a Logistic Regression model on the dataset </h3>

In [143]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

auc_score = roc_auc_score(y_val, y_pred)
auc_score

0.9935064935064934

In [152]:
thresholds = np.linspace(0, 1, 100)

for threshold in thresholds:
    

array([0.        , 0.01010101, 0.02020202, 0.03030303, 0.04040404,
       0.05050505, 0.06060606, 0.07070707, 0.08080808, 0.09090909,
       0.1010101 , 0.11111111, 0.12121212, 0.13131313, 0.14141414,
       0.15151515, 0.16161616, 0.17171717, 0.18181818, 0.19191919,
       0.2020202 , 0.21212121, 0.22222222, 0.23232323, 0.24242424,
       0.25252525, 0.26262626, 0.27272727, 0.28282828, 0.29292929,
       0.3030303 , 0.31313131, 0.32323232, 0.33333333, 0.34343434,
       0.35353535, 0.36363636, 0.37373737, 0.38383838, 0.39393939,
       0.4040404 , 0.41414141, 0.42424242, 0.43434343, 0.44444444,
       0.45454545, 0.46464646, 0.47474747, 0.48484848, 0.49494949,
       0.50505051, 0.51515152, 0.52525253, 0.53535354, 0.54545455,
       0.55555556, 0.56565657, 0.57575758, 0.58585859, 0.5959596 ,
       0.60606061, 0.61616162, 0.62626263, 0.63636364, 0.64646465,
       0.65656566, 0.66666667, 0.67676768, 0.68686869, 0.6969697 ,
       0.70707071, 0.71717172, 0.72727273, 0.73737374, 0.74747