In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# loading dataset

credit_card_data = pd.read_csv('/content/creditcard.csv')

In [10]:
#first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [11]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
17913,29027,-0.422159,0.231118,1.666711,0.451976,-0.203598,0.097244,-0.039666,0.354218,0.062463,...,0.110909,0.435121,-0.056658,0.265867,-0.548204,0.734013,0.117023,0.130972,9.0,0.0
17914,29030,1.177387,-0.215585,0.202972,0.215323,-0.029312,0.601788,-0.297021,0.188082,0.43637,...,-0.055842,0.075903,-0.18712,-0.717798,0.555294,0.731531,-0.022112,-0.010929,25.0,0.0
17915,29030,-0.553746,0.880858,1.644821,-0.132657,0.12094,-0.267411,0.466892,0.222443,-0.639624,...,-0.133339,-0.348662,0.029947,0.199962,-0.328384,0.071511,0.275487,0.110195,0.89,0.0
17916,29030,-2.844632,3.71796,-7.165428,4.120419,-2.991039,-2.942326,-4.925187,2.204337,-2.663613,...,0.894495,-0.340246,0.012222,-0.059679,-0.104338,-0.295884,1.326228,0.322688,89.99,0.0
17917,29031,1.050204,0.078269,0.484733,1.349623,,,,,,...,,,,,,,,,,


In [12]:
#dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17918 entries, 0 to 17917
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    17918 non-null  int64  
 1   V1      17918 non-null  float64
 2   V2      17918 non-null  float64
 3   V3      17918 non-null  float64
 4   V4      17918 non-null  float64
 5   V5      17917 non-null  float64
 6   V6      17917 non-null  float64
 7   V7      17917 non-null  float64
 8   V8      17917 non-null  float64
 9   V9      17917 non-null  float64
 10  V10     17917 non-null  float64
 11  V11     17917 non-null  float64
 12  V12     17917 non-null  float64
 13  V13     17917 non-null  float64
 14  V14     17917 non-null  float64
 15  V15     17917 non-null  float64
 16  V16     17917 non-null  float64
 17  V17     17917 non-null  float64
 18  V18     17917 non-null  float64
 19  V19     17917 non-null  float64
 20  V20     17917 non-null  float64
 21  V21     17917 non-null  float64
 22

In [13]:
#checking number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [14]:
#distribution of legir transactions and fraud transactions
credit_card_data['Class'].value_counts()

0.0    17836
1.0       81
Name: Class, dtype: int64

This dataset is very unbalanced.
0 --> normal transaction
1 --> fraud transaction


In [15]:
#seperating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]

fraud = credit_card_data[credit_card_data.Class == 1]

In [16]:
print(legit.shape)
print(fraud.shape)

(17836, 31)
(81, 31)


In [17]:
#statistical measures of the data
legit.Amount.describe()

count    17836.000000
mean        67.365025
std        188.754429
min          0.000000
25%          5.490000
50%         15.950000
75%         56.232500
max       7712.430000
Name: Amount, dtype: float64

In [18]:
fraud.Amount.describe()

count      81.000000
mean       98.105926
std       267.464067
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [19]:
#compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,13891.13198,-0.203851,0.22856,0.840631,0.263581,-0.117213,0.112151,-0.110725,-0.008454,0.752065,...,0.032646,-0.05493,-0.144997,-0.036871,0.015838,0.118904,0.035813,0.010352,0.00667,67.365025
1.0,16833.074074,-9.315066,6.779465,-13.05287,6.451318,-6.557541,-2.602613,-9.012856,4.730365,-3.087651,...,0.756424,0.549752,-0.423495,-0.398104,-0.277836,0.355688,0.177616,0.940693,0.090007,98.105926


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and fraud transactions

Number of fraud transactions - 3

In [25]:
legit_sample = legit.sample(n=81)

concatenating 2 data frames

In [26]:
new_dataset = pd.concat([legit_sample, fraud], axis = 0)

In [27]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
17770,28901,-1.188663,0.364418,2.606518,0.195837,0.000395,1.862473,0.388495,0.681966,-0.106265,...,0.383043,1.152565,0.019238,-0.634809,0.208425,-0.218116,0.121059,0.074171,117.0,0.0
6349,7573,-0.331503,1.104454,2.071696,1.314556,-0.06448,-0.349651,0.689229,-0.295952,0.953198,...,-0.202686,-0.065186,-0.045489,0.303041,-0.324504,-0.447538,0.216746,-0.01372,35.0,0.0
12069,20930,1.227898,-0.370765,1.012093,-0.499147,-1.269944,-0.807357,-0.744718,-0.038101,3.314387,...,-0.172137,-0.090812,-0.016814,0.311199,0.426251,-0.721726,0.055828,0.027806,11.85,0.0
17706,28858,1.355129,-0.71952,1.369979,-0.626986,-1.721096,-0.512495,-1.286656,0.0064,-0.261268,...,0.441149,1.268833,-0.086972,0.45767,0.334936,-0.044747,0.061888,0.034299,9.99,0.0
11902,20502,0.898614,0.033896,-0.083297,1.256326,0.554091,0.702342,0.155668,0.19636,1.160759,...,0.031511,0.278743,-0.027337,-0.708803,0.393032,-0.231545,0.011633,0.005579,89.99,0.0


In [28]:
new_dataset['Class'].value_counts()

0.0    81
1.0    81
Name: Class, dtype: int64

In [29]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,13944.382716,-0.549779,0.147967,1.011621,0.096071,-0.17776,0.104444,-0.2541,0.123268,0.816034,...,-0.043151,-0.097993,-0.138516,-0.048497,-0.009063,0.096425,-0.023015,0.068699,-0.01619,66.053333
1.0,16833.074074,-9.315066,6.779465,-13.05287,6.451318,-6.557541,-2.602613,-9.012856,4.730365,-3.087651,...,0.756424,0.549752,-0.423495,-0.398104,-0.277836,0.355688,0.177616,0.940693,0.090007,98.105926


splitting the data into features and targets

In [30]:
x = new_dataset.drop(columns = 'Class', axis = 1)
y = new_dataset['Class']

In [31]:
print(x)

        Time         V1         V2         V3        V4         V5        V6  \
17770  28901  -1.188663   0.364418   2.606518  0.195837   0.000395  1.862473   
6349    7573  -0.331503   1.104454   2.071696  1.314556  -0.064480 -0.349651   
12069  20930   1.227898  -0.370765   1.012093 -0.499147  -1.269944 -0.807357   
17706  28858   1.355129  -0.719520   1.369979 -0.626986  -1.721096 -0.512495   
11902  20502   0.898614   0.033896  -0.083297  1.256326   0.554091  0.702342   
...      ...        ...        ...        ...       ...        ...       ...   
17317  28625 -27.848181  15.598193 -28.923756  6.418442 -20.346228 -4.828202   
17366  28658 -28.524268  15.876923 -29.468732  6.447591 -20.786000 -4.865613   
17407  28692 -29.200329  16.155701 -30.013712  6.476731 -21.225810 -4.902997   
17453  28726 -29.876366  16.434525 -30.558697  6.505862 -21.665654 -4.940356   
17480  28755 -30.552380  16.713389 -31.103685  6.534984 -22.105532 -4.977692   

              V7         V8        V9  

In [32]:
print(y)

17770    0.0
6349     0.0
12069    0.0
17706    0.0
11902    0.0
        ... 
17317    1.0
17366    1.0
17407    1.0
17453    1.0
17480    1.0
Name: Class, Length: 162, dtype: float64


split the data into training data and testing data

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2)

In [34]:
print(x.shape, x_train.shape, x_test.shape)

(162, 30) (129, 30) (33, 30)


Model Training

Logistic Regression

In [35]:
model = LogisticRegression()

In [None]:
#training the logistic regression model with training data

In [36]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation

In [37]:
#accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [38]:
print('Accuracy: ', training_data_accuracy)

Accuracy:  0.9922480620155039


In [39]:
#accuracy on test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [40]:
print('Accuracy: ', test_data_accuracy)

Accuracy:  1.0
