In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
credit_card_data = pd.read_csv('creditcard.csv')

In [3]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
239193,150000.0,-1.662279,-0.278422,2.677875,1.479724,-0.641821,0.42101,-1.162016,0.946243,0.705521,...,0.318863,0.74794,-0.328271,-0.004766,0.429288,-0.220109,0.057,-0.105868,37.9,0
239194,150000.0,-2.371115,-1.837946,1.914117,0.120125,2.569787,-1.288679,-1.257192,0.515161,0.154637,...,-0.153341,-1.189856,0.301846,0.679734,0.370517,-0.596966,-0.034076,0.115195,0.76,0
239195,150000.0,-0.048322,0.324127,-0.638039,-1.128052,1.701661,-0.517981,1.541548,-0.343846,-0.044401,...,0.17698,0.637618,-0.01944,0.282531,-0.679045,-0.588653,0.0327,0.029876,47.99,0
239196,150000.0,-0.405621,1.32232,-1.144053,0.687555,1.226498,-0.715639,1.159412,-0.069677,-0.81489,...,0.28236,1.038318,0.020711,-0.562886,-0.749906,-0.542224,-0.021286,0.036784,2.0,0
239197,150000.0,0.332569,0.786062,-1.099871,-1.216381,1.473755,-0.711839,1.367035,-0.209808,-0.547023,...,0.212748,0.578925,-0.336292,0.220293,0.246645,0.149659,-0.058769,-0.03738,0.76,0


In [5]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239198 entries, 0 to 239197
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    239198 non-null  float64
 1   V1      239198 non-null  float64
 2   V2      239198 non-null  float64
 3   V3      239198 non-null  float64
 4   V4      239198 non-null  float64
 5   V5      239198 non-null  float64
 6   V6      239198 non-null  float64
 7   V7      239198 non-null  float64
 8   V8      239198 non-null  float64
 9   V9      239198 non-null  float64
 10  V10     239198 non-null  float64
 11  V11     239198 non-null  float64
 12  V12     239198 non-null  float64
 13  V13     239198 non-null  float64
 14  V14     239198 non-null  float64
 15  V15     239198 non-null  float64
 16  V16     239198 non-null  float64
 17  V17     239198 non-null  float64
 18  V18     239198 non-null  float64
 19  V19     239198 non-null  float64
 20  V20     239198 non-null  float64
 21  V21     23

In [6]:
#checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
#distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Class
0    238763
1       435
Name: count, dtype: int64

This dataset is highly unbalanced

0 --> Normal Transaction

1 --> Fraudulent Transaction

In [8]:
#separating the data for analysis
legit = credit_card_data[credit_card_data.Class ==0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [9]:
print(legit.shape)
print(fraud.shape)

(238763, 31)
(435, 31)


In [10]:
#statistical measures of the data
legit.Amount.describe()

count    238763.000000
mean         91.061508
std         252.266117
min           0.000000
25%           6.010000
50%          23.450000
75%          79.990000
max       19656.530000
Name: Amount, dtype: float64

In [11]:
fraud.Amount.describe()

count     435.000000
mean      123.335264
std       255.966193
min         0.000000
25%         1.000000
50%         9.990000
75%       105.940000
max      2125.870000
Name: Amount, dtype: float64

In [12]:
#comparing the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,82318.52233,-0.044714,-0.025312,0.151066,0.027272,-0.042857,0.025967,-0.008567,0.000935,0.006668,...,0.006566,-0.005418,-0.016596,-0.008194,0.000766,0.029069,0.002189,-0.000762,0.001163,91.061508
1,70647.14023,-5.172668,3.892631,-7.346585,4.651379,-3.59386,-1.364613,-6.060351,0.640242,-2.676545,...,0.375403,0.765605,0.005915,-0.053739,-0.084397,0.048856,0.037491,0.203174,0.063617,123.335264


Under-Sampling

Build a sample dataset containing similar distribution of normal and fraudulent transactions

Number of Fraudulent Transactions --> 159

In [13]:
legit_sample = legit.sample(n=159)

Concatenating two dataframes

In [14]:
new_dataset = pd.concat([legit_sample, fraud],axis=0)

In [15]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
65540,51633.0,-1.245591,0.194563,2.281026,-0.74969,-0.94273,-0.094855,-0.555378,0.132377,-0.512419,...,0.297773,1.209706,-0.400564,0.484212,0.4051,0.138958,0.228357,0.239752,19.83,0
200604,133500.0,1.648702,-1.481076,-0.56853,-0.417106,-1.140768,-0.087137,-0.685723,0.114739,2.279173,...,-0.17192,-0.67911,0.231501,0.67553,-0.612134,0.359431,-0.064905,-0.010103,200.0,0
185752,126810.0,-0.227307,1.130996,-0.496399,-0.231291,0.886857,-1.288377,1.287154,-0.5814,0.262014,...,0.19632,1.121593,-0.239587,-0.046792,-0.287284,-0.174286,0.226001,0.054497,35.39,0
133500,80416.0,-0.66897,-0.056752,1.747537,-1.694088,-0.071519,-0.69529,0.308368,-0.134146,-1.067425,...,-0.454576,-0.714145,0.0197,0.061493,-0.334785,0.779431,0.095418,-0.062195,17.24,0
124241,77229.0,1.29773,-0.423365,-0.133205,-0.75705,-0.468609,-0.707011,-0.130941,-0.177189,-1.516212,...,0.305436,0.780558,-0.152098,0.283227,0.671592,-0.089814,-0.017255,-0.003857,40.0,0


In [16]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
237107,149096.0,1.184891,3.152084,-6.13478,5.531252,1.733867,-1.816861,-0.916696,0.265568,-3.158014,...,0.124236,-0.823865,-0.079887,0.028828,0.389711,0.060171,0.485187,0.326552,0.0,1
237426,149236.0,-1.370976,-0.025465,-2.774907,2.65053,4.511309,-3.289344,-0.118841,-0.014279,-0.932773,...,-0.048061,-0.59935,0.072193,-0.600351,0.371331,-0.370951,0.01197,0.145895,1.0,1
238222,149582.0,-4.280584,1.4211,-3.908229,2.942946,-0.076205,-2.002526,-2.874155,-0.856005,0.963674,...,-0.140062,-0.90772,-0.680108,-0.34917,0.056276,-1.149923,-1.809886,0.723051,1.1,1
238366,149640.0,0.754316,2.379822,-5.137274,3.818392,0.043203,-1.285451,-1.766684,0.756711,-1.765722,...,0.397058,0.141165,0.171985,0.394274,-0.444642,-0.263189,0.304703,-0.044362,2.0,1
238466,149676.0,1.833191,0.745333,-1.133009,3.893556,0.858164,0.910235,-0.4982,0.344703,-0.667939,...,0.039289,0.181652,0.072981,-0.155299,-0.149891,0.012792,0.040854,0.022903,17.39,1


In [17]:
new_dataset['Class'].value_counts()

Class
1    435
0    159
Name: count, dtype: int64

In [18]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,84334.773585,-0.011063,0.210726,0.084381,0.202162,-0.090018,0.008146,0.060924,-0.124785,0.057545,...,0.079913,-0.017572,-0.023115,0.00406,0.059812,0.073578,-0.017228,0.028817,0.021438,92.59434
1,70647.14023,-5.172668,3.892631,-7.346585,4.651379,-3.59386,-1.364613,-6.060351,0.640242,-2.676545,...,0.375403,0.765605,0.005915,-0.053739,-0.084397,0.048856,0.037491,0.203174,0.063617,123.335264


Splitting the data into Features and Targets

In [19]:
X = new_dataset.drop(columns='Class', axis=1)
Y= new_dataset['Class']

In [20]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
65540    51633.0 -1.245591  0.194563  2.281026 -0.749690 -0.942730 -0.094855   
200604  133500.0  1.648702 -1.481076 -0.568530 -0.417106 -1.140768 -0.087137   
185752  126810.0 -0.227307  1.130996 -0.496399 -0.231291  0.886857 -1.288377   
133500   80416.0 -0.668970 -0.056752  1.747537 -1.694088 -0.071519 -0.695290   
124241   77229.0  1.297730 -0.423365 -0.133205 -0.757050 -0.468609 -0.707011   
...          ...       ...       ...       ...       ...       ...       ...   
237107  149096.0  1.184891  3.152084 -6.134780  5.531252  1.733867 -1.816861   
237426  149236.0 -1.370976 -0.025465 -2.774907  2.650530  4.511309 -3.289344   
238222  149582.0 -4.280584  1.421100 -3.908229  2.942946 -0.076205 -2.002526   
238366  149640.0  0.754316  2.379822 -5.137274  3.818392  0.043203 -1.285451   
238466  149676.0  1.833191  0.745333 -1.133009  3.893556  0.858164  0.910235   

              V7        V8        V9  .

In [21]:
print(Y)

65540     0
200604    0
185752    0
133500    0
124241    0
         ..
237107    1
237426    1
238222    1
238366    1
238466    1
Name: Class, Length: 594, dtype: int64


Split the data into Training data & Testing data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(594, 30) (475, 30) (119, 30)


Model Training

Logistic Regression

In [33]:
model = LogisticRegression(solver='liblinear', max_iter=1000)

In [34]:
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [35]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [36]:
print('Accuracy on Training Data: ', training_data_accuracy)

Accuracy on Training Data:  0.9326315789473684


In [37]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [38]:
print('Accuracy on Test Data: ', test_data_accuracy)

Accuracy on Test Data:  0.9327731092436975
