In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [4]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
20409,31025,-0.724757,0.649594,1.72701,-0.539584,0.889184,-1.034725,1.456742,-0.716376,-0.216859,...,-0.069338,0.250276,-0.281149,0.476644,0.384563,0.324725,-0.442016,-0.363683,0.0,0.0
20410,31026,-2.317953,-0.691477,1.265161,-0.376933,2.573624,-1.27686,-0.633667,0.205119,-0.307739,...,-0.398602,-1.058315,-0.618345,-0.861632,0.328694,0.092234,0.367453,0.006334,0.89,0.0
20411,31026,-1.32728,1.22465,0.796178,1.153083,-0.236519,0.924243,0.310683,0.751678,-0.420871,...,0.042767,0.42956,-0.098114,-0.278733,-0.14958,-0.214172,0.347256,0.151598,82.57,0.0
20412,31027,0.132756,-0.431359,-0.90916,-3.105432,1.878086,2.976224,-0.320295,0.627178,-2.848177,...,-0.068812,0.021002,-0.077043,0.995254,-0.255149,-0.219462,0.167155,0.16619,25.0,0.0
20413,31027,1.257126,0.34487,0.303501,0.694626,-0.385154,-1.07723,0.075644,-0.194959,0.055911,...,-0.289668,-0.842265,0.0,,,,,,,


In [6]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20414 entries, 0 to 20413
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    20414 non-null  int64  
 1   V1      20414 non-null  float64
 2   V2      20414 non-null  float64
 3   V3      20414 non-null  float64
 4   V4      20414 non-null  float64
 5   V5      20414 non-null  float64
 6   V6      20414 non-null  float64
 7   V7      20414 non-null  float64
 8   V8      20414 non-null  float64
 9   V9      20414 non-null  float64
 10  V10     20414 non-null  float64
 11  V11     20414 non-null  float64
 12  V12     20414 non-null  float64
 13  V13     20414 non-null  float64
 14  V14     20414 non-null  float64
 15  V15     20414 non-null  float64
 16  V16     20414 non-null  float64
 17  V17     20414 non-null  float64
 18  V18     20414 non-null  float64
 19  V19     20414 non-null  float64
 20  V20     20414 non-null  float64
 21  V21     20414 non-null  float64
 22

In [7]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [8]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    20327
1.0       86
Name: Class, dtype: int64

In [9]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [10]:
print(legit.shape)
print(fraud.shape)

(20327, 31)
(86, 31)


In [11]:
# statistical measures of the data
legit.Amount.describe()

count    20327.000000
mean        70.599121
std        205.057935
min          0.000000
25%          5.985000
50%         16.520000
75%         60.000000
max       7879.420000
Name: Amount, dtype: float64

In [12]:
fraud.Amount.describe()

count      86.000000
mean       93.996860
std       260.195133
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [13]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,15872.041226,-0.208046,0.208776,0.80216,0.248245,-0.139011,0.103222,-0.106625,0.001945,0.631723,...,0.035034,-0.047973,-0.142292,-0.037083,0.011362,0.12311,0.030792,0.010198,0.007013,70.599121
1.0,17592.162791,-8.792044,6.504115,-12.461366,6.302522,-6.112129,-2.534417,-8.479521,4.438385,-3.005413,...,0.704824,0.536947,-0.38682,-0.379114,-0.264854,0.349233,0.177437,0.867258,0.09651,93.99686


In [14]:
legit_sample = legit.sample(n=492)

In [15]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [16]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
14116,25114,1.090593,-0.054415,1.214231,1.844288,-0.427767,0.866735,-0.592573,0.200233,2.494517,...,-0.579125,-0.941481,0.038564,-0.327459,0.461936,-0.528445,0.065178,0.021236,19.89,0.0
16663,28026,-0.278946,0.865124,-0.301184,-1.117095,2.541115,3.271293,0.118388,0.934556,-0.504031,...,-0.320307,-1.005699,-0.054508,0.935742,-0.0074,0.088096,0.247904,0.090867,1.29,0.0
7712,10725,-1.224805,-0.021411,2.780357,-0.795733,0.264887,0.691653,-0.157924,0.330413,1.575519,...,0.028755,0.517996,-0.141958,-0.272086,0.071844,1.026869,0.035754,-0.11817,43.8,0.0
3716,3195,0.892562,-0.564435,-0.805989,-0.042736,0.102817,-0.499643,0.656274,-0.283278,-0.451872,...,0.093101,-0.24185,-0.415352,-0.4144,0.568084,1.079856,-0.159159,0.013578,239.4,0.0
6540,7922,0.992578,-1.15679,1.708749,-0.814558,-1.521863,1.305622,-1.694781,0.622689,4.18185,...,0.044327,0.949345,-0.13688,-0.224141,0.336056,0.258748,0.094562,0.010383,38.5,0.0


In [17]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
18466,29526,1.102804,2.829168,-3.93287,4.707691,2.937967,-1.800904,1.672734,-0.30024,-2.783011,...,-0.106994,-0.25005,-0.521627,-0.44895,1.291646,0.516327,0.009146,0.153318,0.68,1.0
18472,29531,-1.060676,2.608579,-2.971679,4.360089,3.738853,-2.728395,1.987616,-0.357345,-2.757535,...,-0.063168,-0.207385,-0.183261,-0.103679,0.896178,0.407387,-0.130918,0.192177,0.68,1.0
18773,29753,0.269614,3.549755,-5.810353,5.80937,1.538808,-2.269219,-0.824203,0.35107,-3.759059,...,0.371121,-0.32229,-0.549856,-0.520629,1.37821,0.564714,0.553255,0.4024,0.68,1.0
18809,29785,0.923764,0.344048,-2.880004,1.72168,-3.019565,-0.639736,-3.801325,1.299096,0.864065,...,0.899931,1.481271,0.725266,0.17696,-1.815638,-0.536517,0.489035,-0.049729,30.3,1.0
20198,30852,-2.830984,0.885657,1.19993,2.861292,0.321669,0.289966,1.76776,-2.45105,0.069736,...,0.546589,0.334971,0.172106,0.62359,-0.527114,-0.079215,-2.532445,0.311177,104.81,1.0


In [18]:
new_dataset['Class'].value_counts()

0.0    492
1.0     86
Name: Class, dtype: int64

In [19]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,16153.552846,-0.271187,0.095725,0.795921,0.214868,-0.122856,-0.025427,-0.178292,-0.093479,0.650934,...,0.047095,-0.119422,-0.13659,-0.025927,-0.009647,0.135177,0.055712,0.008397,-0.002526,70.217703
1.0,17592.162791,-8.792044,6.504115,-12.461366,6.302522,-6.112129,-2.534417,-8.479521,4.438385,-3.005413,...,0.704824,0.536947,-0.38682,-0.379114,-0.264854,0.349233,0.177437,0.867258,0.09651,93.99686


Splitting the data into Features & Targets



In [20]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [21]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
14116  25114  1.090593 -0.054415  1.214231  1.844288 -0.427767  0.866735   
16663  28026 -0.278946  0.865124 -0.301184 -1.117095  2.541115  3.271293   
7712   10725 -1.224805 -0.021411  2.780357 -0.795733  0.264887  0.691653   
3716    3195  0.892562 -0.564435 -0.805989 -0.042736  0.102817 -0.499643   
6540    7922  0.992578 -1.156790  1.708749 -0.814558 -1.521863  1.305622   
...      ...       ...       ...       ...       ...       ...       ...   
18466  29526  1.102804  2.829168 -3.932870  4.707691  2.937967 -1.800904   
18472  29531 -1.060676  2.608579 -2.971679  4.360089  3.738853 -2.728395   
18773  29753  0.269614  3.549755 -5.810353  5.809370  1.538808 -2.269219   
18809  29785  0.923764  0.344048 -2.880004  1.721680 -3.019565 -0.639736   
20198  30852 -2.830984  0.885657  1.199930  2.861292  0.321669  0.289966   

             V7        V8        V9  ...       V20       V21       V22  \
14116 -0.5925

In [22]:
print(Y)

14116    0.0
16663    0.0
7712     0.0
3716     0.0
6540     0.0
        ... 
18466    1.0
18472    1.0
18773    1.0
18809    1.0
20198    1.0
Name: Class, Length: 578, dtype: float64


Split the data into Training data & Testing Data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(578, 30) (462, 30) (116, 30)


Model Training
logistic Regression

In [25]:
model = LogisticRegression()

In [26]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation
Accuracy Score

In [27]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [28]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.987012987012987


In [29]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [30]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9482758620689655
