In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
credit_card_data = pd.read_csv('creditcard.csv')
credit_card_data.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,62176,50164,-0.2732,0.2498,1.065,0.507,0.004597,0.7617,0.569,-0.004826,...,-0.3298,-0.3242,0.0324,-0.771,-0.1211,-0.203,0.1941,0.1542,99.0,0
1,62177,50164,1.316,-1.491,0.995,-1.315,-1.913,0.10046,-1.547,0.11304,...,0.02327,0.4487,-0.1407,0.0427,0.353,-0.05136,0.06213,0.03003,79.0,0
2,62178,50165,-1.153,0.3528,1.322,0.868,1.217,2.225,1.007,0.062,...,-0.2126,0.7007,-0.03088,-1.334,-0.2646,-0.1803,0.3137,-0.3875,100.2,0
3,62179,50165,-1.706,-0.03983,1.243,1.315,0.11176,-0.3582,0.0519,0.603,...,0.306,0.1451,-0.07544,-0.01602,0.4292,-0.1533,-0.0702,-0.1378,120.0,0
4,62180,50165,1.137,0.1345,0.1968,0.956,-0.1803,-0.4492,0.11017,-0.01642,...,-0.10547,-0.4592,-0.05417,-0.03497,0.4802,-0.525,-0.000886,0.01709,46.3,0


In [3]:
credit_card_data.info(), credit_card_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222795 entries, 0 to 222794
Data columns (total 32 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  222795 non-null  int64  
 1   Time        222795 non-null  int64  
 2   V1          222795 non-null  float64
 3   V2          222795 non-null  float64
 4   V3          222795 non-null  float64
 5   V4          222795 non-null  float64
 6   V5          222795 non-null  float64
 7   V6          222795 non-null  float64
 8   V7          222795 non-null  float64
 9   V8          222795 non-null  float64
 10  V9          222795 non-null  float64
 11  V10         222795 non-null  float64
 12  V11         222795 non-null  float64
 13  V12         222795 non-null  float64
 14  V13         222795 non-null  float64
 15  V14         222795 non-null  float64
 16  V15         222795 non-null  float64
 17  V16         222795 non-null  float64
 18  V17         222795 non-null  float64
 19  V1

(None,
 Unnamed: 0    0
 Time          0
 V1            0
 V2            0
 V3            0
 V4            0
 V5            0
 V6            0
 V7            0
 V8            0
 V9            0
 V10           0
 V11           0
 V12           0
 V13           0
 V14           0
 V15           0
 V16           0
 V17           0
 V18           0
 V19           0
 V20           0
 V21           0
 V22           0
 V23           0
 V24           0
 V25           0
 V26           0
 V27           0
 V28           0
 Amount        0
 Class         0
 dtype: int64)

In [4]:
credit_card_data['Class'].value_counts()

0    222303
1       492
Name: Class, dtype: int64

In [5]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [6]:
legit.Amount.describe(), fraud.Amount.describe()

(count    222303.000000
 mean         86.231953
 std         244.451225
 min           0.000000
 25%           5.000000
 50%          20.500000
 75%          74.940000
 max       25700.000000
 Name: Amount, dtype: float64,
 count     492.000000
 mean      122.213171
 std       256.695232
 min         0.000000
 25%         1.000000
 50%         9.250000
 75%       105.900000
 max      2126.000000
 Name: Amount, dtype: float64)

In [7]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,173502.241324,112222.03197,0.071889,-0.001645,-0.183186,-0.053691,0.076853,-0.027139,0.038856,-0.014987,...,-0.01402,0.006857,0.029855,0.011036,-0.001498,-0.03797,-0.005691,-0.000544,-0.00135,86.231953
1,119295.058943,80746.806911,-4.771921,3.623865,-7.033345,4.542124,-3.151192,-1.39774,-5.568929,0.570853,...,0.372342,0.713509,0.013992,-0.040305,-0.105133,0.041453,0.051648,0.170607,0.075664,122.213171


In [8]:
legit_sample = legit.sample(n=492)
new_dataset = pd.concat([legit_sample, fraud], axis=0)
new_dataset.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
104380,166750,118287,2.06,0.2217,-1.6455,0.423,0.4875,-0.827,0.2391,-0.272,...,-0.3413,-0.824,0.3464,0.6055,-0.253,0.1677,-0.0531,-0.02794,1.98,0
73016,135290,81187,-0.38,1.115,1.274,0.007614,0.245,-0.505,0.657,0.0454,...,-0.233,-0.546,0.007553,0.0325,-0.2222,0.1004,0.265,0.09546,2.28,0
34025,96256,65688,-4.973,-7.008,1.807,-0.076,5.49,-4.41,-4.71,0.8535,...,1.086,0.9375,1.138,-0.01588,-0.0683,-0.2194,-0.10834,0.2637,33.1,0
152940,215353,140033,0.267,0.1225,-0.2189,-1.962,0.3362,-0.769,0.425,-0.2878,...,0.509,1.376,-0.2236,0.6694,0.01712,-0.04108,-0.1284,-0.003227,15.0,0
75187,137462,82169,-1.519,0.03772,1.524,-1.02,-0.1926,0.3613,-0.2688,0.809,...,-0.1127,-0.3965,-0.01866,-0.7134,-0.00815,0.8765,0.1421,0.00839,68.8,0


In [9]:
new_dataset['Class'].value_counts() 

0    492
1    492
Name: Class, dtype: int64

In [10]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,171689.642276,111129.849593,0.099941,-0.067242,-0.212546,-0.013339,-0.004227,-0.171536,0.131929,-0.05664,...,-0.057924,0.021855,-0.013205,0.077119,0.007,-0.046005,0.005418,0.044907,-0.022164,96.197581
1,119295.058943,80746.806911,-4.771921,3.623865,-7.033345,4.542124,-3.151192,-1.39774,-5.568929,0.570853,...,0.372342,0.713509,0.013992,-0.040305,-0.105133,0.041453,0.051648,0.170607,0.075664,122.213171


In [11]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [12]:
model = LogisticRegression()
model.fit(X_train, Y_train)
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9542566709021602


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9390862944162437
