In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset is later read from '/content/creditcard.csv', which is
# outside this mount point — confirm whether the mount is actually needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the credit-card transactions CSV into a pandas DataFrame.
# NOTE(review): the .tail()/.info() output recorded in this notebook shows the
# final row missing values from V21 onward — the file may be truncated; confirm.
credit_card_data = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')

In [None]:
# Peek at the first five rows to check that the columns parsed as expected.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
# Peek at the last five rows of the loaded frame.
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
67429,52531,-0.668025,0.539737,0.161731,-1.07138,1.529141,3.819435,-0.685869,1.486179,-0.293208,...,0.214492,0.282335,0.07072,1.022712,-0.505535,0.253658,0.017185,0.105271,49.97,0.0
67430,52532,-0.679169,1.237672,1.602476,-0.007797,-0.228123,-1.095514,0.73434,-0.106748,-0.298766,...,-0.233214,-0.517932,0.020122,0.702798,-0.170477,0.054315,0.380961,0.183103,7.13,0.0
67431,52532,-0.102663,0.890146,1.178701,-0.062896,0.289073,-0.377077,0.753827,0.035692,-0.936965,...,-0.292659,-0.897627,0.275655,0.19026,-0.804461,-0.240939,0.028038,0.029944,9.5,0.0
67432,52532,0.797464,-0.947293,1.045822,0.263474,-1.19498,0.348919,-0.594826,0.268371,0.870239,...,-0.039739,-0.211713,-0.023972,0.313647,-0.045046,0.924183,-0.049685,0.031583,170.82,0.0
67433,52534,1.21056,0.291028,0.273319,0.646091,-0.199608,-0.606096,0.000425,-0.056976,0.051479,...,,,,,,,,,,


In [None]:
# Dataset summary: column names, non-null counts and dtypes.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67434 entries, 0 to 67433
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    67434 non-null  int64  
 1   V1      67434 non-null  float64
 2   V2      67434 non-null  float64
 3   V3      67434 non-null  float64
 4   V4      67434 non-null  float64
 5   V5      67434 non-null  float64
 6   V6      67434 non-null  float64
 7   V7      67434 non-null  float64
 8   V8      67434 non-null  float64
 9   V9      67434 non-null  float64
 10  V10     67434 non-null  float64
 11  V11     67434 non-null  float64
 12  V12     67434 non-null  float64
 13  V13     67434 non-null  float64
 14  V14     67434 non-null  float64
 15  V15     67434 non-null  float64
 16  V16     67434 non-null  float64
 17  V17     67434 non-null  float64
 18  V18     67434 non-null  float64
 19  V19     67434 non-null  float64
 20  V20     67434 non-null  float64
 21  V21     67433 non-null  float64
 22

In [None]:
# Count the missing values per column (isna is the same check as isnull).
credit_card_data.isna().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [None]:
# Class balance: how many legit (0) vs fraudulent (1) transactions there are.
# Rows whose Class is missing are left out of the counts (dropna=True is the default).
credit_card_data['Class'].value_counts(dropna=True)

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,67264
1.0,169


In [None]:
# Split the transactions by label: Class 0 -> legit, Class 1 -> fraud.
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [None]:
# (rows, columns) of the legit subset, then of the fraud subset, one per line.
print(legit.shape, fraud.shape, sep='\n')

(67264, 31)
(169, 31)


In [None]:
# Summary statistics of the transaction amount for legit transactions.
legit['Amount'].describe()

Unnamed: 0,Amount
count,67264.0
mean,96.632489
std,270.008302
min,0.0
25%,7.68
50%,26.38
75%,88.36
max,19656.53


In [None]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

Unnamed: 0,Amount
count,169.0
mean,94.89568
std,220.959331
min,0.0
25%,1.0
50%,7.61
75%,99.99
max,1809.68


In [None]:
# Per-class mean of every column, to compare legit and fraudulent transactions.
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,33987.964275,-0.224397,-0.031225,0.70922,0.156146,-0.256028,0.107483,-0.096094,0.048974,0.038459,...,0.046365,-0.030766,-0.106221,-0.03828,0.00583,0.136332,0.020505,0.00137,0.003425,96.632489
1.0,29493.242604,-7.106029,5.115846,-9.566072,5.470948,-5.263861,-2.172211,-7.41255,3.501007,-3.298551,...,0.431494,0.85067,-0.212903,-0.255225,-0.075295,0.245601,0.107931,0.576249,0.041284,94.89568


undersampling

In [None]:
# Undersample the majority class: draw exactly as many legit rows as there are
# fraud rows so the two classes end up balanced. Taking the count from `fraud`
# (instead of a hard-coded 169) keeps the cell correct if the data changes, and
# the fixed seed makes the drawn sample — and every number computed from it
# further down — reproducible on Restart & Run All.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [None]:
# Stack the sampled legit rows on top of all fraud rows (row-wise concatenation).
new_dataset = pd.concat([legit_sample, fraud], axis='index')

In [None]:
# First five rows of the combined dataset (these come from the legit sample).
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
38777,39544,1.354292,-0.451228,0.339052,-0.778482,-0.809645,-0.503752,-0.470948,-0.131071,-1.259308,...,-0.438799,-0.799844,0.068284,0.055278,0.146153,0.901895,-0.053354,0.001952,17.24,0.0
254,179,-0.431092,0.890752,1.720543,-0.238246,0.273263,-0.970191,1.24991,-0.483676,-0.871157,...,-0.412644,-1.112005,-0.057684,0.430112,-0.06325,0.723811,-0.210697,-0.161058,27.99,0.0
9919,14654,0.631254,-0.520331,0.382565,1.832481,-0.655553,-0.698188,0.493155,-0.378315,1.475528,...,-0.073049,-0.382023,-0.307069,0.707786,0.594239,-0.450608,-0.051546,0.068394,299.0,0.0
8927,12259,0.997492,-0.54585,1.21194,0.405636,-1.110319,0.031424,-0.695091,0.069851,2.146552,...,-0.272491,-0.494717,-0.005991,0.373149,0.066766,0.874968,-0.078179,0.01459,98.92,0.0
23313,32692,-0.364387,1.05171,0.546476,-0.056895,-0.043923,-0.528666,0.34706,0.369608,-0.911213,...,0.184717,0.391868,-0.072187,0.041019,-0.208408,0.292262,-0.116589,-0.042876,15.86,0.0


In [None]:
# Last five rows of the combined dataset (these come from the fraud rows).
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
63421,50706,-8.461845,6.866198,-11.838269,4.194211,-6.923097,-3.221147,-7.553497,6.015618,-2.466143,...,0.918244,-0.715366,0.210747,-0.060211,0.509535,-0.257284,1.170027,0.229301,99.99,1.0
63634,50808,-9.16979,7.092197,-12.354037,4.243069,-7.176438,-3.386618,-8.058012,6.442909,-2.412987,...,0.926157,-0.817706,-0.150434,-0.039383,0.48564,-0.264325,1.15969,0.232758,99.99,1.0
64329,51112,-9.848776,7.365546,-12.898538,4.273323,-7.611991,-3.427045,-8.350808,6.863604,-2.387567,...,0.931958,-0.874467,-0.192639,-0.035426,0.538665,-0.263934,1.134095,0.225973,99.99,1.0
64411,51135,-10.527304,7.639745,-13.443115,4.303403,-8.04821,-3.466997,-8.643193,7.284105,-2.362097,...,0.937416,-0.931178,-0.235697,-0.031393,0.591558,-0.263516,1.108897,0.219021,99.99,1.0
64460,51155,-11.205461,7.914633,-13.987752,4.333341,-8.48497,-3.506561,-8.935243,7.704449,-2.336584,...,0.942593,-0.987848,-0.279446,-0.027299,0.644344,-0.263078,1.084023,0.211933,99.99,1.0


In [None]:
# Check the class balance of the combined dataset.
new_dataset.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,169
1.0,169


In [None]:
# Per-class column means of the combined dataset.
new_dataset.groupby(by='Class').mean()


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,33493.094675,-0.227603,-0.060796,0.739874,0.170185,-0.177502,0.26315,-0.155307,0.037307,0.123219,...,0.025592,0.050527,-0.09811,-0.012536,-0.116037,0.133633,0.012582,-0.033223,0.03041,86.647633
1.0,29493.242604,-7.106029,5.115846,-9.566072,5.470948,-5.263861,-2.172211,-7.41255,3.501007,-3.298551,...,0.431494,0.85067,-0.212903,-0.255225,-0.075295,0.245601,0.107931,0.576249,0.041284,94.89568


splitting the data into features and targets


In [None]:
# Features: every column except the label. Target: the 'Class' label column.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.loc[:, 'Class']


In [None]:
# Show the feature matrix (all columns of new_dataset except 'Class').
print(X)

        Time         V1        V2         V3        V4        V5        V6  \
38777  39544   1.354292 -0.451228   0.339052 -0.778482 -0.809645 -0.503752   
254      179  -0.431092  0.890752   1.720543 -0.238246  0.273263 -0.970191   
9919   14654   0.631254 -0.520331   0.382565  1.832481 -0.655553 -0.698188   
8927   12259   0.997492 -0.545850   1.211940  0.405636 -1.110319  0.031424   
23313  32692  -0.364387  1.051710   0.546476 -0.056895 -0.043923 -0.528666   
...      ...        ...       ...        ...       ...       ...       ...   
63421  50706  -8.461845  6.866198 -11.838269  4.194211 -6.923097 -3.221147   
63634  50808  -9.169790  7.092197 -12.354037  4.243069 -7.176438 -3.386618   
64329  51112  -9.848776  7.365546 -12.898538  4.273323 -7.611991 -3.427045   
64411  51135 -10.527304  7.639745 -13.443115  4.303403 -8.048210 -3.466997   
64460  51155 -11.205461  7.914633 -13.987752  4.333341 -8.484970 -3.506561   

             V7        V8        V9  ...       V20       V21   

In [None]:
# Show the target labels (the 'Class' column of new_dataset).
print(Y)

38777    0.0
254      0.0
9919     0.0
8927     0.0
23313    0.0
        ... 
63421    1.0
63634    1.0
64329    1.0
64411    1.0
64460    1.0
Name: Class, Length: 338, dtype: float64


splitting the data into training data and testing data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [None]:
# Shapes of the full feature matrix, the training split and the test split.
print(*(frame.shape for frame in (X, X_train, X_test)))

(338, 30) (270, 30) (68, 30)


model training :

Logistic Regression


In [None]:
# Logistic regression behind a StandardScaler.
# Fitting the bare LogisticRegression() on the raw features stopped at the
# solver's iteration limit (see the warning recorded under the fit cell), which
# means the reported accuracies came from a model that had not converged. The
# features sit on very different scales (Time and Amount vs. the V* columns),
# so they are standardised first; max_iter is raised as an extra safety margin.
# Because the scaler lives inside the pipeline, model.fit() learns the scaling
# from the training rows only and model.predict() re-applies it automatically.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))


In [None]:
# Fit the model on the training split only; the test split stays unseen.
model.fit(X=X_train, y=Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#accuracy on training data

In [None]:
# Accuracy on the training data.
# accuracy_score is documented as (y_true, y_pred). Accuracy happens to be
# symmetric, so the number is unchanged, but the documented order keeps this
# cell correct if the metric is later swapped for an asymmetric one
# (precision, recall, ...).
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training rows the fitted model classified correctly.
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9777777777777777


In [None]:
# Accuracy on the held-out test data.
# accuracy_score is documented as (y_true, y_pred). Accuracy happens to be
# symmetric, so the number is unchanged, but the documented order keeps this
# cell correct if the metric is later swapped for an asymmetric one
# (precision, recall, ...).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test rows the fitted model classified correctly.
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9264705882352942
