In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Loading and exploring the data

In [None]:
# loading the dataset to a Pandas DataFrame
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location —
# confirm/adjust it before running this notebook in another environment.
credit_card_data = pd.read_csv('/content/credit_data.csv')

In [None]:
# preview the first five rows of the dataset
credit_card_data.head(n=5)

In [None]:
# preview the last five rows of the dataset
credit_card_data.tail(n=5)

In [None]:
# dataset information: print pandas' summary of the DataFrame (DataFrame.info())
credit_card_data.info()

In [None]:
# count the missing (NaN) entries in every column
credit_card_data.isna().sum()

In [None]:
# how many transactions fall into each Class value
credit_card_data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

This dataset is highly imbalanced: only 492 of the 284,807 transactions (about 0.17%) are fraudulent.

### Label encoding of the data

The `Class` column already encodes each transaction as:

- 0 for a normal (legitimate) transaction
- 1 for a fraudulent transaction

In [None]:
# split the transactions by label: Class == 0 -> legit, Class == 1 -> fraud
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [None]:
# (rows, columns) of each subset, one per line
print(legit.shape, fraud.shape, sep='\n')

(284315, 31)
(492, 31)


## Finding legit and fraudulent transactions

In [None]:
# summary statistics of the Amount column for legit transactions
legit['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [None]:
# summary statistics of the Amount column for fraudulent transactions
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [None]:
# per-class mean of every column, to compare legit vs. fraudulent transactions
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,0.009824,-0.006576,0.010832,0.000189,0.012064,0.000161,0.007164,0.011535,0.003887,-0.001178,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,-5.676883,3.800173,-6.259393,-0.109334,-6.971723,-0.092929,-4.139946,-6.665836,-2.246308,0.680659,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Under-Sampling

Build a balanced sample dataset that contains the same number of normal and fraudulent transactions, by randomly drawing as many normal transactions as there are fraudulent ones.

Number of Fraudulent Transactions --> 492

In [None]:
# Under-sample the majority class: draw as many legit rows as there are fraud
# rows so the two classes end up balanced.
# - The count is taken from `fraud` instead of a hard-coded 492, so it stays
#   correct if the input data changes.
# - A fixed random_state makes the draw, and every result derived from it,
#   reproducible across kernel restarts.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating two DataFrames

In [None]:
# stack the sampled legit rows on top of all fraud rows (row-wise concatenation)
new_dataset = pd.concat(objs=[legit_sample, fraud], axis='index')

In [None]:
# preview the first five rows of the balanced dataset
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
203131,134666.0,-1.22022,-1.729458,-1.118957,-0.266099,0.823338,-0.098556,-0.407751,0.56301,-1.00779,0.261245,-0.841608,-0.041129,-0.628463,0.742288,-1.038836,-2.133763,0.200161,2.26439,0.79125,0.140809,0.237283,0.487028,0.286055,-0.119733,-0.909162,-0.11702,0.173995,-0.023852,155.0,0
95383,65279.0,-1.295124,0.157326,1.544771,-2.468209,-1.683113,-0.623764,-0.371798,0.505656,-2.243475,0.856381,-0.402158,-1.396842,-0.756093,0.014161,0.424519,-0.335512,0.863702,-0.542891,-1.189703,-0.277333,-0.415322,-0.894639,0.126543,0.296285,0.132186,-0.524334,0.317321,0.105345,70.0,0
99706,67246.0,-1.481168,1.22649,1.85755,2.980777,-0.672645,0.581449,-0.143172,0.302713,-0.62467,1.452271,0.940775,0.778863,0.423377,-0.291527,-0.439764,-0.173737,0.072368,0.575807,1.107665,0.075344,0.255337,0.948105,-0.186967,0.590834,-0.499863,0.203458,-0.546577,0.076538,40.14,0
153895,100541.0,-0.181013,1.395877,1.204669,4.349279,1.330126,1.27752,1.568221,-0.633374,-0.860482,1.483849,-0.040592,-3.117997,2.814195,1.224039,0.074473,-0.316746,0.485181,-0.058099,1.263988,0.334418,-0.456328,-0.687657,-0.049974,0.191566,-0.558483,-0.029382,-0.229857,-0.329608,137.04,0
249976,154664.0,0.475977,-0.573662,0.48052,-2.524647,-0.616284,-0.361317,-0.347861,-0.108238,-1.876507,0.871271,-1.201188,-0.741241,1.189017,-0.811912,-0.605718,-0.435814,0.234379,0.052987,-0.36286,-0.310337,-0.092778,0.187082,0.062234,0.653392,-0.399247,-0.28199,0.058961,0.012816,19.6,0


In [None]:
# preview the last five rows of the balanced dataset
new_dataset.tail(n=5)

In [None]:
# confirm the two classes now have the same number of rows
new_dataset.Class.value_counts()

1    492
0    492
Name: Class, dtype: int64

In [None]:
# per-class column means of the balanced dataset
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,96783.638211,-0.053037,0.05515,-0.036786,-0.046439,0.077614,-0.023218,-0.000703,-0.05762,-0.053438,0.006904,0.003593,-0.013208,0.020052,0.081527,-0.044844,0.028877,0.006312,0.009006,-0.020251,0.060121,0.017306,0.024803,-0.002469,0.036235,-0.061546,0.005988,-0.02793,0.004996,91.477053
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,-5.676883,3.800173,-6.259393,-0.109334,-6.971723,-0.092929,-4.139946,-6.665836,-2.246308,0.680659,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting the data into Features & Targets

In [None]:
# features: every column except the label; target: the label column itself
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.loc[:, 'Class']

In [None]:
# Preview the feature matrix. Ending the cell with the expression (rather than
# wrapping it in print()) lets the notebook render the DataFrame as a table,
# and .head() keeps the output to a few rows.
X.head()

In [None]:
# Preview the target labels. A bare last expression uses the notebook's own
# display instead of print(), and .head() keeps the output to a few rows.
Y.head()

Split the data into Training data & Testing Data

In [None]:
# hold out 20% of the rows for testing; stratify on Y so both splits keep the
# same class proportions, and fix random_state so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# shapes of the full, training and test feature matrices, on one line
print(*(frame.shape for frame in (X, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


### Training the model with Logistic Regression

In [None]:
# Binary classifier for legit (0) vs. fraud (1).
# The features are fed in unscaled (raw `Time` and `Amount` sit next to the
# small-magnitude V* columns), so the solver may hit scikit-learn's default
# iteration cap (max_iter=100) before converging and emit a ConvergenceWarning.
# Raising the cap gives the optimizer room to converge.
model = LogisticRegression(max_iter=1000)

In [None]:
# fit the logistic-regression model on the training split
model.fit(X=X_train, y=Y_train)

## Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# Accuracy itself is symmetric, so the value is unchanged, but the documented
# order keeps the call correct if it is ever swapped for an asymmetric metric.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# report the accuracy score computed on the training split
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9415501905972046


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# Accuracy itself is symmetric, so the value is unchanged, but the documented
# order keeps the call correct if it is ever swapped for an asymmetric metric.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# report the accuracy score computed on the held-out test split
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9390862944162437


The accuracy scores on the training data and on the test data are nearly the same, which suggests that the model is not overfitting the under-sampled training set.