In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
data.drop('Time',axis=1,inplace=True)

# let's get the total number of samples for each category

In [5]:
data[data['Class'] == 0].shape

(284315, 30)

In [6]:
data[data['Class'] == 1].shape

(492, 30)

In [7]:
# this function will return total unique values from each column
def unique_value_count(data):
    unique_values = dict()
    for column in data.columns:
        unique_values[column] = len(data[column].unique())
    
    return unique_values

In [8]:
unique_value_count(data)

{'V1': 275663,
 'V2': 275663,
 'V3': 275663,
 'V4': 275663,
 'V5': 275663,
 'V6': 275663,
 'V7': 275663,
 'V8': 275663,
 'V9': 275663,
 'V10': 275663,
 'V11': 275663,
 'V12': 275663,
 'V13': 275663,
 'V14': 275663,
 'V15': 275663,
 'V16': 275663,
 'V17': 275663,
 'V18': 275663,
 'V19': 275663,
 'V20': 275663,
 'V21': 275663,
 'V22': 275663,
 'V23': 275663,
 'V24': 275663,
 'V25': 275663,
 'V26': 275663,
 'V27': 275663,
 'V28': 275663,
 'Amount': 32767,
 'Class': 2}

In [9]:
# data['Time'].unique

In [10]:
# data.isna().sum()

In [11]:
# This is strange as the unique values are coming out to be same but lesser than total values of column

In [12]:
(data == 0).sum(axis=1).unique() # checking if any row has all the values as zeros

array([1, 2, 0], dtype=int64)

In [13]:
duplicate_rows = data[data.duplicated()]
duplicate_rows.shape[0] # getting total number of duplicated rows

9144

In [14]:
print(data.duplicated().sum())   # Getting total number of duplicated rows but differently 

9144


In [15]:
print(data.duplicated(keep='first').value_counts())  # Getting total number of duplicated rows but again differently 

False    275663
True       9144
dtype: int64


In [16]:
# dropping the duplicate rows
data.drop_duplicates(inplace=True,keep='first')

In [17]:
data.shape

(275663, 30)

In [18]:
data_label = np.array(data['Class']).reshape(data['Class'].shape[0],1)

In [19]:
data_label

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [20]:
print(data_label.shape)

(275663, 1)


# Let's do dimensionality reduction using PCA

In [21]:
from sklearn.decomposition import PCA

In [22]:
PCA_obj = PCA(n_components=16)

In [23]:
new_data_array = PCA_obj.fit_transform(data)

In [24]:
new_data_array.shape

(275663, 16)

In [25]:
data_label

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [26]:
new_data = pd.DataFrame(data=new_data_array)

In [27]:
new_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,59.042469,1.264717,-0.953073,-2.559258,1.102586,0.056444,-0.028142,-0.136403,0.371820,-0.194554,-0.670496,1.058637,-0.099136,0.195310,0.202303,-1.517385
1,-87.890544,-1.072583,0.123013,-0.195061,0.439708,0.054045,-0.032411,-0.063365,-0.256451,0.170405,1.707454,-0.889698,-0.526321,0.198785,0.147549,-0.600740
2,288.087806,0.783371,-1.090214,-2.053562,0.092831,-0.987747,0.628993,-0.542591,-1.487522,-0.427180,0.552639,-0.718009,0.274286,0.043461,0.430816,-2.464673
3,32.922736,0.875123,-0.848014,-1.636324,-0.966673,-0.993870,0.492019,-0.506463,-1.338224,0.032873,-0.257829,-0.585210,0.070405,0.189067,-0.386272,0.498132
4,-20.589188,1.249320,0.148723,-1.747384,0.176306,0.026978,-0.397913,0.173007,0.847575,-0.858395,-0.816950,-1.575170,0.183157,0.788469,-0.446539,-0.331035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275658,-89.815439,12.844695,11.658483,6.108744,-2.488223,7.494091,3.774636,-5.932871,0.985336,-2.287850,-1.181933,-0.680110,-2.287249,-6.460663,-2.430597,0.606362
275659,-65.789451,0.863875,-1.260137,-1.482111,-0.638587,-1.365549,0.257243,-0.353875,0.633605,0.954162,-0.063408,-1.589925,-0.106557,0.428097,-0.011523,-1.281257
275660,-22.699240,-1.871285,0.911788,3.082031,0.276534,-3.505082,2.038178,-0.939223,0.446143,0.534334,0.398815,0.118360,-0.110257,0.446304,0.049100,-1.483543
275661,-80.579461,0.346109,0.117191,-0.916490,0.606581,0.259812,0.893477,-0.553344,0.366577,0.509038,-1.997447,1.246786,0.162977,-0.510440,0.894472,-1.881723


In [28]:
new_data['class'] = data_label

In [29]:
new_data['class']

0         0
1         0
2         0
3         0
4         0
         ..
275658    0
275659    0
275660    0
275661    0
275662    0
Name: class, Length: 275663, dtype: int64

In [30]:
(new_data['class'].unique())

array([0, 1], dtype=int64)

# Let's apply SMOTE to balance the dataset

In [31]:
from imblearn.over_sampling import SMOTE

In [32]:
smote_obj = SMOTE(sampling_strategy='minority',k_neighbors=5)

In [33]:
X_oversampled, Y_oversampled = smote_obj.fit_resample(X=new_data.iloc[:,0:new_data.shape[1]-1],y=new_data['class'])

In [34]:
X_oversampled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,59.042469,1.264717,-0.953073,-2.559258,1.102586,0.056444,-0.028142,-0.136403,0.371820,-0.194554,-0.670496,1.058637,-0.099136,0.195310,0.202303,-1.517385
1,-87.890544,-1.072583,0.123013,-0.195061,0.439708,0.054045,-0.032411,-0.063365,-0.256451,0.170405,1.707454,-0.889698,-0.526321,0.198785,0.147549,-0.600740
2,288.087806,0.783371,-1.090214,-2.053562,0.092831,-0.987747,0.628993,-0.542591,-1.487522,-0.427180,0.552639,-0.718009,0.274286,0.043461,0.430816,-2.464673
3,32.922736,0.875123,-0.848014,-1.636324,-0.966673,-0.993870,0.492019,-0.506463,-1.338224,0.032873,-0.257829,-0.585210,0.070405,0.189067,-0.386272,0.498132
4,-20.589188,1.249320,0.148723,-1.747384,0.176306,0.026978,-0.397913,0.173007,0.847575,-0.858395,-0.816950,-1.575170,0.183157,0.788469,-0.446539,-0.331035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550375,-90.570610,1.477773,2.972829,2.295794,1.842033,2.095868,1.459639,-0.939788,-2.264662,4.777811,1.514532,2.561746,3.924947,5.543886,-1.210894,-1.492049
550376,-89.584801,2.725411,9.820452,7.645141,9.277473,6.954777,3.988109,-1.779686,-5.563546,12.333862,9.343189,9.382455,14.380714,19.727503,-7.188007,-2.903885
550377,9.418395,10.395544,11.882407,7.993042,3.238752,11.568084,6.077024,-5.674028,-2.646981,7.192553,1.722300,3.462371,4.802425,6.675856,-2.875237,-2.257954
550378,-89.585047,0.697323,6.011451,4.503779,6.914147,2.721089,-0.656176,-0.204638,-3.118933,6.467053,6.041302,5.822702,8.431408,12.037455,-5.025189,-0.759572


In [35]:
balanced_data = pd.DataFrame(X_oversampled)
balanced_data['class'] = Y_oversampled

In [36]:
balanced_data[balanced_data['class'] == 0].shape

(275190, 17)

In [37]:
balanced_data[balanced_data['class']==0].shape

(275190, 17)

# splitting the dataset into training, Cross-Validation and testing data

In [38]:
training_data_len = int(0.7*balanced_data.shape[0])
print(training_data_len)

cv_data_len = int(0.2*balanced_data.shape[0])
print(cv_data_len)

test_data_len = int(0.1*balanced_data.shape[0])
print(test_data_len)

385266
110076
55038


In [39]:
train_data_neg = balanced_data[balanced_data['class'] == 0].iloc[0:int(training_data_len//2)]

train_data_pos = balanced_data[balanced_data['class'] == 1].iloc[0:int(training_data_len//2)]

train_data = pd.concat([train_data_pos,train_data_neg])

In [40]:
cv_data_neg = balanced_data[balanced_data['class'] == 0].iloc[int(training_data_len//2):(int(training_data_len//2)+int(cv_data_len//2))]

cv_data_pos = balanced_data[balanced_data['class'] == 1].iloc[int(training_data_len//2):(int(training_data_len//2)+int(cv_data_len//2))]

cv_data = pd.concat([cv_data_neg,cv_data_pos])

In [41]:
test_data_neg = balanced_data[balanced_data['class'] == 0].iloc[(int(training_data_len//2)+int(cv_data_len//2)):]

test_data_pos =  balanced_data[balanced_data['class'] == 1].iloc[(int(training_data_len//2)+int(cv_data_len//2)):]

test_data = pd.concat([test_data_neg,test_data_pos])

In [42]:
train_data.shape

(385266, 17)

In [43]:
cv_data.shape

(110076, 17)

In [44]:
test_data.shape

(55038, 17)

# applying Gaussian Naive Bayes on training data

In [45]:
from sklearn.naive_bayes import GaussianNB

In [46]:
gaussian_nb = GaussianNB()

In [47]:
gaussian_nb.fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),y=np.array(train_data['class']))

GaussianNB()

# Finally predicting the labels using CV data and calculating the accuracy

In [48]:
predicted_labels_cv = gaussian_nb.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))

In [49]:
predicted_labels_cv

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_true=np.array(cv_data['class']),y_pred=predicted_labels_cv))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92     55038
           1       0.97      0.85      0.91     55038

    accuracy                           0.91    110076
   macro avg       0.92      0.91      0.91    110076
weighted avg       0.92      0.91      0.91    110076



In [52]:
test_data.shape

(55038, 17)

In [53]:
predicted_labels_test = gaussian_nb.predict(X=np.array(test_data.iloc[:,0:test_data.shape[1]-1]))

In [54]:
predicted_labels_test

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [55]:
from sklearn.metrics import classification_report

In [56]:
print(classification_report(y_true=np.array(test_data['class']),y_pred=predicted_labels_test))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     27519
           1       0.98      0.85      0.91     27519

    accuracy                           0.91     55038
   macro avg       0.92      0.91      0.91     55038
weighted avg       0.92      0.91      0.91     55038

