# Data Preprocessing

In [2]:
# read in libraries
try:
    import pandas as pd
    import numpy as np
    import statistics

    from imblearn.over_sampling import SMOTE
except:
    !pip install -r requirements.txt

In [3]:
# load the transactions from creditcard.csv (path is relative to the working
# directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [5]:
# sanity check: keep only the rows labelled as fraud (Class == 1) and count them
isFraudData = data.loc[data['Class'] == 1]
isFraudData.groupby(by=['Class'])['Class'].count()

Class
1    492
Name: Class, dtype: int64

In [6]:
# Split boundaries: index labels of the last row of the train set and of the
# first test batch, located by their 'Time' value.
#
# Kept for reference -- a random split that is NOT used by this notebook:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=440)
#
# Total: 284,807 counts, 172,792 seconds (~ 48 hours)
# Train: 153,942 counts, 100,792 seconds(~ 28 hours)
# Test 1: 53,744 counts, 36,000 seconds (10 hours)
# Test 2: 77,121 counts, 36,000 seconds(10 hours)
# Noise: 65,432 counts (avg of Test 1 and Test 2)
#
# NOTE(review): the table above says 100,792 seconds for the train window while
# the lookup below uses Time == 100774 -- presumably the last recorded 'Time'
# before the cutoff; confirm.
#
# Both boundaries take the LAST matching row and convert that scalar to a plain
# int. Calling int() directly on the index array only works for exactly one
# match and is deprecated for 1-d arrays in recent NumPy releases.
train_end_index = int(data[data['Time'] == 100774].index[-1])  # 153941
test1_end_index = int(data[data['Time'] == 136792].index[-1])  # 207685 - 153941 = 53744

In [7]:
# target: the 'Class' column (fraud or not)
y = data['Class']
# features: every remaining column except 'Time'
X = data.drop(['Class', 'Time'], axis=1)

In [8]:
# split features and labels into three consecutive, non-overlapping row ranges:
#   train  : rows [0, train_end_index]
#   test 1 : rows (train_end_index, test1_end_index]
#   test 2 : rows after test1_end_index
X_train, y_train = X.iloc[:train_end_index + 1], y.iloc[:train_end_index + 1]
X_test1, y_test1 = (
    X.iloc[train_end_index + 1:test1_end_index + 1],
    y.iloc[train_end_index + 1:test1_end_index + 1],
)
X_test2, y_test2 = X.iloc[test1_end_index + 1:], y.iloc[test1_end_index + 1:]

In [9]:
# creating randomly sampled test batch with added noise (10%)
# Pool = every row after the train window (test 1 + test 2 together).
test = data.iloc[train_end_index + 1:]
# Batch size is the rounded mean of the two test-batch lengths.
avg_test_length = round(statistics.mean([len(X_test1), len(X_test2)]))
# Sample with replacement; sample() returns a new frame, so the assignment
# below does not modify `data`.
noise = test.sample(random_state = 123, n = avg_test_length, replace = True)
# adding noise: draw a 0/1 mask with P(1) = 0.1 from a seeded generator, so the
# same labels are flipped on every run (the draw was previously unseeded).
noise_mod = np.random.default_rng(123).binomial(1, 0.1, noise.shape[0])
# |label - mask| flips a 0/1 label wherever the mask is 1 and keeps it otherwise.
noise['Class'] = np.abs(noise['Class'].values - noise_mod)

In [10]:
# split the noisy batch the same way as the other sets:
# label column on one side, all columns except 'Class' and 'Time' on the other
y_noise = noise['Class']
X_noise = noise.drop(["Class", 'Time'], axis=1)

In [11]:
# class balance of the training labels BEFORE oversampling (rows per Class value)
y_train.groupby(by=y_train).size()

Class
0    153609
1       333
Name: Class, dtype: int64

In [12]:
# oversample the training set with SMOTE
# sampling_strategy=0.3 targets a 3:10 ratio between the two classes
sm = SMOTE(sampling_strategy=0.3, random_state=123)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [13]:
# class balance of the training labels AFTER oversampling (rows per Class value)
y_train.groupby(by=y_train).size()

Class
0    153609
1     46082
Name: Class, dtype: int64

In [14]:
# class balance of the Test 1 labels (rows per Class value)
y_test1.groupby(by=y_test1).size()

Class
0    53684
1       60
Name: Class, dtype: int64

In [15]:
# class balance of the Test 2 labels (rows per Class value)
y_test2.groupby(by=y_test2).size()

Class
0    77022
1       99
Name: Class, dtype: int64

In [16]:
# write every train / test / noise set to its own csv file (index not written)
for filename, frame in (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X1_test.csv', X_test1),
    ('y1_test.csv', y_test1),
    ('X2_test.csv', X_test2),
    ('y2_test.csv', y_test2),
    ('X_noise.csv', X_noise),
    ('y_noise.csv', y_noise),
):
    frame.to_csv(filename, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9dd4b1f4-9396-4609-8756-e5fddb45c25f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>