In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [25]:
# Load the data
train_df = pd.read_csv('../Data/fraudTrain.csv')
test_df = pd.read_csv('../Data/fraudTest.csv')

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df.groupby('is_fraud', group_keys=False).apply(lambda x: x.sample(frac=.5))
test_df.groupby('is_fraud', group_keys=False).apply(lambda x: x.sample(frac=.5))

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
520203,520203,2020-12-23 20:10:46,3560797065840735,fraud_Kozey-Kuhlman,personal_care,12.84,Janet,Turner,F,0925 Lang Extensions,...,46.1838,-101.2589,77,Film/video editor,1989-12-17,28b26d872e4f6bca171efcbec6e00a87,1387829446,45.941629,-100.818558,0
377618,377618,2020-11-15 19:32:05,4939976756738216,fraud_Marvin-Lind,personal_care,21.94,Michelle,Johnston,F,3531 Hamilton Highway,...,26.4215,-99.0025,18128,IT trainer,1990-11-07,1bd154562afe8dcb08eb355ab20e98aa,1384543925,27.386636,-99.006909,0
493449,493449,2020-12-18 08:43:58,3533742182628021,fraud_Cassin-Harvey,grocery_net,38.05,Robert,Haynes,M,857 Aaron Circles Suite 398,...,32.8357,-79.8217,20478,Materials engineer,1997-06-04,764af265614b89277a14c1fe1f240c9a,1387356238,33.239137,-79.013882,0
454884,454884,2020-12-09 07:23:09,213155997615567,"fraud_Reilly, Heaney and Cole",gas_transport,76.61,James,Yoder,M,83359 Lopez Point,...,40.5503,-79.3237,168,Race relations officer,1978-10-04,c0f941884282774cf7525079c4e4276e,1386573789,41.386787,-79.526741,0
83152,83152,2020-07-20 09:00:18,3543591270174051,fraud_Bins-Rice,gas_transport,91.24,Margaret,Lam,F,6911 Nicholas Keys Apt. 237,...,40.4603,-79.0097,922,Early years teacher,1972-10-04,45fee0c008d5b9ff213c8baa402e8fb5,1374310818,40.705473,-79.980537,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327598,327598,2020-10-25 00:36:53,180046617132290,fraud_Ledner-Pfannerstill,gas_transport,10.00,Erika,Mason,F,083 Karen Island Apt. 656,...,37.5750,-88.9655,3119,"Teacher, secondary school",1942-04-17,102ac9a6057fe07d6d0a90ea519e16b4,1382661413,38.297067,-89.064806,1
111579,111579,2020-07-30 00:06:13,180011453250192,"fraud_Greenholt, Jacobi and Gleason",gas_transport,11.21,Craig,Dunn,M,721 Jacqueline Brooks,...,41.2153,-90.9879,1504,Manufacturing engineer,1993-10-05,eae5b35f33b8ba5895a691925baa8b77,1375142773,42.088340,-90.304455,1
461105,461105,2020-12-11 03:51:57,4560395181594436016,"fraud_Yost, Block and Koepp",misc_pos,8.33,Angela,Taylor,F,6343 Ramirez Skyway Apt. 518,...,39.0470,-122.9328,11256,Podiatrist,1972-10-18,858f16a84b02a891aa7e790a1f134a56,1386733917,38.107090,-121.933423,1
400054,400054,2020-11-24 23:11:45,4972228199573984,"fraud_Kihn, Abernathy and Douglas",shopping_net,1082.10,Brian,Hogan,M,3316 Cindy Land,...,34.2691,-95.9685,861,Quantity surveyor,1960-04-08,8ebf7f1820fd92be12c503a3adf85457,1385334705,33.505052,-96.859338,1


In [26]:
# Drop the columns that are not needed
columns_to_drop = ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']

print('Columns dropped: ', (columns_to_drop))

train_df = train_df.drop(columns_to_drop, axis=1)
test_df = test_df.drop(columns_to_drop, axis=1)

Columns dropped:  ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']


In [27]:
# Fix the date column

# Convert the date-time column to a datetime object
train_df['date'] = pd.to_datetime(train_df['trans_date_trans_time'])

# Extract the parts of the date
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_df['second'] = train_df['date'].dt.second

# Now you can drop the original date-time column
train_df = train_df.drop(['date', 'trans_date_trans_time'], axis=1)

# Convert the date-time column to a datetime object
test_df['date'] = pd.to_datetime(test_df['trans_date_trans_time'])

# Extract the parts of the date
test_df['year'] = test_df['date'].dt.year
test_df['month'] = test_df['date'].dt.month
test_df['day'] = test_df['date'].dt.day
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute
test_df['second'] = test_df['date'].dt.second

# Now you can drop the original date-time column
test_df = test_df.drop(['date', 'trans_date_trans_time'], axis=1)

In [28]:
# Convert the categorical columns to numerical values

train_df['merchant'] = train_df['merchant'].astype('category')
train_df['merchant'] = train_df['merchant'].cat.codes

test_df['merchant'] = test_df['merchant'].astype('category')
test_df['merchant'] = test_df['merchant'].cat.codes

train_df['category'] = train_df['category'].astype('category')
train_df['category'] = train_df['category'].cat.codes

test_df['category'] = test_df['category'].astype('category')
test_df['category'] = test_df['category'].cat.codes


train_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)
test_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)

# train_df.head().to_csv('../Data/cleanedTrain.csv', index=False)
# test_df.head().to_csv('../Data/cleanedTest.csv', index=False)

In [29]:
# split the data into X and y
# print(len(test_df))

# test_df = test_df.sample(frac=.5)

# print(len(test_df))

X_train = train_df.drop(['is_fraud'], axis=1)
y_train = train_df['is_fraud']
X_test = test_df.drop(['is_fraud'], axis=1)
y_test = test_df['is_fraud']

In [30]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2703186189652095,514,8,4.97,0,28654,36.0788,-81.1781,36.011293,-82.048315,2019,1,1,0,0,18
1,1,630423337322,241,4,107.23,0,99160,48.8878,-118.2105,49.159047,-118.186462,2019,1,1,0,0,44
2,2,38859492057661,390,0,220.11,1,83252,42.1808,-112.262,43.150704,-112.154481,2019,1,1,0,0,51
3,3,3534093764340240,360,2,45.0,1,59632,46.2306,-112.1138,47.034331,-112.561071,2019,1,1,0,1,16
4,4,375534208663984,297,9,41.96,1,24433,38.4207,-79.4629,38.674999,-78.632459,2019,1,1,0,3,6


In [31]:
y_train

0          0
1          0
2          0
3          0
4          0
          ..
1296670    0
1296671    0
1296672    0
1296673    0
1296674    0
Name: is_fraud, Length: 1296675, dtype: int64

In [32]:
X_test.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2291163933867244,319,10,2.86,1,29209,33.9659,-80.9355,33.986391,-81.200714,2020,6,21,12,14,25
1,1,3573030041201292,591,10,29.84,0,84002,40.3207,-110.436,39.450498,-109.960431,2020,6,21,12,14,33
2,2,3598215285024754,611,5,41.28,0,11710,40.6729,-73.5365,40.49581,-74.196111,2020,6,21,12,14,53
3,3,3591919803438423,222,9,60.05,1,32780,28.5697,-80.8191,28.812398,-80.883061,2020,6,21,12,15,15
4,4,3526826139003047,292,13,3.19,1,49632,44.2529,-85.017,44.959148,-85.884734,2020,6,21,12,15,17


In [33]:
y_test

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

In [34]:
# Create the model
model = KNeighborsClassifier(n_neighbors=1)

model.fit(X_train, y_train)

In [35]:
# Make predictions
y_pred = model.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
555714,0,0
555715,0,0
555716,0,0
555717,0,0


In [37]:
# Metrics 
print('KNN Classifier - K = 3')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

KNN Classifier - K = 3
Accuracy: 0.9954131494514313
Precision: 0.028037383177570093
Recall: 0.005594405594405594
F1 Score: 0.009327633113097552
Mean Absolute Error: 0.0045868505485686115
Mean Squared Error: 0.0045868505485686115
Root Mean Squared Error: 0.06772629141307393
Mean Absolute Percentage Error: 3371303563466.6562
Confusion Matrix:
 [[553158    416]
 [  2133     12]]
