In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [41]:
# Load the data
train_df = pd.read_csv('../Data/fraudTrain.csv')
test_df = pd.read_csv('../Data/fraudTest.csv')

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df.groupby('is_fraud', group_keys=False).apply(lambda x: x.sample(frac=.5))
test_df.groupby('is_fraud', group_keys=False).apply(lambda x: x.sample(frac=.5))

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
245973,245973,2020-09-19 01:08:50,639023984367,"fraud_Goldner, Kovacek and Abbott",grocery_pos,108.73,Destiny,Lowe,F,79472 Stevens Trace Apt. 120,...,44.6087,-74.9732,69,Chief Executive Officer,1991-06-05,e28bfa4c999cbde1f40fdd1540fc2dc4,1379552930,45.051753,-74.908134,0
358239,358239,2020-11-07 11:40:42,4195832462863385,fraud_Stracke-Lemke,grocery_pos,143.57,Victoria,Mcbride,F,0688 Kevin Manor,...,39.3465,-90.9362,1146,Exercise physiologist,1929-04-07,961630a8088f6bcb0f5e4f6b9f4f4304,1383824442,39.773258,-91.648532,0
190811,190811,2020-08-27 02:49:03,345389171551808,fraud_Bailey-Morar,grocery_pos,29.09,Justin,Fowler,M,5569 Phillips Neck Apt. 003,...,33.9215,-89.6782,3451,Financial trader,1984-05-19,adab53b21395078f6e1678895dc07c41,1377571743,33.044638,-88.740993,0
15354,15354,2020-06-26 16:07:20,213126662687660,fraud_Cruickshank-Mills,entertainment,1.67,Christopher,Luna,M,242 Brian Mountain,...,27.5155,-99.4986,248858,Video editor,1971-01-28,25be59f07684d05b8cae2fd822fd56f5,1372262840,27.690353,-100.108082,0
62455,62455,2020-07-13 02:01:26,676173792455,"fraud_Kutch, Hermiston and Farrell",gas_transport,81.92,Brittany,Cox,F,07177 William Dale Apt. 547,...,34.0287,-118.4924,92043,"Civil engineer, contracting",1961-04-25,fe62f03f8706ac202cd0d58df7eb7a2d,1373680886,34.976872,-118.319913,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371943,371943,2020-11-13 01:50:15,3535808924394848,"fraud_Osinski, Ledner and Leuschke",grocery_pos,333.73,Matthew,Myers,M,4936 Michelle Points,...,39.3900,-88.9597,1532,Radio broadcast assistant,1980-01-09,634e53c3c460c15a60a4c28720ded907,1384307415,39.249456,-89.353877,1
105325,105325,2020-07-27 21:20:53,4671727014157745,fraud_Block-Hauck,travel,5.26,Kenneth,Edwards,M,3653 Ryan Crossroad,...,40.8618,-85.6067,2304,Retail banker,1955-07-25,c454f9730fbbe00ce5f2f5376ff29d00,1374960053,40.176601,-85.627122,1
359495,359495,2020-11-08 01:04:05,2475085306462014,fraud_Cartwright-Harris,grocery_pos,289.30,John,Miller,M,153 Mccullough Springs Apt. 857,...,44.2378,-95.2739,1507,Land/geomatics surveyor,1993-10-12,889e6ee72ad30e46dae55fce2da9e185,1383872645,45.087537,-95.220372,1
359492,359492,2020-11-08 01:03:01,630469040731,fraud_Kutch and Sons,grocery_pos,326.03,Meredith,Ayala,F,7107 Henderson Station,...,45.6710,-121.8686,1288,Barrister,1936-05-01,c1d6a28bc8e919c9730556fa5fd706c8,1383872581,45.970483,-122.519803,1


In [42]:
# Drop the columns that are not needed
columns_to_drop = ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']

print('Columns dropped: ', (columns_to_drop))

train_df = train_df.drop(columns_to_drop, axis=1)
test_df = test_df.drop(columns_to_drop, axis=1)

Columns dropped:  ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']


In [43]:
# Fix the date column

# Convert the date-time column to a datetime object
train_df['date'] = pd.to_datetime(train_df['trans_date_trans_time'])

# Extract the parts of the date
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_df['second'] = train_df['date'].dt.second

# Now you can drop the original date-time column
train_df = train_df.drop(['date', 'trans_date_trans_time'], axis=1)

# Convert the date-time column to a datetime object
test_df['date'] = pd.to_datetime(test_df['trans_date_trans_time'])

# Extract the parts of the date
test_df['year'] = test_df['date'].dt.year
test_df['month'] = test_df['date'].dt.month
test_df['day'] = test_df['date'].dt.day
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute
test_df['second'] = test_df['date'].dt.second

# Now you can drop the original date-time column
test_df = test_df.drop(['date', 'trans_date_trans_time'], axis=1)

In [44]:
# Convert the categorical columns to numerical values

train_df['merchant'] = train_df['merchant'].astype('category')
train_df['merchant'] = train_df['merchant'].cat.codes

test_df['merchant'] = test_df['merchant'].astype('category')
test_df['merchant'] = test_df['merchant'].cat.codes

train_df['category'] = train_df['category'].astype('category')
train_df['category'] = train_df['category'].cat.codes

test_df['category'] = test_df['category'].astype('category')
test_df['category'] = test_df['category'].cat.codes


train_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)
test_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)

# train_df.head().to_csv('../Data/cleanedTrain.csv', index=False)
# test_df.head().to_csv('../Data/cleanedTest.csv', index=False)
len(train_df.columns), len(test_df.columns)

(18, 18)

In [45]:
# split the data into X and y

X_train = train_df.drop(['is_fraud'], axis=1)
y_train = train_df['is_fraud']
X_test = test_df.drop(['is_fraud'], axis=1)
y_test = test_df['is_fraud']

In [46]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2703186189652095,514,8,4.97,0,28654,36.0788,-81.1781,36.011293,-82.048315,2019,1,1,0,0,18
1,1,630423337322,241,4,107.23,0,99160,48.8878,-118.2105,49.159047,-118.186462,2019,1,1,0,0,44
2,2,38859492057661,390,0,220.11,1,83252,42.1808,-112.262,43.150704,-112.154481,2019,1,1,0,0,51
3,3,3534093764340240,360,2,45.0,1,59632,46.2306,-112.1138,47.034331,-112.561071,2019,1,1,0,1,16
4,4,375534208663984,297,9,41.96,1,24433,38.4207,-79.4629,38.674999,-78.632459,2019,1,1,0,3,6


In [47]:
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64

In [48]:
X_test.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2291163933867244,319,10,2.86,1,29209,33.9659,-80.9355,33.986391,-81.200714,2020,6,21,12,14,25
1,1,3573030041201292,591,10,29.84,0,84002,40.3207,-110.436,39.450498,-109.960431,2020,6,21,12,14,33
2,2,3598215285024754,611,5,41.28,0,11710,40.6729,-73.5365,40.49581,-74.196111,2020,6,21,12,14,53
3,3,3591919803438423,222,9,60.05,1,32780,28.5697,-80.8191,28.812398,-80.883061,2020,6,21,12,15,15
4,4,3526826139003047,292,13,3.19,1,49632,44.2529,-85.017,44.959148,-85.884734,2020,6,21,12,15,17


In [49]:
y_test

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

In [50]:
model = LogisticRegression()

model.fit(X_train, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
# Make predictions
y_pred = model.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
555714,0,0
555715,0,0
555716,0,0
555717,0,0


In [53]:
# Metrics
print('Logistic Regression')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Logistic Regression
Accuracy: 0.9961401355721147
Mean Absolute Error: 0.0038598644278853163
Mean Squared Error: 0.0038598644278853163
Root Mean Squared Error: 0.062127807203258965
Mean Absolute Percentage Error: 0.0038598644278853163
R2 Score: -0.003874820710510818
Confusion Matrix:
 [[553574      0]
 [  2145      0]]
