In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [44]:
# Define the haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
        Inputs must support element-wise arithmetic (e.g. floats, NumPy
        arrays); every operation is a NumPy ufunc, so array inputs are
        handled element-wise.

    Returns
    -------
    The haversine distance(s) in kilometers on a sphere of radius 6371 km.
    """
    R = 6371  # radius of Earth in kilometers
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))



In [45]:
# Read the train and test splits from their CSV files in the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('fraudTrain.csv', 'fraudTest.csv'))


In [46]:
# Lift the column display limit (global pandas option) so wide frames are shown in full
pd.set_option('display.max_columns', None)
train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [48]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [49]:
train.shape

(1296675, 23)

In [50]:
test.shape

(555719, 23)

In [51]:
# Count rows per `is_fraud` label in the training frame (class balance check before resampling)
train['is_fraud'].value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [52]:
# Balance the training set by undersampling the legitimate (is_fraud == 0)
# rows down to the number of fraud rows.
legit_train = train[train.is_fraud == 0]
fraud_train = train[train.is_fraud == 1]
# Derive the sample size from the data rather than hard-coding it, and fix the
# seed so that Restart & Run All draws the same rows every time.
legit_train_sample = legit_train.sample(n=len(fraud_train), random_state=42)
train = pd.concat([legit_train_sample, fraud_train], axis=0)
train.shape

(15012, 23)

In [53]:
# Count rows per `is_fraud` label in the test frame (class balance check before resampling)
test['is_fraud'].value_counts()

is_fraud
0    553574
1      2145
Name: count, dtype: int64

In [54]:
# Undersample the legitimate (is_fraud == 0) test rows to match the number of
# fraud rows, mirroring what is done for the training set.
# NOTE(review): evaluating on a 50/50 test set does not reflect the original
# class ratio, so the reported metrics (precision in particular) are likely
# optimistic; consider also scoring the full, unsampled test frame.
legit_test = test[test.is_fraud == 0]
fraud_test = test[test.is_fraud == 1]
# Size taken from the data and seed fixed so the evaluation set is reproducible.
legit_test_sample = legit_test.sample(n=len(fraud_test), random_state=42)
test = pd.concat([legit_test_sample, fraud_test], axis=0)
test.shape

(4290, 23)

In [55]:
# Remove the columns that are not used as model inputs; the same list is
# applied to both splits so they keep an identical schema.
unused_columns = [
    'Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'job', 'trans_num',
    'zip', 'lat', 'long', 'unix_time', 'merch_lat', 'merch_long', 'gender',
]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)


In [56]:
train.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,city_pop,dob,is_fraud
563213,2019-08-27 18:12:05,fraud_Schumm PLC,health_fitness,56.89,Lake Jackson,TX,28739,1999-12-27,0
1063634,2020-03-17 08:45:30,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,182.23,Grandview,TX,5875,1992-07-24,0
790243,2019-12-02 00:48:56,"fraud_Schaefer, McGlynn and Bosco",gas_transport,71.72,Sontag,MS,1196,1958-06-11,0
938526,2020-01-09 08:13:03,"fraud_Jenkins, Hauck and Friesen",gas_transport,58.28,Glendale,CA,172817,1982-07-30,0
1038422,2020-03-06 20:22:04,fraud_Pagac LLC,shopping_pos,1.51,San Antonio,TX,1595797,1995-10-17,0


## Feature Engineering

In [57]:
def preprocess_data(data):
    """Derive the `age` feature and drop the raw date columns.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `trans_date_trans_time` and `dob` columns holding values
        that `pd.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        `data` without `trans_date_trans_time` and `dob`, plus an `age`
        column appended as the last column.
    """
    # Parse into local Series instead of overwriting the caller's columns.
    trans_time = pd.to_datetime(data['trans_date_trans_time'])
    birth_date = pd.to_datetime(data['dob'])

    # Age is the difference in calendar years at transaction time; it does not
    # check whether the birthday has already occurred that year.
    age = trans_time.dt.year - birth_date.dt.year

    # Distance between cardholder and merchant is currently disabled:
    # data['distance'] = haversine(data['lat'], data['long'], data['merch_lat'], data['merch_long'])

    # Drop the raw date columns now that `age` has been derived from them.
    return data.drop(columns=['trans_date_trans_time', 'dob']).assign(age=age)

In [58]:
# Run the same feature engineering over both splits
train, test = preprocess_data(train), preprocess_data(test)

In [59]:
train.head()

Unnamed: 0,merchant,category,amt,city,state,city_pop,is_fraud,age
563213,fraud_Schumm PLC,health_fitness,56.89,Lake Jackson,TX,28739,0,20
1063634,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,182.23,Grandview,TX,5875,0,28
790243,"fraud_Schaefer, McGlynn and Bosco",gas_transport,71.72,Sontag,MS,1196,0,61
938526,"fraud_Jenkins, Hauck and Friesen",gas_transport,58.28,Glendale,CA,172817,0,38
1038422,fraud_Pagac LLC,shopping_pos,1.51,San Antonio,TX,1595797,0,25


## Separate the features and the target variable

In [60]:
# Features are every column except the `is_fraud` label; the label is kept as a Series
X_train, y_train = train.drop(columns=['is_fraud']), train['is_fraud']
X_test, y_test = test.drop(columns=['is_fraud']), test['is_fraud']

## Preprocess categorical variables and standardize numerical variables


In [61]:
# Column groups routed to each transformer
numerical_features = ['amt', 'city_pop', 'age']
categorical_features = ['category', 'state', 'merchant', 'city']

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
scale_numeric = ('num', StandardScaler(), numerical_features)
encode_categorical = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])


In [62]:
# # Define individual models
# clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
# clf2 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
# clf3 = LGBMClassifier(random_state=42)
# clf4 = CatBoostClassifier(random_state=42, verbose=0)

## Create and train the model pipeline


In [63]:
# End-to-end pipeline: column preprocessing followed by an XGBoost classifier.
# `use_label_encoder` is no longer passed: it is a deprecated no-op in current
# XGBoost releases and only triggers a warning; the labels here are already 0/1.
# Other estimators (e.g. RandomForest, LightGBM, CatBoost, or a soft-voting
# ensemble of them) can be swapped into the 'classifier' step.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, eval_metric='logloss')),
])

# Fitting the pipeline also fits the `preprocessor` object defined above,
# because the pipeline holds a reference to it rather than a copy.
model.fit(X_train, y_train)

## Evaluate the model performance

In [64]:
# Predict labels for the held-out test features
y_pred = model.predict(X_test)

# Print the confusion matrix, then the per-class precision / recall / F1 report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[2055   90]
 [  61 2084]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2145
           1       0.96      0.97      0.97      2145

    accuracy                           0.96      4290
   macro avg       0.96      0.96      0.96      4290
weighted avg       0.96      0.96      0.96      4290



In [65]:
import pickle

# Persist both artifacts: the full fitted pipeline, and the preprocessor on its own.
artifacts = (
    ('fraud_detection_model_xgb.pkl', model),
    ('preprocessor_xgb.pkl', preprocessor),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
