###Importing the required modules

In [102]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os

In [89]:
pip install opendatasets --upgrade



###Loading the dataset

In [49]:
import opendatasets as od
download_url =('https://www.kaggle.com/datasets/kelvinkelue/credit-card-fraud-prediction')
od.download(download_url)

Skipping, found downloaded files in "./credit-card-fraud-prediction" (use force=True to force download)


fbb662301487caed1a3021a42e342d13fbb662301487caed1a3021a42e342d13

In [50]:
data = pd.read_csv('/content/credit-card-fraud-prediction/fraud test.csv')
data.head(2)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_time
0,0,21/06/2020 12:14,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,333497,Mechanical engineer,19/03/1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,21/06/2020,12:14
1,1,21/06/2020 12:14,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,302,"Sales professional, IT",17/01/1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,21/06/2020,12:14


###Exploring the Dataset

In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  float64
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [52]:
data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
trans_date               0
trans_time               0
dtype: int64

In [53]:
data.duplicated().sum()

0

###Feature Engineering


Splitting the date and time columns and deleting the combined colum

In [54]:
data[['trans_date', 'trans_time']] = data['trans_date_trans_time'].str.split(' ', expand=True)
data.to_csv('/content/credit-card-fraud-prediction/fraud test.csv', index=False)

Deleting the unwanted columns

In [55]:
data.drop(columns=['trans_date_trans_time','unix_time','Unnamed: 0','trans_num'], inplace =True)

In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 21 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   cc_num      555719 non-null  float64
 1   merchant    555719 non-null  object 
 2   category    555719 non-null  object 
 3   amt         555719 non-null  float64
 4   first       555719 non-null  object 
 5   last        555719 non-null  object 
 6   gender      555719 non-null  object 
 7   street      555719 non-null  object 
 8   city        555719 non-null  object 
 9   state       555719 non-null  object 
 10  zip         555719 non-null  int64  
 11  lat         555719 non-null  float64
 12  long        555719 non-null  float64
 13  city_pop    555719 non-null  int64  
 14  job         555719 non-null  object 
 15  dob         555719 non-null  object 
 16  merch_lat   555719 non-null  float64
 17  merch_long  555719 non-null  float64
 18  is_fraud    555719 non-null  int64  
 19  tr

Renaming the Columns

In [57]:
data=data.rename(columns={'cc_num': 'credit_card_number',
                            'amt': 'amount',
                            'first': 'first_name',
                            'last': 'last_name',
                            'lat':'latitude',
                            'long':'longitude',
                            'merch_lat':'merchant_latitude',
                            'merch_long':'merchant_longitude',
                            'trans_date':'transction_date',
                            'trans_time':'transaction_time'})

In [58]:
data.head(2)

Unnamed: 0,credit_card_number,merchant,category,amount,first_name,last_name,gender,street,city,state,...,latitude,longitude,city_pop,job,dob,merchant_latitude,merchant_longitude,is_fraud,transction_date,transaction_time
0,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,...,33.9659,-80.9355,333497,Mechanical engineer,19/03/1968,33.986391,-81.200714,0,21/06/2020,12:14
1,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,...,40.3207,-110.436,302,"Sales professional, IT",17/01/1990,39.450498,-109.960431,0,21/06/2020,12:14


In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   credit_card_number  555719 non-null  float64
 1   merchant            555719 non-null  object 
 2   category            555719 non-null  object 
 3   amount              555719 non-null  float64
 4   first_name          555719 non-null  object 
 5   last_name           555719 non-null  object 
 6   gender              555719 non-null  object 
 7   street              555719 non-null  object 
 8   city                555719 non-null  object 
 9   state               555719 non-null  object 
 10  zip                 555719 non-null  int64  
 11  latitude            555719 non-null  float64
 12  longitude           555719 non-null  float64
 13  city_pop            555719 non-null  int64  
 14  job                 555719 non-null  object 
 15  dob                 555719 non-nul

In [60]:
data['is_fraud']

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

### Creating the model

Creating X and Y variables as X=Features, Y=Target

In [70]:
target=data['is_fraud']

In [71]:
features = data.drop(columns=['transaction_time', 'transction_date', 'is_fraud', 'first_name', 'last_name', 'street'])

Identify categorical and numerical columns

In [72]:
categorical_cols = features.select_dtypes(include=['object']).columns.tolist()
numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns.tolist()

Define the preprocessing steps

In [73]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

Define the model pipeline

In [74]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

Split the data into training and testing sets

In [75]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42, stratify=target)

Train the model

In [77]:
model.fit(X_train, y_train)

Exporting the model to joblib for further predictions

In [94]:
joblib.dump(model, 'fraud_detection_model.pkl')

['fraud_detection_model.pkl']

Make predictions

In [78]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

Evaluate the model

In [79]:
report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

In [80]:
print(report)
print(f"ROC-AUC: {roc_auc}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    166072
           1       0.96      0.46      0.63       644

    accuracy                           1.00    166716
   macro avg       0.98      0.73      0.81    166716
weighted avg       1.00      1.00      1.00    166716

ROC-AUC: 0.9524433239911807
