In [1]:
#Importing the neccessary libraries for data handling
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
#Importing the dataset 
df = pd.read_csv(r"C:\Users\HP\Downloads\credit_card_fraud_10k.csv") 

In [3]:
df.head()

Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud
0,1,84.47,22,Electronics,0,0,66,3,40,0
1,2,541.82,3,Travel,1,0,87,1,64,0
2,3,237.01,17,Grocery,0,0,49,1,61,0
3,4,164.33,4,Grocery,0,1,72,3,34,0
4,5,30.53,15,Food,0,0,79,0,44,0


In [4]:
df.describe()

Unnamed: 0,transaction_id,amount,transaction_hour,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,175.949849,11.5933,0.0978,0.0857,61.7989,2.0089,43.4687,0.0151
std,2886.89568,175.392827,6.922708,0.297059,0.279935,21.487053,1.432559,14.979147,0.121957
min,1.0,0.0,0.0,0.0,0.0,25.0,0.0,18.0,0.0
25%,2500.75,50.905,6.0,0.0,0.0,43.0,1.0,30.0,0.0
50%,5000.5,122.095,12.0,0.0,0.0,62.0,2.0,44.0,0.0
75%,7500.25,242.48,18.0,0.0,0.0,80.0,3.0,56.0,0.0
max,10000.0,1471.04,23.0,1.0,1.0,99.0,9.0,69.0,1.0


In [5]:
df['is_fraud'].value_counts()

is_fraud
0    9849
1     151
Name: count, dtype: int64

Observation : Imbalanced dataset 

In [6]:
#Checking the nulls 
df.isna().sum()

transaction_id         0
amount                 0
transaction_hour       0
merchant_category      0
foreign_transaction    0
location_mismatch      0
device_trust_score     0
velocity_last_24h      0
cardholder_age         0
is_fraud               0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

Observation : It is cleaned dataset 

In [8]:
df.columns

Index(['transaction_id', 'amount', 'transaction_hour', 'merchant_category',
       'foreign_transaction', 'location_mismatch', 'device_trust_score',
       'velocity_last_24h', 'cardholder_age', 'is_fraud'],
      dtype='object')

In [9]:
df.drop('transaction_id', inplace=True, errors='ignore')

In [10]:
#Importing the pipeline to start the building the model 
from imblearn.pipeline import Pipeline

In [11]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler

In [12]:
#Creating a pipeline for the preprocessing of the categorical column 
steps = [('Encoder',OneHotEncoder())]
categorical_preprocessing = Pipeline(steps)

In [13]:
categorical_preprocessing

In [14]:
#Creating the pipeline for preprocessing of the numerical columns  
steps_1 = [('Scalar',StandardScaler())]
numerical_preprocessor = Pipeline(steps_1)

In [15]:
numerical_preprocessor

In [16]:
from sklearn.compose import ColumnTransformer 
df.columns 

Index(['transaction_id', 'amount', 'transaction_hour', 'merchant_category',
       'foreign_transaction', 'location_mismatch', 'device_trust_score',
       'velocity_last_24h', 'cardholder_age', 'is_fraud'],
      dtype='object')

In [17]:
#Splitting the numerical and categorical columns 
numerical_cols = ['amount','transaction_hour','foreign_transaction','location_mismatch','device_trust_score','velocity_last_24h','cardholder_age'] 
categorical_cols = ['merchant_category'] 

In [18]:
preprocessing = ColumnTransformer(
    transformers=[
        ('numerical', numerical_preprocessor, numerical_cols),
        ('categorical', categorical_preprocessing, categorical_cols) , 
    ]
)

In [19]:
preprocessing 


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
from imblearn.over_sampling import SMOTE
X = df.iloc[:,:-1] 
y = df.iloc[:,-1]  


In [22]:
#Splitting the data 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42) 

In [23]:
#Importing the model 
from sklearn.linear_model import LogisticRegression 

In [24]:
steps_2 = [('preprocessing',preprocessing) ,('smote',SMOTE()),('Logistic',LogisticRegression())]
pre_model = Pipeline(steps_2)

In [25]:
pre_model 

In [26]:
pre_model.fit(X_train,y_train)

In [60]:
pre_model.score(X_test,y_test) * 100

96.93939393939394

In [29]:
from sklearn.metrics import precision_score,recall_score,classification_report,confusion_matrix

In [30]:
y_pred = pre_model.predict(X_test)

In [31]:
#Evalution metrics 
cm = confusion_matrix(y_test,y_pred) 
cl_report = classification_report(y_test,y_pred) 
p_score = precision_score(y_pred,y_test) 
r_score = recall_score(y_pred,y_test) 

In [32]:
cm

array([[3145,   96],
       [   5,   54]])

In [61]:
p_score * 100

91.52542372881356

In [62]:
r_score * 100

36.0

In this particular problem the precision can be considered beacuase the dataset is unbalanced

In [35]:
#Importing another machine learning algorithm 
from sklearn.tree import DecisionTreeClassifier

In [38]:
steps_5 = [('preprocessor',preprocessing),('smote',SMOTE()),('decisiontree',DecisionTreeClassifier())]
model = Pipeline(steps_5)

In [39]:
model.fit(X_train,y_train)

In [40]:
model.score(X_test,y_test)

0.9896969696969697

In [41]:
y_pred_2 = model.predict(X_test)

In [42]:
cm_tree = confusion_matrix(y_pred_2,y_test) 
cl_report_tree = classification_report(y_pred_2,y_test) 
p_score_tree = precision_score(y_pred_2,y_test) 
r_score_tree = recall_score(y_pred_2,y_test)

In [43]:
cm_tree

array([[3225,   18],
       [  16,   41]])

In [44]:
cl_report_tree

'              precision    recall  f1-score   support\n\n           0       1.00      0.99      0.99      3243\n           1       0.69      0.72      0.71        57\n\n    accuracy                           0.99      3300\n   macro avg       0.84      0.86      0.85      3300\nweighted avg       0.99      0.99      0.99      3300\n'

In [45]:
p_score_tree

0.6949152542372882

In [46]:
r_score_tree

0.7192982456140351

In [47]:
#Importing the RandomForestclassifier
from sklearn.ensemble import RandomForestClassifier

In [48]:
steps_6 = [('preprocessor',preprocessing),('smote',SMOTE()),('decisiontree',RandomForestClassifier())]
random_model = Pipeline(steps_6)

In [49]:
random_model

In [50]:
random_model.fit(X_train,y_train)

In [53]:
random_model.score(X_test,y_test)

0.9906060606060606

In [54]:
y_pred_3 = random_model.predict(X_test)

In [56]:
random_cm = confusion_matrix(y_pred_3,y_test) 
cl_report_random = classification_report(y_pred_3,y_test) 
p_score_random = precision_score(y_pred_3,y_test) 
r_score_random = recall_score(y_pred_3,y_test)

In [57]:
random_cm

array([[3241,   31],
       [   0,   28]])

In [58]:
p_score_random

0.4745762711864407

In [59]:
r_score_random

1.0

COMPARING THE THREE MODELS PRECISION SCORE THEN THE LOGISTIC REGRESSION WORKS BETTER THAN THE DECISION TREE AND RANDOM FOREST 
