# Credit Card Fraud Detection

### **Comparing accuracies of Logistic Regression, Random Forests and Decision Trees**

In [None]:
# Importing Libraries

import numpy as np
import pandas as pd
import sklearn

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### **Load Training Data**

In [None]:
# Load training data from the Kaggle input directory into a DataFrame
df_train = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')
# Preview the first three rows
df_train.head(3)

### **Load Test Data**

In [None]:
# Load test data from the Kaggle input directory into a DataFrame
df_test = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')
# Preview the first three rows
df_test.head(3)

### **Data Preprocessing**

In [None]:
# Size of both dataframes, reported as (rows, columns)
print(f"The shape of train set: {df_train.shape}")
# Message previously read "Test shape of test set"; worded to match the line above.
print(f"The shape of test set: {df_test.shape}")

In [None]:
# Summary of the training frame: column names, non-null counts and dtypes
df_train.info()

In [None]:
# Number of null values in each column of the training frame
df_train.isnull().sum()

In [None]:
# Summary of the test frame: column names, non-null counts and dtypes
df_test.info()

In [None]:
# Number of null values in each column of the test frame
df_test.isnull().sum()

### **Cleaning/Encoding**

In [None]:
# Function to remove non-important columns
def clean_data(clean):
    """Drop unused columns and rows with missing values, modifying the frame in place.

    Parameters
    ----------
    clean : pandas.DataFrame
        Frame containing every column named in the drop list below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.

    Raises
    ------
    KeyError
        If any of the listed columns is absent (for example when the function
        is called a second time on an already-cleaned frame).
    """
    clean.drop(
        [
            "Unnamed: 0", 'cc_num', 'first', 'last', 'street', 'city',
            'state', 'zip', 'dob', 'trans_num', 'trans_date_trans_time',
        ],
        axis=1,
        inplace=True,
    )
    # dropna() returns a new frame by default; without inplace=True the result
    # was discarded and rows with missing values were silently kept.
    clean.dropna(inplace=True)
    return clean

In [None]:
# Function called on training data; df_train is modified in place and also returned
clean_data(df_train)

In [None]:
# Function called on test data; df_test is modified in place and also returned
clean_data(df_test)

In [None]:
# Columns with categorical values: select only the object-dtype columns of the training frame
df_train.select_dtypes(include = ['object'])

In [None]:
# Encoding to convert categorical data into numerical data
encoder = LabelEncoder()


def encode(data):
    """Replace the four categorical columns of ``data`` with integer codes, in place.

    The shared module-level ``encoder`` is re-fitted on each column in turn, so
    after a call it holds the fit for the last column processed ("job").

    NOTE(review): because the encoder is re-fitted on every call, the codes
    assigned to one frame are not guaranteed to match those assigned to
    another frame encoded separately.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with "merchant", "category", "gender" and "job" columns.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with those columns overwritten.
    """
    for column in ("merchant", "category", "gender", "job"):
        data[column] = encoder.fit_transform(data[column])
    return data

In [None]:
# Function called on training data: label-encodes its categorical columns in place
encode(df_train)

In [None]:
# Function called on test data: label-encodes its categorical columns in place
encode(df_test)

### **Visualise Data**

In [None]:
from matplotlib import pyplot as plt
# Number of rows per is_fraud value; value_counts() orders by descending frequency
exit_counts = df_train["is_fraud"].value_counts()
plt.figure(figsize=(7, 7))
# First slot of a 1x2 grid; only this slot is populated in this cell
plt.subplot(1, 2, 1)  # Subplot for the pie chart
# NOTE(review): labels are positional, so "No" is attached to the most frequent
# class and "YES" to the second — this assumes non-fraud rows are the majority.
# Percentages are shown rounded to whole numbers.
plt.pie(exit_counts, labels=["No", "YES"], autopct="%0.0f%%")
plt.title("is_fraud Counts")
plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()

In [None]:
import seaborn as sns
# Render floats with thousands separators and two decimals when pandas displays them
pd.options.display.float_format = "{:,.2f}".format

# Pairwise Pearson correlation between the columns of the training frame
corr_matrix = df_train.corr(method = 'pearson')

# Boolean mask over the upper triangle (diagonal included); masked cells are not
# drawn, so each column pair appears only once in the plot
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Zero out weak correlations (strictly between -0.3 and 0.3) so only the
# stronger relationships stand out
corr_matrix[(corr_matrix < 0.3) & (corr_matrix > -0.3)] = 0

cmap = "mako"

# the heatmap: fixed colour scale over [-1, 1], square cells, values annotated in each cell
sns.heatmap(corr_matrix, mask=mask, vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot_kws={"size": 9, "color": "black"}, square=True, cmap=cmap, annot=True)

### **Split Data and Train Model**

In [None]:
# Feature matrix: every column of the training frame except the label
x=df_train.drop(columns=['is_fraud'])
# Target vector: the is_fraud label column
y=df_train['is_fraud']

In [None]:
# Hold out 20% of the rows of df_train for evaluation, with a fixed seed for a repeatable split.
# Note that x_test / y_test come from df_train, not from the separately loaded df_test.
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Instantiate the three classifiers to compare, otherwise with default hyperparameters.
# random_state is pinned on the two tree-based models (same seed as the
# train/test split) so repeated runs report the same accuracies; previously
# their results could change from run to run.
model1 = LogisticRegression()
model2 = RandomForestClassifier(random_state=42)
model3 = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.metrics import accuracy_score

# Define a function for each metric
def acc_score(test, pred):
    """Return the accuracy of ``pred`` against the true labels ``test``.

    Thin wrapper around ``sklearn.metrics.accuracy_score``.
    """
    return accuracy_score(test, pred)

# Print the scores
def print_score(test, pred, model):
    """Print the classifier label followed by its accuracy.

    Parameters
    ----------
    test : array-like
        True labels.
    pred : array-like
        Predicted labels.
    model : object
        Label printed on the "Classifier:" line (formatted with ``str``).
    """
    print(f"Classifier: {model}")
    accuracy = acc_score(test, pred)
    print(f"ACCURACY: {accuracy}")

### **Logistic Regression**

In [None]:
# Fit the logistic regression model on the training split
model1.fit(x_train,y_train)

In [None]:
# Logistic regression predictions for the held-out split
y_pred = model1.predict(x_test)

In [None]:
# Report logistic regression accuracy on the held-out split
print_score(y_test, y_pred, "Logistic Regression")

In [None]:
# Result lists for the final comparison table, seeded with the logistic
# regression entry: class name of the estimator and accuracy rounded to 4 decimals
model_list = [model1.__class__.__name__]
acc_list = [round(acc_score(y_test, y_pred), 4)]

### **Random Forests**

In [None]:
# Fit the random forest model on the training split
model2.fit(x_train,y_train)

In [None]:
# Random forest predictions for the held-out split
y_pred1 = model2.predict(x_test)

In [None]:
# Report random forest accuracy on the held-out split
print_score(y_test,y_pred1,"Random Forest")

In [None]:
# Record the random forest entry for the comparison table
model_list.append(model2.__class__.__name__)
# Score the random forest predictions (y_pred1). y_pred holds the logistic
# regression output, so using it here duplicated that model's accuracy.
acc_list.append(round(acc_score(y_test, y_pred1), 4))

### **Decision Trees**

In [None]:
# Fit the decision tree model on the training split
model3.fit(x_train,y_train)

In [None]:
# Decision tree predictions for the held-out split
Y_Pred = model3.predict(x_test)

In [None]:
# Report decision tree accuracy on the held-out split
print_score(y_test, Y_Pred, "Decision Tree")

In [None]:
# Record the decision tree entry for the comparison table
model_list.append(model3.__class__.__name__)
# Round to 4 decimals like the other models so the table compares values at
# the same precision (this entry was previously rounded to 3).
acc_list.append(round(acc_score(y_test, Y_Pred), 4))

### **Comparison and Results**

In [None]:
# Comparison table: one row per model, with its name and recorded accuracy
model_results = pd.DataFrame(dict(Model=model_list, Accuracy_Score=acc_list))

In [None]:
# Display the comparison table as the cell output
model_results