## Importing libraries:
---

In [2]:
import os
import zipfile
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import Image

# Folder (relative to the notebook) that holds the downloaded archive and the extracted CSV.
DATA_DIR = "../raw_data/"
# Where the downloaded dataset archive is stored.
ZIP_PATH = os.path.join(DATA_DIR, "creditcardfraud.zip")
# The CSV that is loaded into the DataFrame once the archive has been extracted.
CSV_PATH = os.path.join(DATA_DIR, "creditcard.csv")

In [3]:
# Fetch the dataset archive (only if it is not cached yet), extract it and
# load the CSV into `df`. Every step is skipped when its output already exists,
# so re-running the cell is cheap.
os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.exists(ZIP_PATH):
    url = "https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud"
    # Stream into a temporary file and move it into place only once complete:
    # an interrupted download must not leave a truncated archive at ZIP_PATH,
    # because its mere existence would prevent any later retry.
    partial_path = ZIP_PATH + ".part"
    # The timeout stops the cell from hanging forever on a stalled connection;
    # the context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(partial_path, ZIP_PATH)
            print("Succesfully downloaded.")
        else:
            print(f"Failed to download the file. Status code: {response.status_code}")

if not os.path.exists(CSV_PATH) and os.path.exists(ZIP_PATH):
    with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
        zip_ref.extractall(DATA_DIR)
    print("Data extracted succesfully.")

if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    print("DataFrame loaded succesfully.")
else:
    # Every later cell needs `df`; fail here with a clear message instead of
    # letting the next cell die with an unrelated NameError.
    raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")

DataFrame loaded succesfully.


The following diagram helps to choose the most suitable evaluation metric for an imbalanced classification problem such as this one:

![metrics.png](https://machinelearningmastery.com/wp-content/uploads/2019/12/How-to-Choose-a-Metric-for-Imbalanced-Classification-latest.png)

source: [MachineLearningMastery](https://machinelearningmastery.com/tour-of-evaluation-metrics-for-imbalanced-classification/)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Stratified splitting:
---

In [5]:
# Separate the label from the predictors.
# `Class` is the label: 1 marks a fraudulent transaction, 0 a genuine one.
# Every remaining column is kept as an input feature.
y = df['Class']
X = df.drop(columns='Class')

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# fraud/genuine ratio the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Oversample/undersample before or after splitting data?
---
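Main rule: **always** after splitting, and only on the training set. Resampling before the split lets copies (or synthetic neighbours) of the same transactions end up in both the training and the test set, which leaks information and makes the test scores overly optimistic.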

### Feature Scaling:
---

In [7]:
from typing import List
from sklearn.preprocessing import StandardScaler

# Creating function for scaling
def Standard_Scaler(df: pd.DataFrame, col_names: List[str], scaler=None) -> pd.DataFrame:
    """
    Standardizes the selected columns of a DataFrame using StandardScaler.

    The input frame is left untouched; a scaled copy is returned. This keeps
    the cell idempotent and avoids pandas' SettingWithCopyWarning when `df`
    is a slice of another frame (e.g. the output of a train/test split).

    Args:
        df (pd.DataFrame): The DataFrame containing the features to be scaled.
        col_names (list): List of column names to be scaled.
        scaler (StandardScaler, optional): An already fitted scaler to reuse.
            Pass the scaler fitted on the training data when scaling
            validation/test data so that both share the same statistics.
            When omitted, a new scaler is fitted on `df[col_names]`.
    Returns:
        pd.DataFrame: A copy of `df` with the specified columns scaled.
    """
    scaled_df = df.copy()

    # Nothing to scale: return the copy instead of letting the scaler fail on
    # an array with zero feature columns.
    if len(col_names) == 0:
        return scaled_df

    raw_values = scaled_df[col_names].values
    if scaler is None:
        scaler = StandardScaler().fit(raw_values)
    scaled_df[col_names] = scaler.transform(raw_values)

    return scaled_df

In [8]:
# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data would put the two splits on
# different scales and leak test-set statistics into preprocessing.
amount_scaler = StandardScaler().fit(X_train[['Amount']].values)

# `assign` returns new frames, so the split outputs are never modified in place.
X_train = X_train.assign(Amount=amount_scaler.transform(X_train[['Amount']].values).ravel())
X_test = X_test.assign(Amount=amount_scaler.transform(X_test[['Amount']].values).ravel())

## Exploratory Data Analysis:
---

In [9]:
# Display names for the two values of `Class`, listed in class order:
# 0 -> "Genuine", 1 -> "Fraud".
labels=["Genuine","Fraud"]

# Count the rows per class and pin the order to [0, 1]. `value_counts()` sorts
# by frequency, so without this the counts only line up with `labels` as long
# as genuine transactions are the majority. `fill_value=0` also covers a frame
# that contains a single class.
class_counts = df["Class"].value_counts().reindex([0, 1], fill_value=0)
fraud_or_not = class_counts.tolist()
values = [fraud_or_not[0], fraud_or_not[1]]

fig = px.pie(values=values, names=labels, width=700, height=400, color_discrete_sequence=["skyblue","black"],
             title="Fraud vs Genuine transactions")
fig.show()