In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
file_path = 'creditcard.csv'  # Update the file path if needed
data = pd.read_csv(file_path)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# normalize the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
df = data
# The function "len" counts the number of classes = 1 and saves it as an object "fraud_records"
fraud_records = len(df[df.Class == 1])

# Defines the index for fraud and non-fraud in the lines:
fraud_indices = df[df.Class == 1].index
not_fraud_indices = df[df.Class == 0].index

# Randomly collect equal samples of each type:
under_sample_indices = np.random.choice(not_fraud_indices, fraud_records, False)
df_undersampled = df.iloc[np.concatenate([fraud_indices, under_sample_indices]),:]
X_undersampled = df_undersampled.iloc[:,1:30]
Y_undersampled = df_undersampled.Class
X_undersampled_train, X_undersampled_test, Y_undersampled_train, Y_undersampled_test = train_test_split(X_undersampled, Y_undersampled, test_size = 0.3)




In [5]:
# check the size of X_undersampled_train and X_undersampled_test
print(X_undersampled_train.shape)
print(X_undersampled_test.shape)
print(Y_undersampled_train.shape)
print(Y_undersampled_test.shape)

(688, 29)
(296, 29)
(688,)
(296,)


In [6]:
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = n_iterations
        self.weights = None
        self.bias = None

    def _sigmoid(self, z):
        # Clip the input values to avoid overflow in the exponential function
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))


    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            model = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(model)

            dw = (1 / num_samples) * np.dot(X.T, (predictions - y))
            db = (1 / num_samples) * np.sum(predictions - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        model = np.dot(X, self.weights) + self.bias
        predictions = self._sigmoid(model)
        return [1 if i > 0.7 else 0 for i in predictions]


In [7]:
# train the model
model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X_undersampled_train, Y_undersampled_train)

In [8]:
# make predictions
y_pred = model.predict(X_undersampled_test)

In [9]:
# get the metrics
print("Accuracy:", accuracy_score(Y_undersampled_test, y_pred))
print("Precision:", precision_score(Y_undersampled_test, y_pred))
print("Recall:", recall_score(Y_undersampled_test, y_pred))
print("F1 Score:", f1_score(Y_undersampled_test, y_pred))
print("ROC AUC Score:", roc_auc_score(Y_undersampled_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_undersampled_test, y_pred))

Accuracy: 0.7027027027027027
Precision: 0.6458333333333334
Recall: 0.9810126582278481
F1 Score: 0.7788944723618091
ROC AUC Score: 0.6825353146211705
Confusion Matrix:
 [[ 53  85]
 [  3 155]]


In [10]:
# Load the data
data = pd.read_csv('creditcard.csv')

# Subsample the data such that 90& of the data is fradulent and 10% is non-fraudulent
fraudulent = data[data['Class'] == 1]
non_fraudulent = data[data['Class'] == 0]

# Randomly sample non-fraudulent transactions
non_fraudulent_sample = non_fraudulent.sample(n=len(fraudulent)*9, random_state=42)

# Combine the fraudulent and non-fraudulent samples
subsample = pd.concat([fraudulent, non_fraudulent_sample])

# Split the data into features and target
X = subsample.drop(['Class','Time'], axis=1)
y = subsample['Class'].values

# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.9, random_state=42)

In [11]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
# get the metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9677055103884372
Precision: 0.9875389408099688
Recall: 0.6951754385964912
F1 Score: 0.8159588159588159
ROC AUC Score: 0.8470841946255366
Confusion Matrix:
 [[3968    4]
 [ 139  317]]
