In [1]:
import kaggle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from typing import Any, Tuple
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB



In [2]:
def create_dataframe(location: str) -> pd.DataFrame:
    data = pd.read_csv(location)
    return data

In [3]:
def standardize_dataframe(data: pd.DataFrame) -> pd.DataFrame:
    sc = StandardScaler()
    data['Amount'] = sc.fit_transform(pd.DataFrame(data['Amount']))
    data = data.drop(['id'], axis = 1)
    data = data.drop_duplicates()
    return data

In [4]:
df = standardize_dataframe(create_dataframe('./dataset/creditcard_2023.csv'))

In [5]:
def split_dataset(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    X = data.drop('Class', axis = 1)
    y = data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = split_dataset(df)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [8]:
classifier = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors Classifier": KNeighborsClassifier(),
    # "Gaussian Naive Bayes": GaussianNB(),
    # "AdaBoost Classifier": AdaBoostClassifier(),
    # "Gradient Boosting Classifier": GradientBoostingClassifier(),
    # "Bagging Classifier": BaggingClassifier(),
    # "Extra Trees Classifier": ExtraTreesClassifier(),
    # "Stochastic Gradient Descent Classifier": SGDClassifier()
}

In [9]:
def classifiers() -> dict:
    return {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Classifier": SVC(),
        "K-Nearest Neighbors Classifier": KNeighborsClassifier(),
        # "Gaussian Naive Bayes": GaussianNB(),
        # "AdaBoost Classifier": AdaBoostClassifier(),
        # "Gradient Boosting Classifier": GradientBoostingClassifier(),
        # "Bagging Classifier": BaggingClassifier(),
        # "Extra Trees Classifier": ExtraTreesClassifier(),
        # "Stochastic Gradient Descent Classifier": SGDClassifier()
    }

In [10]:
def create_prediction_report(model_name: str, model: Any) -> pd.DataFrame:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    df_result = pd.DataFrame(report).transpose()
    df_result['model'] = model_name
    df_result['metrics'] = df_result.index
    df_result.reset_index(drop=True, inplace=True)
    return df_result

In [13]:
def create_whole_prediction_report() -> pd.DataFrame:
    df_result = pd.concat([create_prediction_report(name, clf) for name, clf in classifiers().items()])
    return df_result

In [14]:
create_whole_prediction_report()

Unnamed: 0,precision,recall,f1-score,support,model,metrics
0,0.952399,0.977955,0.965008,56794.0,Logistic Regression,0
1,0.977404,0.95124,0.964145,56932.0,Logistic Regression,1
2,0.964582,0.964582,0.964582,0.964582,Logistic Regression,accuracy
3,0.964901,0.964598,0.964576,113726.0,Logistic Regression,macro avg
4,0.964917,0.964582,0.964576,113726.0,Logistic Regression,weighted avg
0,0.998677,0.997042,0.997859,56794.0,Decision Tree Classifier,0
1,0.997054,0.998683,0.997868,56932.0,Decision Tree Classifier,1
2,0.997863,0.997863,0.997863,0.997863,Decision Tree Classifier,accuracy
3,0.997866,0.997862,0.997863,113726.0,Decision Tree Classifier,macro avg
4,0.997865,0.997863,0.997863,113726.0,Decision Tree Classifier,weighted avg
