In [4]:
import timeit
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import streamlit as st
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef
from imblearn.over_sampling import SMOTE

# Silence every warning for the remainder of the run.
warnings.filterwarnings(action="ignore")

# Folder that receives the pickled models; created on demand, reused if present.
models_dir = "models"
os.makedirs(name=models_dir, exist_ok=True)

# Page heading of the Streamlit app.
st.title("Credit Card Fraud Detection - Model Training")

# Cached dataset loader
@st.cache_data
def load_data():
    """Read ``creditcard.csv`` (relative path) into a pandas DataFrame.

    The function is decorated with ``st.cache_data``, so Streamlit caches the
    returned frame instead of re-reading the CSV on every call.
    """
    return pd.read_csv('creditcard.csv')

df = load_data()

# Sidebar toggle: when ticked, show the first 100 rows, the frame's shape and
# the output of describe().
show_dataframe = st.sidebar.checkbox('Show what the dataframe looks like')
if show_dataframe:
    preview_rows = df.head(100)
    st.write(preview_rows)
    st.write('Shape of the dataframe: ', df.shape)
    summary_stats = df.describe()
    st.write('Data description:', summary_stats)

# Fraud and Valid Transaction Analysis
# Rows are partitioned on the 'Class' label: 1 is counted as fraud, 0 as valid.
fraud = df[df['Class'] == 1]
valid = df[df['Class'] == 0]

# Percentage of fraud among ALL transactions. The previous denominator was the
# number of valid rows, which yields the fraud-to-valid ratio rather than the
# share the message below reports, and raised ZeroDivisionError when no valid
# rows were present. An empty frame now reports 0.0.
total_transactions = len(df)
outlier_percentage = (
    (len(fraud) / total_transactions) * 100 if total_transactions else 0.0
)

# Sidebar toggle: show the fraud share and the raw counts of each class.
if st.sidebar.checkbox('Show fraud and valid transaction details'):
    st.write(f'Fraudulent transactions are: {outlier_percentage:.3f}%')
    st.write('Fraud Cases:', len(fraud))
    st.write('Valid Cases:', len(valid))

# Splitting the features and labels
# Every column except 'Class' is used as a feature; 'Class' is the target.
X = df.drop(columns=['Class'])
y = df['Class']

# Train-test split
# The test fraction is chosen in the sidebar, between 0.2 and 0.4.
size = st.sidebar.slider('Test Set Size', min_value=0.2, max_value=0.4)
# Stratify on the label so the train and test sets keep the same class
# proportions as the full data; with a plain random split the share of the
# rare positive class can differ noticeably between the two sets.
# random_state is fixed so the split is reproducible for a given size.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=size, random_state=42, stratify=y
)

# Build the four classifiers
# Both tree ensembles are configured with the same hyper-parameters.
ensemble_params = {
    'n_estimators': 50,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
logreg = LogisticRegression()
knn = KNeighborsClassifier()
rforest = RandomForestClassifier(**ensemble_params)
etree = ExtraTreesClassifier(**ensemble_params)

# Rebalance the training split with SMOTE
# Only X_train / y_train are resampled; the test split is not passed to SMOTE.
smt = SMOTE(random_state=42)
resampled = smt.fit_resample(X_train, y_train)
X_train_bal, y_train_bal = resampled

# Registry of models: display name -> (estimator, pickle path inside models_dir)
models = {
    display_name: (estimator, os.path.join(models_dir, pickle_name))
    for display_name, estimator, pickle_name in (
        ('Logistic Regression', logreg, 'logistic_regression.pkl'),
        ('kNN', knn, 'knn.pkl'),
        ('Random Forest', rforest, 'random_forest.pkl'),
        ('Extra Trees', etree, 'extra_trees.pkl'),
    )
}

# Fit every registered model on the SMOTE-balanced training data, report how
# long the fit took, then persist the fitted estimator with joblib.
st.write("Training and saving models...")
for model_name, model_entry in models.items():
    model, filename = model_entry
    st.write(f"Training {model_name}...")

    # Only the fit() call is timed; saving is excluded from the measurement.
    start_time = timeit.default_timer()
    model.fit(X_train_bal, y_train_bal)
    elapsed = timeit.default_timer() - start_time
    st.write(f"Training Time for {model_name}: {elapsed:.2f} seconds")

    # Write the fitted model to its .pkl file
    joblib.dump(model, filename)
    st.write(f"Model saved as '{filename}'")


2024-11-12 23:04:36.798 No runtime found, using MemoryCacheStorageManager
