# Task 2 - Model Building and Training

In this notebook, we:
- Prepare data
- Build and compare Logistic Regression and XGBoost/RandomForest models
- Evaluate model performance on imbalanced datasets

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

## Load Datasets

In [4]:
# Load CreditCard dataset
creditcard_df = pd.read_csv('/content/drive/MyDrive/week 8/Data/creditcard.csv.zip')
creditcard_df['Class'] = creditcard_df['Class'].astype(int)

# Load Fraud_Data dataset
fraud_df = pd.read_csv('/content/drive/MyDrive/week 8/Data/Fraud_Data.csv')
fraud_df['class'] = fraud_df['class'].astype(int)

## Split Data into Train and Test Sets

In [7]:
# CreditCard dataset
X_cc = creditcard_df.drop('Class', axis=1)
y_cc = creditcard_df['Class']
X_cc_train, X_cc_test, y_cc_train, y_cc_test = train_test_split(X_cc, y_cc, test_size=0.3, random_state=42, stratify=y_cc)

# Fraud_Data dataset
X_fd = fraud_df.drop(['class', 'signup_time', 'purchase_time', 'device_id', 'source', 'browser', 'sex'], axis=1)
y_fd = fraud_df['class']
X_fd_train, X_fd_test, y_fd_train, y_fd_test = train_test_split(X_fd, y_fd, test_size=0.3, random_state=42, stratify=y_fd)

## Train Logistic Regression

In [8]:
# Train Logistic Regression on both datasets
lr_cc = LogisticRegression(max_iter=1000)
lr_cc.fit(X_cc_train, y_cc_train)

lr_fd = LogisticRegression(max_iter=1000)
lr_fd.fit(X_fd_train, y_fd_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Train XGBoost Classifier

In [9]:
# XGBoost on both datasets
xgb_cc = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_cc.fit(X_cc_train, y_cc_train)

xgb_fd = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_fd.fit(X_fd_train, y_fd_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



## Evaluation Function

In [10]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(f"AUC-PR: {average_precision_score(y_test, y_proba):.4f}")

## Evaluate Models

In [11]:
print("--- Logistic Regression on CreditCard ---")
evaluate_model(lr_cc, X_cc_test, y_cc_test)

print("\n--- XGBoost on CreditCard ---")
evaluate_model(xgb_cc, X_cc_test, y_cc_test)

print("\n--- Logistic Regression on Fraud_Data ---")
evaluate_model(lr_fd, X_fd_test, y_fd_test)

print("\n--- XGBoost on Fraud_Data ---")
evaluate_model(xgb_fd, X_fd_test, y_fd_test)

--- Logistic Regression on CreditCard ---
Confusion Matrix:
[[85279    16]
 [   47   101]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.86      0.68      0.76       148

    accuracy                           1.00     85443
   macro avg       0.93      0.84      0.88     85443
weighted avg       1.00      1.00      1.00     85443

F1 Score: 0.7623
AUC-PR: 0.6793

--- XGBoost on CreditCard ---
Confusion Matrix:
[[85288     7]
 [   37   111]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.75      0.83       148

    accuracy                           1.00     85443
   macro avg       0.97      0.87      0.92     85443
weighted avg       1.00      1.00      1.00     85443

F1 Score: 0.8346
AUC-PR: 0.8359

--- Logistic Regression on Fraud_Data ---
Confusion Matrix:
[[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Confusion Matrix:
[[41031    58]
 [ 4004   241]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41089
           1       0.81      0.06      0.11      4245

    accuracy                           0.91     45334
   macro avg       0.86      0.53      0.53     45334
weighted avg       0.90      0.91      0.87     45334

F1 Score: 0.1061
AUC-PR: 0.3593


## Model Comparison

Based on F1-Score and AUC-PR (which are robust to class imbalance), we choose the better performing model.