In [None]:
# Credit Card Fraud Detection Project

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load datasets
df_train = pd.read_csv("/content/drive/MyDrive/fraudTrain.csv")
df_test = pd.read_csv("/content/drive/MyDrive/fraudTest.csv")

# Combine for preprocessing consistency
df = pd.concat([df_train, df_test], axis=0)

# Drop irrelevant columns
df.drop(columns=["Unnamed: 0", "trans_num", "first", "last", "dob", "merchant"], inplace=True)

# Convert date and extract hour
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["hour"] = df["trans_date_trans_time"].dt.hour
df.drop(columns=["trans_date_trans_time"], inplace=True)

# Encode categorical variables
cat_cols = ["gender", "category", "city", "state", "job"]
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Define features and target
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}

# Train and evaluate
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


ValueError: could not convert string to float: '4970 Michelle Burgs'

In [None]:
# Credit Card Fraud Detection Project

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load datasets
df_train = pd.read_csv("/content/drive/MyDrive/fraudTrain.csv")
df_test = pd.read_csv("/content/drive/MyDrive/fraudTest.csv")

# Combine for preprocessing consistency
df = pd.concat([df_train, df_test], axis=0)

# Drop irrelevant columns, including the street column which contains strings
df.drop(columns=["Unnamed: 0", "trans_num", "first", "last", "dob", "merchant", "street"], inplace=True)

# Convert date and extract hour
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["hour"] = df["trans_date_trans_time"].dt.hour
df.drop(columns=["trans_date_trans_time"], inplace=True)

# Encode categorical variables
cat_cols = ["gender", "category", "city", "state", "job"]
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Define features and target
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}

# Train and evaluate
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


=== Logistic Regression ===
[[368388    161]
 [  1930      0]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    368549
           1       0.00      0.00      0.00      1930

    accuracy                           0.99    370479
   macro avg       0.50      0.50      0.50    370479
weighted avg       0.99      0.99      0.99    370479

ROC-AUC: 0.8469547015190801

=== Decision Tree ===
[[368028    521]
 [   453   1477]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368549
           1       0.74      0.77      0.75      1930

    accuracy                           1.00    370479
   macro avg       0.87      0.88      0.88    370479
weighted avg       1.00      1.00      1.00    370479

ROC-AUC: 0.8819356610886183

=== Random Forest ===
[[368483     66]
 [   553   1377]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368549
 

In [None]:
# Credit Card Fraud Detection Project

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Load datasets
# Ensure these paths are correct for your environment
df_train = pd.read_csv("/content/drive/MyDrive/fraudTrain.csv")
df_test = pd.read_csv("/content/drive/MyDrive/fraudTest.csv")

# Combine for preprocessing consistency
df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True) # Reset index after concat

# Drop irrelevant columns
# Based on the last successful run and common practice, 'street' is also often dropped.
# We include it here based on the second code block in the history.
df.drop(columns=["Unnamed: 0", "trans_num", "first", "last", "dob", "merchant", "street"], inplace=True, errors='ignore') # Added errors='ignore' just in case column doesn't exist after previous runs

# Convert date and extract hour
# Check if 'trans_date_trans_time' column exists before processing
if "trans_date_trans_time" in df.columns:
    df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
    df["hour"] = df["trans_date_trans_time"].dt.hour
    df.drop(columns=["trans_date_trans_time"], inplace=True)
else:
    # If the column doesn't exist, it might mean this step was already done.
    # You might want to add a print statement or handle this scenario based on your logic.
    print("'trans_date_trans_time' column not found. Assuming time features were already extracted.")
    # If 'hour' was already extracted, ensure it exists
    if 'hour' not in df.columns:
        print("Warning: 'hour' column was also not found. Time features might be missing.")


# Encode categorical variables
cat_cols = ["gender", "category", "city", "state", "job"]
encoder = LabelEncoder()
for col in cat_cols:
    # Check if the column exists before encoding
    if col in df.columns:
        df[col] = encoder.fit_transform(df[col])
    else:
        print(f"Warning: Categorical column '{col}' not found.")


# Define features and target
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}

# Train and evaluate
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # Ensure model has predict_proba method before calling roc_auc_score
    if hasattr(model, 'predict_proba'):
        print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    else:
        print("ROC-AUC: N/A (Model does not support predict_proba)")


=== Logistic Regression ===
Confusion Matrix:
 [[368388    161]
 [  1930      0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    368549
           1       0.00      0.00      0.00      1930

    accuracy                           0.99    370479
   macro avg       0.50      0.50      0.50    370479
weighted avg       0.99      0.99      0.99    370479

ROC-AUC: 0.8469547015190801

=== Decision Tree ===
Confusion Matrix:
 [[368010    539]
 [   451   1479]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    368549
           1       0.73      0.77      0.75      1930

    accuracy                           1.00    370479
   macro avg       0.87      0.88      0.87    370479
weighted avg       1.00      1.00      1.00    370479

ROC-AUC: 0.8824293757129643

=== Random Forest ===
Confusion Matrix:
 [[368479     70]
 [   562   1368]]

Cl