# Task 3 - Model Explainability with SHAP

This notebook uses SHAP's `TreeExplainer` to explain a Random Forest model. The force plot is excluded to avoid crashes.

In [1]:
!pip install shap imblearn scikit-learn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Imports
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Data

In [6]:
# Read the transactions, remove duplicate and incomplete rows, derive the
# time-based features, then one-hot encode the categorical columns.
fraud_data = (
    pd.read_csv('/content/drive/MyDrive/week 8/Data/Fraud_Data.csv')
    .drop_duplicates()
    .dropna()
    .assign(
        # Parse both timestamps first; the derived columns below depend on them.
        signup_time=lambda df: pd.to_datetime(df['signup_time']),
        purchase_time=lambda df: pd.to_datetime(df['purchase_time']),
        # Seconds elapsed between signing up and purchasing.
        time_since_signup=lambda df: (df['purchase_time'] - df['signup_time']).dt.total_seconds(),
        hour_of_day=lambda df: df['purchase_time'].dt.hour,
        day_of_week=lambda df: df['purchase_time'].dt.dayofweek,
    )
    # The raw timestamps are no longer needed once the features are derived.
    .drop(columns=['signup_time', 'purchase_time'])
)
fraud_data = pd.get_dummies(fraud_data, columns=['source', 'browser', 'sex'], drop_first=True)

## Train-Test Split and Model Training

In [9]:
# Target is the `class` column; features are everything else except `device_id`.
y = fraud_data['class']
X = fraud_data.drop(columns=['class', 'device_id'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training portion only; the test set is left untouched.
# NOTE(review): SMOTE runs on the unscaled features here, so its neighbour search
# may be dominated by large-magnitude columns - consider scaling first; confirm.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply it to both sets.
scaler = StandardScaler().fit(X_res)
X_train_scaled = scaler.transform(X_res)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames (with the original column names) for SHAP.
feature_names = X.columns
X_train_df, X_test_df = (
    pd.DataFrame(scaled, columns=feature_names)
    for scaled in (X_train_scaled, X_test_scaled)
)

# Train the forest on the resampled data and report hold-out metrics.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_df, y_res)

y_pred = model.predict(X_test_df)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     27393
           1       0.63      0.55      0.58      2830

    accuracy                           0.93     30223
   macro avg       0.79      0.76      0.77     30223
weighted avg       0.92      0.93      0.92     30223



## SHAP Global Explanation (Summary Plot Only)

In [None]:
# Explain only the rows that are plotted: summary_plot needs the SHAP matrix and the
# feature matrix to have the same number of rows, and running TreeExplainer over the
# whole test set is slow for a 100-tree forest.
X_explain = X_test_df.iloc[:200]

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_explain)

# Pick the SHAP values for class label 1. Depending on the SHAP version, a binary
# classifier yields either a list with one (n_samples, n_features) array per class
# or a single (n_samples, n_features, n_classes) array.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_class1 = np.asarray(shap_values)
    if shap_values_class1.ndim == 3:
        shap_values_class1 = shap_values_class1[:, :, 1]

shap.summary_plot(shap_values_class1, X_explain)

## Notes
- The SHAP summary plot shows the most influential features globally.
- The force plot is skipped to avoid compatibility and performance issues in local environments.