In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
import lime
import lime.lime_tabular

In [13]:
# Read the three raw CSV inputs. The generator is consumed by the tuple
# unpacking, so the files are read in the order the paths are listed.
fraud_data, ip_data, credit_card_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
        '../data/creditcard.csv',
    )
)

In [14]:
# Show the freshly loaded fraud frame as this cell's output (the notebook
# renders a truncated head/tail preview of the whole frame).
fraud_data

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [15]:
# Parse both timestamp columns from strings into pandas datetimes, in place
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])


In [16]:

# Derive hour-of-day and day-of-week features from both timestamps, then
# discard the raw timestamp columns in the same chained expression.
fraud_data = fraud_data.assign(
    signup_hour=fraud_data['signup_time'].dt.hour,
    signup_day=fraud_data['signup_time'].dt.dayofweek,
    purchase_hour=fraud_data['purchase_time'].dt.hour,
    purchase_day=fraud_data['purchase_time'].dt.dayofweek,
).drop(columns=['signup_time', 'purchase_time'])

In [17]:
# Fraud_Data.csv: the label is the 'class' column, the features are every
# other column of the frame.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)


In [18]:
# creditcard.csv: the label is the 'Class' column, the features are every
# other column of the frame.
y_cc = credit_card_data['Class']
X_cc = credit_card_data.drop('Class', axis=1)

In [19]:
# Hold out 20% of each dataset for testing. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.2,
    stratify=y_fraud,
    random_state=42,
)
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_cc,
    y_cc,
    test_size=0.2,
    stratify=y_cc,
    random_state=42,
)


In [20]:
# Convert datetime columns if any
def _expand_transaction_date(frame, column='TransactionDate'):
    """Return `frame` with `column` replaced by year/month/day/hour columns.

    Frames that do not contain `column` are returned unchanged. A new frame is
    built with `assign`, so the frame passed in is never modified.
    """
    if column not in frame.columns:
        return frame
    stamps = pd.to_datetime(frame[column])
    return frame.assign(
        TransactionYear=stamps.dt.year,
        TransactionMonth=stamps.dt.month,
        TransactionDay=stamps.dt.day,
        TransactionHour=stamps.dt.hour,
    ).drop(columns=column)


# Apply the same expansion to BOTH splits so their columns stay aligned
# (previously only the training split was handled).
X_train_fraud = _expand_transaction_date(X_train_fraud)
X_test_fraud = _expand_transaction_date(X_test_fraud)

# Select every numeric column for scaling. An explicit ['float64', 'int64']
# list silently skips other integer widths such as the int32 columns that
# `.dt.hour` / `.dt.dayofweek` produce.
numeric_cols_fraud = X_train_fraud.select_dtypes(include='number').columns
numeric_cols_cc = X_train_cc.select_dtypes(include='number').columns


In [21]:

# Feature scaling (standardization)
# Each dataset gets its own scaler. Refitting a single shared instance on the
# credit-card data would throw away the statistics learned from the fraud
# data, leaving no fitted scaler for later fraud inputs.
scaler_fraud = StandardScaler()
X_train_fraud_scaled = scaler_fraud.fit_transform(X_train_fraud[numeric_cols_fraud])
# The test split is only transformed, using the statistics fitted on train
X_test_fraud_scaled = scaler_fraud.transform(X_test_fraud[numeric_cols_fraud])

scaler_cc = StandardScaler()
X_train_cc_scaled = scaler_cc.fit_transform(X_train_cc[numeric_cols_cc])
X_test_cc_scaled = scaler_cc.transform(X_test_cc[numeric_cols_cc])

# Backward-compatible alias: the former shared `scaler` ended this cell fitted
# on the credit-card data.
scaler = scaler_cc


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow
import mlflow.sklearn

In [33]:
# Initialize models
# Every estimator is seeded (same seed as the train/test splits) so that a
# fresh "Restart & Run All" reproduces the same fitted models.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=500, random_state=42)
}

In [34]:
# Print the dtype of every training feature column.
# NOTE(review): the recorded output lists device_id as int32 although the raw
# CSV preview shows string ids and no visible cell converts them; presumably
# a cell that is no longer in the notebook did so. Confirm on a fresh kernel.
print(X_train_fraud.dtypes)

user_id             int64
purchase_value      int64
device_id           int32
source             object
browser            object
sex                object
age                 int64
ip_address        float64
signup_hour         int32
signup_day          int32
purchase_hour       int32
purchase_day        int32
dtype: object


In [35]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer coding of 'sex' from the training split only, then apply
# that same fitted mapping to both splits.
label_encoder = LabelEncoder().fit(X_train_fraud['sex'])
X_train_fraud['sex'] = label_encoder.transform(X_train_fraud['sex'])
X_test_fraud['sex'] = label_encoder.transform(X_test_fraud['sex'])

In [36]:
# Frequency encoding for 'device_id', 'source', and 'browser'
for col in ['device_id', 'source', 'browser']:
    # Relative frequency of each category, learned from the training split only
    freq_encoding = X_train_fraud[col].value_counts(normalize=True)
    X_train_fraud[col] = X_train_fraud[col].map(freq_encoding)
    # Categories that never occur in training have no entry in the mapping and
    # would come back as NaN; encode them with frequency 0 instead.
    X_test_fraud[col] = X_test_fraud[col].map(freq_encoding).fillna(0.0)


In [37]:
from category_encoders import TargetEncoder

# Target encoding of 'device_id', 'source' and 'browser': the encoder is
# fitted on the training split together with its labels, and the test split
# is only transformed with the fitted encoder.
# NOTE(review): the preceding cell already replaced these three columns with
# their frequency codes, so the encoder receives numeric values here. Confirm
# that stacking both encodings is intended.
target_encoder = TargetEncoder(cols=['device_id', 'source', 'browser'])
X_train_fraud, X_test_fraud = (
    target_encoder.fit_transform(X_train_fraud, y_train_fraud),
    target_encoder.transform(X_test_fraud),
)


In [38]:
# Encode only the top N frequent categories and group others
N = 10  # Choose the number of top categories to keep

for col in ['device_id', 'source', 'browser']:
    # The top-N set is learned from the training split and applied to both
    top_categories = X_train_fraud[col].value_counts().nlargest(N).index
    X_train_fraud[col] = X_train_fraud[col].where(X_train_fraud[col].isin(top_categories), other='Other')
    X_test_fraud[col] = X_test_fraud[col].where(X_test_fraud[col].isin(top_categories), other='Other')

# Then, apply one-hot encoding after reducing categories.
# The training split defines the dummy columns (first level dropped).
X_train_fraud = pd.get_dummies(X_train_fraud, columns=['device_id', 'source', 'browser'], drop_first=True)

# The test split is expanded WITHOUT drop_first and then aligned to the
# training columns. Encoding it independently with drop_first=True can drop a
# different level, or miss levels, leaving columns that do not line up with
# what the model was trained on. Levels missing from test become all-zero
# columns; levels not kept by train are discarded.
X_test_fraud = pd.get_dummies(X_test_fraud, columns=['device_id', 'source', 'browser'])
X_test_fraud = X_test_fraud.reindex(columns=X_train_fraud.columns, fill_value=0)



In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Dropout, Flatten


In [40]:
# Convolutional Neural Network (CNN)
def build_cnn_model(input_shape):
    """Build and compile a small 1-D convolutional binary classifier.

    Args:
        input_shape: Shape of one sample, forwarded to the Conv1D layer's
            `input_shape` argument.

    Returns:
        A compiled Sequential model: Conv1D(64, kernel 3, ReLU) -> Flatten ->
        Dense(64, ReLU) -> Dense(1, sigmoid), using the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    layer_stack = [
        Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
        # Collapse the convolution output to a vector for the dense head
        Flatten(),
        Dense(64, activation='relu'),
        # Single sigmoid unit: one probability for the binary target
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [46]:
def _as_conv1d_input(features):
    """Return `features` as a float32 array shaped (samples, features, 1).

    Accepts a DataFrame or an ndarray. An array that already has the trailing
    axis is returned with its shape unchanged, so calling this again on its
    own output is harmless.
    """
    array = np.asarray(features, dtype='float32')
    if array.ndim == 2:
        array = array[..., np.newaxis]
    return array


# Labels as float32 NumPy vectors
y_train_fraud = np.asarray(y_train_fraud, dtype='float32')
y_test_fraud = np.asarray(y_test_fraud, dtype='float32')

# Features as float32 arrays with a trailing length-1 axis for Conv1D.
# The previous version called `.values` on names it had just rebound to
# ndarrays, so re-running the cell raised AttributeError; this form can be
# re-run safely.
X_train_fraud = _as_conv1d_input(X_train_fraud)
X_test_fraud = _as_conv1d_input(X_test_fraud)

# The *_reshaped names are kept for the cells that still refer to them; they
# hold the same arrays (the old code computed the identical reshape twice).
X_train_fraud_reshaped = X_train_fraud
X_test_fraud_reshaped = X_test_fraud

# Build the CNN model. Training is left disabled, so `cnn_model` is untrained.
cnn_model = build_cnn_model((X_train_fraud.shape[1], 1))
# cnn_model.fit(X_train_fraud_reshaped, y_train_fraud, epochs=10, batch_size=64)

In [43]:
import shap


In [58]:
# Flatten the CNN input to 2-D (one row per sample) and summarise it with
# shap.kmeans (k=100) to obtain a small SHAP background set.
input_shape = cnn_model.input_shape  # first entry is the batch dimension

# Only rank-3 and rank-4 input shapes are handled; reject anything else first
if len(input_shape) not in (3, 4):
    raise ValueError("Unexpected input shape: ensure it is (None, height, width) or (None, height, width, channels)")

# The names height/width/channels are kept from the image-style layout this
# cell was written for. A rank-3 shape has no channel entry, so channels
# falls back to 1.
_, height, width = input_shape[:3]
channels = input_shape[3] if len(input_shape) == 4 else 1

# One flat row of height * width * channels values per sample
X_train_fraud_reshaped = X_train_fraud.reshape(-1, height * width * channels)

# Create background for SHAP
background = shap.kmeans(X_train_fraud_reshaped, 100)


In [65]:
# Sanity check: report the shape of the CNN-ready test array.
print("X_test_fraud shape:", X_test_fraud.shape)


X_test_fraud shape: (30223, 15, 1)


In [74]:
# Sanity check: report the shapes of the reshaped test array and of the SHAP
# background object.
# NOTE(review): the output recorded for this cell shows a 4-D test array and
# a 25-wide background, which does not agree with the (30223, 15, 1) shape
# printed by the previous cell. The outputs look stale; re-run from a fresh
# kernel to confirm.
print("X_test_fraud_reshaped shape:", X_test_fraud_reshaped.shape)
print("Background shape:", background.shape)


X_test_fraud_reshaped shape: (30223, 15, 1, 1)
Background shape: (100, 25, 1)


In [80]:
# For fraud_data: 'class' is the label, every other column is a feature
fraud_X = fraud_data.drop(columns=['class'])
fraud_y = fraud_data['class']

# Split fraud_data (70/30). The split is stratified on the label, as the
# earlier train/test split of the same data already is, so both parts keep
# the same fraud/non-fraud ratio.
fraud_X_train, fraud_X_test, fraud_y_train, fraud_y_test = train_test_split(
    fraud_X, fraud_y, test_size=0.3, random_state=42, stratify=fraud_y
)


In [81]:
# Define preprocessing for numerical and categorical features
numeric_features = ['purchase_value', 'age']  # Example numeric features
categorical_features = ['source', 'browser', 'sex', 'signup_hour', 'signup_day', 'purchase_hour', 'purchase_day']

# Columns not listed in either group are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric columns
        ('num', StandardScaler(), numeric_features),
        # One-hot encode the categorical columns. handle_unknown='ignore'
        # encodes a category that was not seen during fit as all zeros
        # instead of raising at transform/predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


In [82]:
# Random Forest pipeline for the fraud data: preprocessing and the classifier
# are fitted together on the training split. The forest is seeded (same seed
# as the train/test splits) so that re-running the notebook reproduces the
# same model and therefore the same SHAP values.
rf_pipeline_fraud = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
rf_pipeline_fraud.fit(fraud_X_train, fraud_y_train)

In [83]:
import shap
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [84]:
# Run the test features through the pipeline's already-fitted preprocessing
# step only (the classifier step is not involved here).
fraud_X_test_preprocessed = rf_pipeline_fraud['preprocessor'].transform(fraud_X_test)


In [85]:
# Densify the preprocessed matrix when it came back as a SciPy CSR matrix;
# any other type is kept exactly as it is.
fraud_X_test_preprocessed = (
    fraud_X_test_preprocessed.toarray()
    if isinstance(fraud_X_test_preprocessed, csr_matrix)
    else fraud_X_test_preprocessed
)

In [86]:
# Wrap the preprocessed matrix in a DataFrame (default integer column labels)
# so the dtype of each column can be inspected.
fraud_X_test_preprocessed_df = pd.DataFrame(data=fraud_X_test_preprocessed)
print(fraud_X_test_preprocessed_df.dtypes)

0     float64
1     float64
2     float64
3     float64
4     float64
       ...   
69    float64
70    float64
71    float64
72    float64
73    float64
Length: 74, dtype: object


In [87]:
# Back to a plain NumPy array, copied and cast to float64
fraud_X_test_preprocessed = fraud_X_test_preprocessed_df.to_numpy(dtype=np.float64, copy=True)


In [88]:
# SHAP explainability: build the explainer on the pipeline's fitted classifier
# step alone, so it must be fed data that has already been preprocessed.
explainer_fraud = shap.Explainer(rf_pipeline_fraud['classifier'])


In [None]:
# Generate SHAP values by calling the explainer on the dense float64 test
# matrix produced by the preprocessing cells.
# NOTE(review): this cell has no execution count in the saved notebook, so no
# run of it is recorded. It may be slow on the full test set; confirm.
shap_values_fraud = explainer_fraud(fraud_X_test_preprocessed)