# EDA notebook

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from pandas.plotting import scatter_matrix
from modules.data.raw_data_handler import RawDataHandler
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'ace_tools'

In [None]:
# In-memory record of everything passed to log(), in call order
log_messages = []

def log(msg):
    """Echo ``msg`` to stdout and keep it in ``log_messages``.

    The object is stored as passed (no string conversion), so lists and
    other non-string values are retained unchanged.
    """
    print(msg)
    log_messages.append(msg)

In [None]:
import os

# Path of the CSV to analyse, relative to the working directory.
# NOTE: despite its name, dataset_dir holds the path of the file itself.
dataset_dir = os.path.join(
    "storage",
    "datasets",
    "dataset_stratified_SMOTE_50_20250308073917.csv",
)

# Read the CSV; a missing file is logged and then re-raised so the run stops here.
try:
    cleaned_data = pd.read_csv(dataset_dir)
except FileNotFoundError:
    log(f"Error: File not found at {dataset_dir}. Please verify the file location.")
    raise
else:
    log("Dataset loaded successfully using the corrected path.")




In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt




# Fail fast when any column needed by the feature engineering below is absent
required_columns = {'unix_time', 'amt', 'category', 'hour', 'fraud_label'}
missing_columns = required_columns - set(cleaned_data.columns)
if len(missing_columns) > 0:
    missing_message = f"Missing required columns: {missing_columns}"
    log(f"Error: {missing_message}")
    raise ValueError(missing_message)

for summary_line in (
    f"Dataset shape: {cleaned_data.shape}",
    f"Columns present: {list(cleaned_data.columns)}",
):
    log(summary_line)

# Interpret unix_time as seconds since the epoch
cleaned_data['datetime'] = pd.to_datetime(cleaned_data['unix_time'], unit='s')
log("Converted unix_time to datetime.")

# Standardise amt over the whole dataset
amount_z_scores = stats.zscore(cleaned_data['amt'])
cleaned_data['z_score_transaction_amount'] = amount_z_scores
log("Computed Z-score for transaction amount.")

# The rolling transaction counts rely on np.searchsorted, which needs sorted input
log("Fixing transaction count computations...")

# Put the rows in chronological order
cleaned_data = cleaned_data.sort_values(by='datetime')

# Integer view of the datetimes reduced to whole seconds
# (assumes nanosecond resolution, hence the division by 10**9)
datetime_as_int = cleaned_data['datetime'].astype(np.int64)
timestamps = datetime_as_int // 10**9

# Count, for every entry, the earlier entries that fall inside a trailing window
def count_past_transactions(time_series, window_seconds):
    """Return, per position, how many earlier entries lie within the window.

    For position ``i`` the result is the number of entries at positions
    ``< i`` whose value is ``>= time_series[i] - window_seconds``. The entry
    itself is not counted; earlier entries with the same timestamp are.

    ``time_series`` must be sorted ascending, as required by
    ``np.searchsorted``.
    """
    window_starts = time_series - window_seconds
    first_inside = np.searchsorted(time_series, window_starts, side='left')
    positions = np.arange(len(time_series))
    return positions - first_inside

# Trailing-window transaction counts: one output column per window length (seconds).
# The counts span all rows of the dataset, since `timestamps` is not grouped.
for window_column, window_length in (
    ('transactions_in_last_1hr', 60 * 60),
    ('transactions_in_last_24hrs', 24 * 60 * 60),
    ('transactions_in_last_7_days', 7 * 24 * 60 * 60),
):
    cleaned_data[window_column] = count_past_transactions(timestamps, window_length)

log("Recomputed transactions in the last 1 hour, 24 hours, and 7 days.")

# Preview the dataset with the features engineered so far.
# `ace_tools` cannot be imported in this environment (ModuleNotFoundError),
# so a plain head() preview replaces tools.display_dataframe_to_user().
print("Updated Fraud Detection Features")
print(cleaned_data.head())

# Visualize how each feature isolates fraud transactions
features = [
    'z_score_transaction_amount', 'transactions_in_last_1hr', 'transactions_in_last_24hrs',
    'transactions_in_last_7_days', 'uncommon_category', 'is_night_transaction',
    'avg_time_between_transactions_last_hour', 'kl_divergence_spending_patterns'
]

# Some of these columns are only created later in this cell; indexing them
# here raised KeyError, so only the columns that already exist are plotted.
available_features = [name for name in features if name in cleaned_data.columns]
not_yet_computed = [name for name in features if name not in cleaned_data.columns]
if not_yet_computed:
    log(f"Skipping features that are not computed yet: {not_yet_computed}")

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
legit_rows = cleaned_data[cleaned_data['fraud_label'] == 0]
fraud_rows = cleaned_data[cleaned_data['fraud_label'] == 1]

for ax, feature in zip(axes.flatten(), available_features):
    # dropna(): an all-NaN column would make the histogram range undefined
    ax.hist(legit_rows[feature].dropna(), bins=30, alpha=0.5, label='Legit', density=True)
    ax.hist(fraud_rows[feature].dropna(), bins=30, alpha=0.5, label='Fraud', density=True)
    ax.set_title(feature)
    ax.legend()

# The 3x3 grid has more panels than features; hide the ones left empty
for unused_ax in axes.flatten()[len(available_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

log("Feature distributions successfully plotted.")


# Flag rows whose category is rare.
# category_counts holds, for every row, the number of rows sharing its category;
# a row is flagged when that count is below the 25th percentile of these
# per-row counts. NOTE: the counts are dataset-wide, not per user.
category_counts = cleaned_data.groupby('category')['category'].transform('count')
cleaned_data['uncommon_category'] = (category_counts < category_counts.quantile(0.25)).astype(int)
log("Flagged uncommon categories for users.")

# Flag night transactions (hour in [0, 6))
cleaned_data['is_night_transaction'] = ((cleaned_data['hour'] >= 0) & (cleaned_data['hour'] < 6)).astype(int)
log("Flagged night transactions.")


def _mean_gap_last_hour(group_times):
    """Mean gap in seconds between consecutive transactions over the trailing hour.

    ``group_times`` is the ``datetime`` column of one category and must be in
    ascending order (the frame is sorted by ``datetime`` earlier in this cell).
    Returns a float Series aligned to ``group_times.index``; the first row of
    a group has no previous transaction, so its value is NaN unless other
    gaps fall inside its window.
    """
    gaps = group_times.diff().dt.total_seconds()
    # Index the gaps by timestamp so rolling() can use a time-based window;
    # the previous rolling(60) averaged the last 60 rows, not the last hour.
    gaps_by_time = pd.Series(gaps.to_numpy(), index=pd.DatetimeIndex(group_times.to_numpy()))
    hourly_mean = gaps_by_time.rolling(pd.Timedelta(hours=1)).mean()
    return pd.Series(hourly_mean.to_numpy(), index=group_times.index)


# Compute average time between transactions in last hour (per category)
cleaned_data['avg_time_between_transactions_last_hour'] = (
    cleaned_data.groupby('category')['datetime'].transform(_mean_gap_last_hour)
)
log("Computed average time between transactions in the last hour.")

# Compute KL Divergence on Spending Patterns
def compute_kl_divergence(data):
    """KL divergence between a group's amounts and the same amounts lagged by one row.

    Parameters
    ----------
    data : pandas.Series
        Transaction amounts of one group. NaN values are ignored.

    Returns
    -------
    float
        ``scipy.stats.entropy(recent, past)`` computed on 10-bin histograms,
        or NaN when fewer than two non-NaN amounts are available.

    Both histograms are built on the same bin edges. Previously each call to
    ``np.histogram`` picked its own range, so bin ``k`` of one distribution
    did not cover the same amounts as bin ``k`` of the other.
    """
    amounts = data.dropna()  # np.histogram raises on NaN input
    past_amounts = amounts.shift(1).dropna()
    if past_amounts.empty:
        return np.nan
    # Edges come from the full sample, whose range contains past_amounts
    recent_density, bin_edges = np.histogram(amounts, bins=10, density=True)
    past_density = np.histogram(past_amounts, bins=bin_edges, density=True)[0]
    recent_dist = recent_density + 1e-9  # Avoid zero probabilities
    past_dist = past_density + 1e-9
    return stats.entropy(recent_dist, past_dist)

# One KL value per category, broadcast to every row of that category
cleaned_data['kl_divergence_spending_patterns'] = cleaned_data.groupby('category')['amt'].transform(compute_kl_divergence)
log("Computed KL divergence on spending patterns.")

# Preview the dataset with the engineered features.
# `ace_tools` cannot be imported in this environment (ModuleNotFoundError),
# so a plain head() preview replaces tools.display_dataframe_to_user().
print("Engineered Features for Fraud Detection")
print(cleaned_data.head())

# Visualize how each feature isolates fraud transactions
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
features = [
    'z_score_transaction_amount', 'transactions_in_last_1hr', 'transactions_in_last_24hrs',
    'transactions_in_last_7_days', 'uncommon_category', 'is_night_transaction',
    'avg_time_between_transactions_last_hour', 'kl_divergence_spending_patterns'
]

legit_rows = cleaned_data[cleaned_data['fraud_label'] == 0]
fraud_rows = cleaned_data[cleaned_data['fraud_label'] == 1]

for ax, feature in zip(axes.flatten(), features):
    # dropna(): an all-NaN column would make the histogram range undefined
    ax.hist(legit_rows[feature].dropna(), bins=30, alpha=0.5, label='Legit', density=True)
    ax.hist(fraud_rows[feature].dropna(), bins=30, alpha=0.5, label='Fraud', density=True)
    ax.set_title(feature)
    ax.legend()

# The 3x3 grid has nine panels for eight features; hide the unused one
for unused_ax in axes.flatten()[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Log visualization success
log("Feature distributions plotted successfully.")


In [None]:

def save_log(data, logtype="default", log_type=None):
    """Write ``data`` as indented JSON to ``logs/log_<type>_<timestamp>.txt``.

    Parameters
    ----------
    data : any
        Object to serialise. Values that JSON cannot encode are converted
        with ``str()`` (``default=str``).
    logtype : str, optional
        Label embedded in the file name; kept for backward compatibility.
    log_type : str, optional
        Preferred spelling of the same label. When given it takes
        precedence over ``logtype``.

    The ``logs`` directory is created if missing. The timestamp has
    one-second resolution, so two calls with the same label within the
    same second write to the same file.
    """
    # Imported here because the notebook never imports them at the top
    import datetime
    import json

    # The original body read `log_type` while the parameter was `logtype`,
    # which raised UnboundLocalError on every call; accept both spellings.
    if log_type is None:
        log_type = logtype

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    log_dir = os.path.join("logs")
    os.makedirs(log_dir, exist_ok=True)
    log_filename = os.path.join(log_dir, f"log_{log_type}_{timestamp}.txt")

    # default=str calls str() on any non-serializable object (e.g. DataFrames)
    log_str = json.dumps(data, indent=4, default=str)

    with open(log_filename, "w", encoding="utf-8") as f:
        f.write(log_str)

# Persist the messages collected by log() so far to a timestamped file under logs/
save_log(log_messages, log_type= "feature_engineering")

In [None]:

def verify_smote_distribution(df):
    """Log how many rows carry each ``fraud_label`` value and their share.

    One line is logged per distinct label, in ``value_counts()`` order,
    followed by a hint on how to request a different SMOTE ratio.
    """
    label_counts = df['fraud_label'].value_counts()
    n_labelled = label_counts.sum()

    log("Distribution of fraud labels after SMOTE:")
    for label, count in label_counts.items():
        share = count / n_labelled
        log(f"  fraud_label={label}: {count} rows ({share:.2%})")
    log("Use  /create_dataset?partition_type=stratified&minority_percent=75 to change the SMOTE distribution")
# Report the class balance of the loaded dataset
verify_smote_distribution(cleaned_data)
# --------------------------
# Feature Engineering
# --------------------------

# 1. Binary weekday/weekend indicator.
if "day_of_week" not in cleaned_data.columns:
    log("'day_of_week' column not found; cannot engineer 'is_weekend' feature.")
else:
    if pd.api.types.is_numeric_dtype(cleaned_data["day_of_week"]):
        # Numeric encoding is assumed to be 0=Monday ... 6=Sunday.
        weekend_values = [5, 6]
    else:
        # Otherwise the column is assumed to hold English day names.
        weekend_values = ["Saturday", "Sunday"]
    cleaned_data["is_weekend"] = cleaned_data["day_of_week"].apply(
        lambda day: 1 if day in weekend_values else 0
    )
    log("Engineered 'is_weekend' feature based on 'day_of_week'.")

# 2. Discretize 'hour' into four 6-hour bins: [0, 6), [6, 12), [12, 18), [18, 24).
if "hour" not in cleaned_data.columns:
    log("'hour' column not found; cannot engineer 'hour_bin' feature.")
else:
    bins = [0, 6, 12, 18, 24]
    labels = [f"{lower}-{upper - 1}" for lower, upper in zip(bins[:-1], bins[1:])]
    hour_categories = pd.cut(cleaned_data["hour"], bins=bins, right=False, labels=labels, include_lowest=True)
    # Store the integer category codes so the column can enter correlation analysis
    cleaned_data["hour_bin"] = hour_categories.cat.codes
    log("Engineered 'hour_bin' feature from 'hour'.")

# 3. Drop fraud records where city_pop is greater than 50,000
# NOTE(review): this filter depends on the target label; confirm it is
# intended, since it changes the class balance reported above.
if "city_pop" in cleaned_data.columns:
    before_drop = cleaned_data.shape[0]
    large_city_fraud = (cleaned_data["fraud_label"] == 1) & (cleaned_data["city_pop"] > 50000)
    # .copy() detaches the result from the original frame so the column
    # assignments further down do not raise SettingWithCopyWarning
    cleaned_data = cleaned_data[~large_city_fraud].copy()
    after_drop = cleaned_data.shape[0]
    log(f"Dropped {before_drop - after_drop} fraud records with city_pop > 50000.")
else:
    log("Column 'city_pop' not found in the dataset.")

# 4. Drop rows with amt > 1500 (rows with a missing amt are dropped as well)
# 5. Log-transform amt into a new log_amt column
if "amt" in cleaned_data.columns:
    before_drop = cleaned_data.shape[0]
    cleaned_data = cleaned_data[cleaned_data["amt"] <= 1500].copy()
    after_drop = cleaned_data.shape[0]
    log(f"\nDropped {before_drop - after_drop} rows where 'amt' > 1500.")

    # log1p(amt) = log(1 + amt), defined for amt == 0
    cleaned_data["log_amt"] = np.log1p(cleaned_data["amt"])
    log("Created 'log_amt' feature using logarithmic transformation.")
else:
    log("\nColumn 'amt' not found in the dataset. No rows dropped based on 'amt'.")
    log("'amt' column not found in the dataset.")


# log the remaining columns
log("Remaining columns in the dataset before trans_date_trans_time creates <5 min feature:")
log(list(cleaned_data.columns))




# Settings for log_amt_special: log(amt) values of interest and the
# half-width of the band around each of them
tolerance = 0.1
thresholds = [3.0, 5.6, 5.8, 6.6, 6.8]

def is_near_threshold(x, thresholds, tol):
    """Return True when ``x`` is strictly closer than ``tol`` to any threshold.

    An empty ``thresholds`` sequence, or a NaN ``x``, yields False.
    """
    for threshold in thresholds:
        if abs(x - threshold) < tol:
            return True
    return False

# 1. Binary flag: log(amt) lies within `tolerance` of one of `thresholds`
if "log_amt" not in cleaned_data.columns:
    log("'log_amt' column not found; cannot create 'log_amt_special' feature.")
else:
    cleaned_data["log_amt_special"] = cleaned_data["log_amt"].apply(
        lambda value: int(is_near_threshold(value, thresholds, tolerance))
    )
    log("Created 'log_amt_special' feature indicating if log(amt) is near one of the thresholds:")
    log(thresholds)



# 2. Binary flag: merchant transaction count lies in the closed range [1500, 2200]
if "merchant_txn_count" not in cleaned_data.columns:
    log("'merchant_txn_count' column not found; cannot create 'merchant_txn_range' feature.")
else:
    in_target_range = cleaned_data["merchant_txn_count"].between(1500, 2200)
    cleaned_data["merchant_txn_range"] = in_target_range.astype(int)
    log("Created 'merchant_txn_range' feature indicating if merchant_txn_count is between 1500 and 2200.")

# 3. Flag rows whose category is among the categories most frequent in fraud rows
# Number of top categories to consider as "popular"
num_top = 4

if "category" in cleaned_data.columns and "fraud_label" in cleaned_data.columns:
    # Filter only the fraud cases
    fraud_data = cleaned_data[cleaned_data["fraud_label"] == 1]

    # Categories ranked by how often they occur among fraud rows
    top_categories = fraud_data["category"].value_counts().head(num_top).index.tolist()

    # 1 when the row's category is one of the top fraud categories, else 0.
    # NOTE(review): the category list is derived from fraud_label over the
    # whole dataset, so this feature leaks label information unless it is
    # recomputed on training data only.
    cleaned_data["popular_category_fraud"] = cleaned_data["category"].isin(top_categories).astype(int)

    # log() instead of print() so the message also lands in log_messages,
    # like every other step in this notebook
    log(f"Created 'popular_category_fraud' feature using top {num_top} fraud categories: {top_categories}")
else:
    log("Required columns ('category', 'fraud_label') not found in the dataset.")




# log the columns present after feature engineering
log("Remaining columns in the dataset:")
log(list(cleaned_data.columns))

# --------------------------
# Plotting Engineered Features by Fraud Label
# --------------------------
import seaborn as sns
import matplotlib.pyplot as plt






# Stacked histogram of log_amt, split by fraud label
if "log_amt" in cleaned_data.columns:
    log_amt_fig, log_amt_ax = plt.subplots(figsize=(8, 6))
    sns.histplot(
        data=cleaned_data,
        x="log_amt",
        hue="fraud_label",
        bins=30,
        multiple="stack",
        ax=log_amt_ax,
    )
    log_amt_ax.set_title("Distribution of Log(amt) by Fraud Label")
    log_amt_ax.set_xlabel("Log(amt)")
    log_amt_ax.set_ylabel("Count")
    plt.show()




import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns





# Keep only the fraudulent transactions (fraud_label == 1)
fraud_data = cleaned_data[cleaned_data["fraud_label"] == 1]

# Histogram of log_amt restricted to the fraud rows
if "log_amt" in fraud_data.columns:
    fraud_amt_fig, fraud_amt_ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=fraud_data, x="log_amt", bins=30, ax=fraud_amt_ax)
    fraud_amt_ax.set_title("Distribution of Log(amt) for Fraudulent Transactions")
    fraud_amt_ax.set_xlabel("Log(amt)")
    fraud_amt_ax.set_ylabel("Count")
    plt.show()










# --------------------------
# Visualizations for Custom Features by Fraud Label
# --------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per log_amt_special value, split by fraud label
if "log_amt_special" in cleaned_data.columns:
    special_fig, special_ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x="log_amt_special", hue="fraud_label", data=cleaned_data, ax=special_ax)
    special_ax.set_title("Count of log_amt_special Feature by Fraud Label")
    special_ax.set_xlabel("log_amt_special (0: not near threshold, 1: near threshold)")
    special_ax.set_ylabel("Count")
    special_fig.tight_layout()
    plt.show()











In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Target is fraud_label; every other column is a candidate feature
y = cleaned_data["fraud_label"]
X = cleaned_data.drop(columns=["fraud_label"])

# Report which candidate columns are not numeric
non_numeric_cols = list(X.select_dtypes(exclude=[np.number]).columns)
log("Non-numeric columns in X:")
log(non_numeric_cols)

# Only the numeric columns are kept for model fitting
X_numeric = X.select_dtypes(include=[np.number])
log("Using numeric features:")
log(list(X_numeric.columns))


from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Step 1: Train a Random Forest and get feature importances
#rf = RandomForestClassifier(n_estimators=100, random_state=42)
#rf.fit(X_numeric, y.loc[X_numeric.index])
#importances = pd.Series(rf.feature_importances_, index=X_numeric.columns)
#log("Random Forest Feature Importances:")
#log(importances.sort_values(ascending=False))

# Step 2: Use RFE with a Random Forest estimator to select features
#rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=10)
#rfe.fit(X_numeric, y.loc[X_numeric.index])
#selected_features = X_numeric.columns[rfe.support_]
#log("Selected features via RFE:")
#log(selected_features)




# --------------------------
# Additional Feature Engineering for Selection Methods
# --------------------------
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from functools import partial

# Find and log columns that have non-numeric values
non_numeric_cols = cleaned_data.select_dtypes(exclude=[np.number]).columns.tolist()
log("Columns with non-numeric values:")
log(non_numeric_cols)


# Separate features and target from the full dataset (for selection purposes)
X = cleaned_data.drop("fraud_label", axis=1)
y = cleaned_data["fraud_label"]

# Select only numeric features to avoid type promotion issues (drop datetime and non-numeric columns)
X_numeric = X.select_dtypes(include=[np.number])

# mutual_info_classif rejects NaN/inf, and several engineered columns contain
# NaN (e.g. rolling means, KL divergence of single-row groups). Columns that
# are entirely missing are removed first so they do not wipe out every row.
X_numeric = X_numeric.replace([np.inf, -np.inf], np.nan)
all_nan_cols = X_numeric.columns[X_numeric.isna().all()].tolist()
if all_nan_cols:
    log(f"Dropping numeric columns with no valid values: {all_nan_cols}")
    X_numeric = X_numeric.drop(columns=all_nan_cols)

# Keep only complete rows, and select the same rows of the target
complete_rows = X_numeric.notna().all(axis=1)
dropped_rows = int((~complete_rows).sum())
if dropped_rows:
    log(f"Dropping {dropped_rows} rows with missing numeric values before scoring.")
X_numeric = X_numeric[complete_rows]
y_numeric = y[complete_rows]

log("Numeric features used for mutual information:")
log(X_numeric.columns.tolist())

# Use mutual information to score features using only numeric features.
# The estimator is randomised; a fixed random_state makes the scores repeatable.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k='all')
selector.fit(X_numeric, y_numeric)
mi_scores = pd.Series(selector.scores_, index=X_numeric.columns)
log("Mutual Information Scores (Numeric Columns Only):")
log(mi_scores.sort_values(ascending=False))
