In [9]:
%pip install scikit-learn pandas joblib

import pandas as pd
import sys
import os

# Add project root (parent of src) to path
project_root = os.path.abspath('..')
if project_root not in sys.path:
	sys.path.append(project_root)

src_path = os.path.join(project_root, 'src')

# Debug: print sys.path and check if data_processing.py exists
print("sys.path:", sys.path)
print("data_processing.py exists:", os.path.isfile(os.path.join(src_path, "data_processing.py")))

from src.data_processing import preprocess_transaction_data, save_processed_data

# Load raw data
raw_data_path = os.path.join(os.path.dirname(src_path), 'data', 'raw', 'data.csv')
df = pd.read_csv(raw_data_path)

# Process data
processed = preprocess_transaction_data(df)

# Save processed data
save_processed_data(pd.DataFrame(processed), '../data/processed/processed_data.csv')
from src.predict import create_proxy_target

# --- Proxy target construction ---
# `df` is the raw transaction DataFrame loaded earlier in this cell (it is NOT
# the preprocessed feature matrix).
#
# No feature matrix is built here: the modelling code further down derives X
# from customer-level aggregates, so re-running preprocess_transaction_data(df)
# at this point would only repeat work whose result is discarded.

# Step 1: Create proxy high-risk labels.
# NOTE(review): presumably returns one row per CustomerId with an
# `is_high_risk` column - confirm against src/predict.py.
proxy_labels = create_proxy_target(df)

# Step 2: Build a one-row-per-customer frame and attach the proxy label.
# With how="left", any customer missing from proxy_labels is kept and receives
# NaN in the label columns.
df_customer = df.drop_duplicates("CustomerId")[["CustomerId"]].copy()
df_customer = df_customer.merge(proxy_labels, on="CustomerId", how="left")

# df_customer['is_high_risk'] is the target variable used for modelling.
print(df_customer.head())

# Persist the customer/label table (customer ids plus whatever columns
# proxy_labels contributed).
processed_data_path = '../data/processed/processed_data_with_labels.csv'
df_customer.to_csv(processed_data_path, index=False)
print(f"Processed data with proxy labels saved to {processed_data_path}")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# --- Customer-level model: features, training, evaluation ---

# Aggregate the transaction-level columns per customer so the feature table
# has the same granularity (one row per CustomerId) as df_customer.
customer_features = df.groupby("CustomerId").agg({
	"Amount": ["mean", "sum", "count"],
	"Value": ["mean", "sum"],
	"FraudResult": ["sum", "mean"],
	# Add more aggregations as needed
})

# agg() with lists of statistics yields (column, statistic) MultiIndex columns;
# flatten them to names such as "Amount_mean" and restore CustomerId as a column.
customer_features.columns = ['_'.join(col) for col in customer_features.columns]
customer_features = customer_features.reset_index()

# Left-merge onto df_customer so feature rows follow the label rows.
df_customer_features = df_customer.merge(customer_features, on="CustomerId", how="left")

# Features are every column except the customer id and the target.
X = df_customer_features.drop(columns=["CustomerId", "is_high_risk"]).values
y = df_customer_features["is_high_risk"].values

# Hold out 20% of customers for evaluation; fixed seed for reproducibility.
# NOTE(review): the split is not stratified; if is_high_risk is heavily
# imbalanced, consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple baseline classifier.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out customers and keep the text report so the same
# content can be both displayed and persisted.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Write the report to disk so the confirmation message below is true.
# NOTE(review): src/utils.py presumably offers save_evaluation_metrics, but its
# signature is not visible from this notebook, so the report is written
# directly - switch to the helper once its contract is confirmed.
evaluation_metrics_path = "evaluation_metrics.txt"
with open(evaluation_metrics_path, "w", encoding="utf-8") as metrics_file:
	metrics_file.write(report)
print(f"Evaluation metrics saved to {evaluation_metrics_path}")
	


Note: you may need to restart the kernel to use updated packages.
sys.path: ['c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\python313.zip', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\DLLs', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\Lib', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313', '', 'C:\\Users\\Yohanan\\AppData\\Roaming\\Python\\Python313\\site-packages', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\win32', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\Yohanan\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\Yohanan\\credit-risk-model']
data_processing.py exists: True
Processed data saved to ../data/processed/processed_data.csv
        CustomerId  is_high_risk
0  Cus