In [None]:
import os
from datetime import datetime
from io import BytesIO

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
import boto3

In [None]:
# Read the silver-layer fraud/geo view through Spark, then convert it to a
# pandas DataFrame for the scikit-learn steps that follow.
# NOTE(review): toPandas() collects every row onto the driver — presumably the
# view is small enough for that; confirm.
source_query = "SELECT * FROM fraud_miner.silver.fraud_geo_view"
df_spark = spark.sql(source_query)
df = df_spark.toPandas()

In [None]:
# Feature engineering
# Add a 0/1 flag telling whether the geo-derived country equals the merchant's
# country, then select the model inputs (X) and the label column (y).
same_country = df["geo_country"].eq(df["Merchant_Country"])
df["geo_matches_merchant"] = same_country.astype(int)

target = "Transaction_Fraud"
features = [
    "Transaction_Amount",
    "Card_Provider",
    "Merchant_Category",
    "Merchant_Country",
    "geo_country",
    "geo_matches_merchant",
]

X = df[features]
y = df[target]

In [None]:
# Preprocessing
# Split the model inputs into numeric columns (standardized) and categorical
# columns (one-hot encoded).
numeric_features = ["Transaction_Amount", "geo_matches_merchant"]
# Keep the order in which the columns appear in `features`. A set difference
# yields an arbitrary order that can change between interpreter runs (string
# hash randomization), which would make the encoded column layout of the
# fitted model non-reproducible.
categorical_features = [name for name in features if name not in numeric_features]

numeric_transformer = StandardScaler()
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was removed in 1.4. Prefer the new name and fall back to the old one
# so the notebook runs on both older and newer scikit-learn releases.
try:
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)

In [None]:
# Route the numeric and categorical columns through their respective
# transformers, then feed the combined matrix to a logistic-regression model.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# Stratified K-Fold cross-validation
# Each fold keeps the label proportions of `y`, matching the stratified
# hold-out split used when the final model is fitted. Plain KFold can produce
# folds with very few (or no) positive examples when one class is rare, which
# makes per-fold scores noisy.
# NOTE(review): accuracy is kept because the results table stores an
# "accuracy" column; if fraud is rare, consider also tracking a metric that is
# sensitive to the minority class.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kf, scoring="accuracy")

# One timestamp for the whole run so every fold row carries the same run_date.
cv_run_date = datetime.now()
results_df = pd.DataFrame({
    "run_date": [cv_run_date] * len(scores),
    "fold": list(range(1, len(scores) + 1)),
    "accuracy": scores,
})

In [None]:
# Hold out a stratified test set (train_test_split's default size), fit the
# pipeline on the training part and record true vs. predicted labels for the
# held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

prediction_run_date = datetime.now()
preds_df = pd.DataFrame(
    {
        "run_date": [prediction_run_date] * len(y_test),
        "true_label": y_test.tolist(),
        "predicted_label": y_pred.tolist(),
    }
)

In [None]:
# Serialize the fitted pipeline to a scratch directory so it can be uploaded.
# NOTE(review): the path is plain /tmp rather than /dbfs/..., so this looks
# like local disk on the driver, not DBFS — confirm the intended location.
model_path = "/tmp/fraud_models/logreg_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

In [None]:
# Upload to S3
# Stream the serialized model file to the bucket/key below and report the
# resulting S3 URI.
bucket = "fraud-miner"
s3_key = "model/logreg_model.pkl"
s3 = boto3.client("s3")
with open(model_path, mode="rb") as model_file:
    s3.upload_fileobj(Fileobj=model_file, Bucket=bucket, Key=s3_key)
print(f"✅ Model uploaded to s3://{bucket}/{s3_key}")

In [None]:
# Save to Databricks tables
# Convert the pandas result frames to Spark DataFrames, make sure the gold
# schema exists, then append each frame to its table.
results_df_spark = spark.createDataFrame(results_df)
preds_df_spark = spark.createDataFrame(preds_df)

spark.sql("CREATE SCHEMA IF NOT EXISTS fraud_miner.gold")

for frame, table_name in (
    (results_df_spark, "fraud_miner.gold.model_evaluation"),
    (preds_df_spark, "fraud_miner.gold.model_predictions"),
):
    frame.write.mode("append").saveAsTable(table_name)

print("✅ Results written to Databricks tables")
