# In [None]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.model_selection import train_test_split

# --- Feature engineering in pandas ---
# Load the raw transactions and parse the two date columns. Values that cannot
# be parsed become NaT (errors='coerce') instead of raising.
data = pd.read_csv("fraudTrain.csv")
data['dob'] = pd.to_datetime(data['dob'], errors='coerce')
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], errors='coerce')

# Age in completed years at transaction time. A bare year difference is one
# too high for every transaction that happens before the birthday within that
# calendar year, so one year is taken off for those rows. `where` keeps the
# dtype of the year difference; rows with a NaT date compare False everywhere
# and therefore keep their NaN age.
_txn_parts = data['trans_date_trans_time'].dt
_dob_parts = data['dob'].dt
_year_diff = _txn_parts.year - _dob_parts.year
_before_birthday = (_txn_parts.month < _dob_parts.month) | (
    (_txn_parts.month == _dob_parts.month) & (_txn_parts.day < _dob_parts.day)
)
data['age'] = _year_diff.where(~_before_birthday, _year_diff - 1)

def age_group(age):
    """Map an age in years to a coarse life-stage label.

    Args:
        age: Age in years (int or float). May be missing (NaN / None / NaT).

    Returns:
        'Teen' (< 18), 'Young Adult' (< 25), 'Adult' (< 35),
        'Middle-aged' (< 50), 'Senior' (< 65), 'Elderly' (65 and above),
        or 'Unknown' when the age is missing.
    """
    # NaN compares False against every bound below, so without this guard a
    # missing age would fall through and be reported as 'Elderly'.
    if pd.isna(age):
        return 'Unknown'
    if age < 18:
        return 'Teen'
    if age < 25:
        return 'Young Adult'
    if age < 35:
        return 'Adult'
    if age < 50:
        return 'Middle-aged'
    if age < 65:
        return 'Senior'
    return 'Elderly'

# Bucket every age into its life-stage label.
data['age_group'] = data['age'].apply(age_group)

# Split the transaction timestamp into one column per calendar/clock component.
_timestamp_parts = data['trans_date_trans_time'].dt
for _part in ('day', 'month', 'year', 'hour', 'minute'):
    data[_part] = getattr(_timestamp_parts, _part)

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle distance between two points on a sphere.

    Works element-wise: each coordinate may be a scalar, a NumPy array or a
    pandas Series, as long as the four arguments broadcast together.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius_km: Radius of the sphere; the result is expressed in the same
            unit. Defaults to 6371, the Earth's mean radius in kilometres.

    Returns:
        The distance(s) along the sphere's surface, in the unit of
        ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (for
    # example for near-antipodal points); sqrt/arcsin would then yield NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Distance between the `lat`/`long` point and the merchant's coordinates for
# every row (presumably cardholder-to-merchant; confirm against the dataset).
data['distance_km'] = haversine(
    lat1=data['lat'],
    lon1=data['long'],
    lat2=data['merch_lat'],
    lon2=data['merch_long'],
)

def distance_category(dist):
    """Translate a distance into a proximity label.

    The label is the first band whose exclusive upper limit exceeds ``dist``:
    'Very Near' (< 0.5), 'Near' (< 3), 'Moderate' (< 15), 'Far' (< 50).
    Anything that matches no band is labelled 'Very Far'.
    """
    bands = (
        (0.5, 'Very Near'),
        (3, 'Near'),
        (15, 'Moderate'),
        (50, 'Far'),
    )
    for upper_limit, label in bands:
        if dist < upper_limit:
            return label
    return 'Very Far'

# Bucket every distance into its proximity label.
data['distance_group'] = data['distance_km'].apply(distance_category)

# Columns removed before modelling: the CSV's unnamed leading column, name and
# address fields, record identifiers, and the raw date/coordinate columns that
# the derived features above were computed from.
cols_to_drop = [
    'Unnamed: 0',
    'first',
    'last',
    'street',
    'city',
    'state',
    'zip',
    'dob',
    'cc_num',
    'trans_num',
    'unix_time',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
    'trans_date_trans_time',
]
data.drop(cols_to_drop, axis='columns', inplace=True)

# --- Sample and split ---
# Work on a reproducible 0.5% random sample of the rows.
sample_data = data.sample(frac=0.005, random_state=42)

# Pull the target out of `is_fraud` and re-attach it as a trailing `label`
# column, so features and target stay in a single frame.
y = sample_data['is_fraud']
X = sample_data.drop(columns=['is_fraud']).assign(label=y.values)

# Hold out 15% of the sampled rows for testing.
X_train, X_test = train_test_split(X, random_state=42, test_size=0.15)

# --- Spark session ---
spark = SparkSession.builder.getOrCreate()
train_df = spark.createDataFrame(X_train)
test_df = spark.createDataFrame(X_test)

# --- Identify columns ---
# Spark's simple type names for its integral and fractional column types.
# Integer columns coming from pandas typically arrive as "bigint", so matching
# only "int" and "double" would silently leave integer features (such as the
# calendar columns) out of the model.
numeric_spark_types = {"tinyint", "smallint", "int", "bigint", "float", "double"}

# String columns are treated as categorical features.
cat_cols = [field for (field, dtype) in train_df.dtypes if dtype == "string"]
# Every numeric column except the target is a numeric feature; decimal types
# are reported as "decimal(p,s)", hence the prefix test.
num_cols = [
    field
    for (field, dtype) in train_df.dtypes
    if field != "label"
    and (dtype in numeric_spark_types or dtype.startswith("decimal"))
]

# --- Pipeline stages ---
# One StringIndexer per categorical column, each writing to "<column>_idx";
# handleInvalid="keep" is set on every indexer.
indexers = []
for name in cat_cols:
    indexers.append(
        StringIndexer(inputCol=name, outputCol=name + "_idx", handleInvalid="keep")
    )

# Indexed categoricals first, then the numeric columns, assembled into a
# single vector that is scaled before it reaches the classifier.
assembler_inputs = [indexer.getOutputCol() for indexer in indexers] + num_cols
assembler = VectorAssembler(outputCol="features_vec", inputCols=assembler_inputs)
scaler = StandardScaler(outputCol="features", inputCol="features_vec")
gbt = GBTClassifier(labelCol="label", featuresCol="features")

stages = [*indexers, assembler, scaler, gbt]
pipeline = Pipeline(stages=stages)

# --- Train pipeline ---
pipeline_model = pipeline.fit(train_df)

# --- Evaluate ---
# Score the held-out rows, then compute three metrics with evaluators that
# differ only in their metric name.
predictions = pipeline_model.transform(test_df)

_evaluator_columns = {'labelCol': 'label', 'predictionCol': 'prediction'}
multi_evaluator = MulticlassClassificationEvaluator(metricName='accuracy', **_evaluator_columns)
recall_score_model = MulticlassClassificationEvaluator(metricName='weightedRecall', **_evaluator_columns)
precision_score_model = MulticlassClassificationEvaluator(metricName='weightedPrecision', **_evaluator_columns)

# NOTE(review): the "weighted" metrics are averages over all classes, not the
# precision/recall of the fraud class alone - confirm this is what is wanted.
accuracy, recall, precision = (
    evaluator.evaluate(predictions)
    for evaluator in (multi_evaluator, recall_score_model, precision_score_model)
)
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# --- Save pipeline model ---
# Replace any model left behind by an earlier run: a plain save() raises when
# the target path already exists, which would fail a re-run only after the
# whole training step has completed.
pipeline_model.write().overwrite().save("pipeline_model")