In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2
import xgboost as xgb

#=====================
# Helper Functions
#=====================
def calculate_age(dob_str, transaction_date):
    """Return the age in completed years on the transaction date.

    Parameters
    ----------
    dob_str : str
        Date of birth formatted as 'YYYY-MM-DD'.
    transaction_date : datetime-like
        Object exposing ``year``, ``month`` and ``day`` attributes
        (e.g. ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns
    -------
    int
        Number of birthdays that have occurred on or before
        ``transaction_date``.

    Raises
    ------
    ValueError
        If ``dob_str`` does not match the '%Y-%m-%d' format.
    """
    dob = datetime.strptime(dob_str, '%Y-%m-%d')
    # Compare calendar fields rather than dividing elapsed days by 365:
    # leap days accumulate, so the day-count approach reports a birthday a few
    # days early (e.g. born 2000-01-01, seen 2019-12-28 -> 20 instead of 19).
    birthday_pending = (transaction_date.month, transaction_date.day) < (dob.month, dob.day)
    return transaction_date.year - dob.year - int(birthday_pending)

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere of radius 6371 km. NaN inputs yield NaN.
    """
    R = 6371.0  # radius of Earth in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    # For (near-)antipodal points rounding can leave `a` a hair above 1, which
    # would make sqrt(1-a) raise ValueError. Clamp with a comparison (not
    # min/max) so that a NaN `a` is left untouched and still propagates.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

#=====================
# Configuration
#=====================
# Single place to point the notebook at a different data folder.
DATA_DIR = '/content/drive/MyDrive/cs506'


#=====================
# Shared preprocessing
#=====================
def _engineer_features(raw):
    """Return a copy of ``raw`` with the model's engineered columns added.

    The same function is applied to the training and the test frame so the two
    can never be preprocessed differently.

    Reads the columns ``trans_date``, ``trans_time``, ``dob``, ``lat``,
    ``long``, ``merch_lat``, ``merch_long`` and ``gender``; adds
    ``trans_hour``, ``trans_day``, ``trans_month``, ``age`` and ``distance``;
    overwrites ``trans_date`` (parsed to datetime) and ``gender`` (numeric).
    The one-hot encoding of ``category`` is done by the caller because the set
    of dummy columns has to come from the training data.
    """
    frame = raw.copy()

    # Convert dates
    frame['trans_date'] = pd.to_datetime(frame['trans_date'])

    # Hour is the text before the first ':' of trans_time
    frame['trans_hour'] = frame['trans_time'].str.split(':').str[0].astype(int)
    frame['trans_day'] = frame['trans_date'].dt.day
    frame['trans_month'] = frame['trans_date'].dt.month

    # Iterate the needed columns directly: DataFrame.apply(axis=1) builds a
    # Series object per row and is far slower for the same result.
    frame['age'] = [
        calculate_age(dob, trans_date)
        for dob, trans_date in zip(frame['dob'], frame['trans_date'])
    ]
    frame['distance'] = [
        haversine(lat, lon, merch_lat, merch_lon)
        for lat, lon, merch_lat, merch_lon in zip(
            frame['lat'], frame['long'], frame['merch_lat'], frame['merch_long']
        )
    ]

    # Anything other than 'M'/'F' (including missing values) becomes -1
    frame['gender'] = frame['gender'].map({'M':0, 'F':1}).fillna(-1)
    return frame


#=====================
# Load Data
#=====================
data = _engineer_features(pd.read_csv(f'{DATA_DIR}/train.csv'))

# Encode category; the dummy columns seen here define the model's encoding
category_dummies = pd.get_dummies(data['category'], prefix='cat')
data = pd.concat([data, category_dummies], axis=1)

# Features and Target
cat_features = [col for col in data.columns if col.startswith('cat_')]
feature_cols = ['amt', 'trans_hour', 'trans_day', 'trans_month', 'distance', 'age', 'city_pop', 'gender'] + cat_features
target = 'is_fraud'

X = data[feature_cols]
y = data[target]

# Split data (stratified so both parts keep the same fraud rate)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

#=====================
# Final Model using Best Parameters
#=====================
# NOTE(review): presumably the outcome of an earlier tuning run that is not
# part of this cell -- confirm where these values came from.
best_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'learning_rate': 0.25,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'min_child_weight': 1,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.5,
    'scale_pos_weight': 1
}

final_xgb = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    **best_params
)

final_xgb.fit(X_train, y_train)

# Evaluate on validation set
y_pred_val = final_xgb.predict(X_val)
f1_final = f1_score(y_val, y_pred_val)
print("Final Model F1-score:", f"{f1_final:.4f}")

#=====================
# Predict on test data
#=====================
# The test frame goes through exactly the same preprocessing as training.
test_data = _engineer_features(pd.read_csv(f'{DATA_DIR}/test.csv'))

# One-hot encode category using the training columns: categories missing from
# the test set become all-zero columns, categories unseen in training are
# dropped because the model has no feature for them.
test_cat_dummies = (
    pd.get_dummies(test_data['category'], prefix='cat')
    .reindex(columns=cat_features, fill_value=0)
)
test_data = pd.concat([test_data, test_cat_dummies], axis=1)

# Ensure test_data has the same feature columns as training
# NOTE(review): this zero-fills any other feature column absent from the test
# file (e.g. 'amt'), which would silently degrade predictions -- consider
# raising instead.
missing_cols = set(feature_cols) - set(test_data.columns)
for c in missing_cols:
    test_data[c] = 0

test_X = test_data[feature_cols]

# Predict on test set
test_preds = final_xgb.predict(test_X)

# Prepare submission
submission = pd.DataFrame({
    'id': test_data['id'],
    'is_fraud': test_preds
})



Final Model F1-score: 0.9796


In [39]:
# Write the id / is_fraud predictions to Drive, leaving out the DataFrame index.
submission.to_csv(
    path_or_buf='/content/drive/MyDrive/cs506/submission.csv',
    index=False,
)

In [None]:
# Mount Google Drive at /content/drive. The CSV reads and the submission write
# in the cells above use paths under /content/drive/MyDrive, so on a fresh
# runtime this cell has to be executed before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
