In [None]:
# Install third-party dependencies, then import the required libraries
!pip install xgboost
!pip install category_encoders
!pip install imblearn
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report, precision_score, recall_score
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

# Read the train and test splits from their CSV files (paths are relative
# to the current working directory).
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Feature Engineering
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are coordinates in decimal degrees. They are passed
    through NumPy ufuncs, so scalars, arrays and pandas Series are handled
    element-wise and broadcast against each other.

    The haversine formula is used with an Earth radius of 6371 km.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN.
    # Clamp to the mathematically valid range; NaN inputs stay NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c
    return km

# Derive per-transaction, per-card and per-(card, merchant) features on
# both frames in place. Aggregates are computed within each frame separately.
for df in (train_df, test_df):
    # Single timestamp built from the separate date and time string columns.
    timestamp = pd.to_datetime(df['trans_date'] + ' ' + df['trans_time'])
    df['trans_datetime'] = timestamp

    # Age in fractional years at transaction time; unparseable birth dates
    # are coerced to NaT and therefore yield a missing age.
    birth_date = pd.to_datetime(df['dob'], errors='coerce')
    df['dob'] = birth_date
    df['age'] = (timestamp - birth_date).dt.days / 365.25

    # Calendar components of the transaction timestamp.
    for part in ('hour', 'day', 'month', 'weekday'):
        df[part] = getattr(timestamp.dt, part)

    # Distance in km between the (lat, long) and (merch_lat, merch_long)
    # coordinates of each row.
    df['distance'] = haversine_distance(
        df['lat'], df['long'], df['merch_lat'], df['merch_long']
    )

    # Rows must be ordered by unix_time within each card before diff() so
    # that each gap is measured against that card's previous transaction.
    df.sort_values(by=['cc_num', 'unix_time'], inplace=True)

    per_card = df.groupby('cc_num')
    gap = per_card['unix_time'].diff()  # NaN for a card's first row
    card_amt_mean = per_card['amt'].transform('mean')
    card_amt_std = per_card['amt'].transform('std')

    df['time_since_last_txn'] = gap
    # "+ 1" keeps the denominator non-zero when two rows share a unix_time.
    df['velocity'] = df['distance'] / (gap + 1)

    # Per-card spending profile and this amount relative to it.
    df['amt_mean_user'] = card_amt_mean
    df['amt_std_user'] = card_amt_std
    df['amt_ratio'] = df['amt'] / (card_amt_mean + 1)

    # How often, and for how much on average, this card appears with this
    # merchant within the frame.
    per_card_merchant = df.groupby(['cc_num', 'merchant'])
    pair_count = per_card_merchant['id'].transform('count')
    pair_amt_mean = per_card_merchant['amt'].transform('mean')
    df['merchant_cardholder_freq'] = pair_count
    df['merchant_cardholder_avg_amt'] = pair_amt_mean

# Binary-encode gender. Series.map leaves any value that is not a key of
# gender_map as NaN.
gender_map = {'F': 0, 'M': 1}
train_df['gender'] = train_df['gender'].map(gender_map)
test_df['gender'] = test_df['gender'].map(gender_map)

# Target-encode the remaining categorical columns. The encoder is fitted on
# the training labels only and then applied to both frames.
# NOTE(review): the fit uses every training row, including rows that are
# later held out for validation.
categorical_features = ['category', 'job', 'state']
encoder = TargetEncoder(cols=categorical_features).fit(
    train_df[categorical_features], train_df['is_fraud']
)
for df in (train_df, test_df):
    df[categorical_features] = encoder.transform(df[categorical_features])

# Add log1p-transformed copies of amt and distance, followed by simple
# pairwise product features, to both frames.
for df in (train_df, test_df):
    amount = df['amt']
    dist = df['distance']

    # log(1 + x) transforms
    df['amt_log'] = np.log1p(amount)
    df['distance_log'] = np.log1p(dist)

    # Pairwise products
    df['hour_day_interaction'] = df['hour'] * df['day']
    df['amt_age_interaction'] = amount * df['age']
    df['amt_distance_interaction'] = amount * dist

# Columns fed to the model. The order is significant: it fixes the column
# order of every matrix built from this list.
features = [
    # transaction amount and per-card amount statistics
    'amt', 'amt_mean_user', 'amt_std_user', 'amt_ratio',
    # cardholder / transaction attributes
    'gender', 'category', 'age', 'city_pop', 'job',
    # timestamp components
    'hour', 'day', 'month', 'weekday',
    # location
    'distance', 'state',
    # log transforms
    'amt_log', 'distance_log',
    # pairwise products
    'hour_day_interaction', 'amt_age_interaction', 'amt_distance_interaction',
    # card-merchant aggregates
    'merchant_cardholder_freq', 'merchant_cardholder_avg_amt',
    # timing between a card's consecutive rows
    'time_since_last_txn', 'velocity',
]

# Model inputs for both frames, plus the training target.
X, X_test = (frame[features] for frame in (train_df, test_df))
y = train_df['is_fraud']

def _as_feature_frame(values):
    """Wrap a transformer's array output in a DataFrame labelled with `features`."""
    return pd.DataFrame(values, columns=features)


# Fill missing values with the per-column median learned from the training
# matrix; the same medians are reused for the test matrix.
imputer = SimpleImputer(strategy='median')
X = _as_feature_frame(imputer.fit_transform(X))
X_test = _as_feature_frame(imputer.transform(X_test))

# Power-transform the features; parameters are fitted on the training matrix
# and applied unchanged to the test matrix.
scaler = PowerTransformer()
X = _as_feature_frame(scaler.fit_transform(X))
X_test = _as_feature_frame(scaler.transform(X_test))

# Split the training data BEFORE resampling. Resampling first would place
# SMOTE-synthesised rows (interpolated from real minority samples) and an
# artificially balanced class ratio into the validation split, so every
# validation metric would be inflated. The validation split must contain only
# real rows at the original class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Address class imbalance on the training split only; X and y themselves are
# left un-resampled.
smote = SMOTE(random_state=42, k_neighbors=5)
undersampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_train, y_train = smote.fit_resample(X_train, y_train)
X_train, y_train = undersampler.fit_resample(X_train, y_train)
# NOTE(review): the cross-validation inside the hyperparameter search still
# runs on this resampled split, so its per-fold scores remain optimistic.
# Resampling per fold (e.g. via an imblearn Pipeline) would remove that too.

# Hyperparameter tuning for XGBoost: discrete candidate values for each
# parameter, sampled at random by the search below.
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'gamma': [0, 0.1, 0.3, 0.5],
    'reg_alpha': [0, 0.1, 0.5, 1, 2],
    'reg_lambda': [1, 1.5, 2, 3, 5],
    'min_child_weight': [1, 3, 5],
    'scale_pos_weight': [1, 2, 5]
}

# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases
# and may only produce a warning there - confirm against the installed version.
model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# 200 sampled candidates x 5 stratified folds = 1000 fits, scored by F1.
# `random_state` fixes which candidates are drawn from `param_dist`, so the
# search is reproducible like the other seeded steps in this script.
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=200, scoring='f1',
    n_jobs=-1, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), verbose=2,
    random_state=42
)
random_search.fit(X_train, y_train)

# Evaluate the best model found by the search on the held-out validation split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)                     # hard 0/1 labels
y_pred_proba = best_model.predict_proba(X_val)[:, 1]   # probability of class 1

print('Validation F1 Score:', f1_score(y_val, y_pred))
print('Validation ROC AUC:', roc_auc_score(y_val, y_pred_proba))
# The report is a multi-line table whose first line is the column header;
# print it on its own lines so the header stays aligned with the rows below.
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')
print('Precision:', precision_score(y_val, y_pred))
print('Recall:', recall_score(y_val, y_pred))

# Predict class labels for the test matrix and write them next to the
# matching ids. Predictions are attached positionally, in test_df's current
# row order.
test_preds = best_model.predict(X_test)
submission = test_df[['id']].assign(is_fraud=test_preds)
submission.to_csv('submission.csv', index=False)