# Libraries

In [82]:
import pandas as pd
from os.path import exists
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.ensemble import HistGradientBoostingClassifier 

# Loading the File

In [3]:
# Read the training and test CSV files into DataFrames
trainingSet, testingSet = (
    pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv')
)

# Report the dimensions of both frames, followed by an empty line
print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape)
print()

# Preview the first rows of each frame, separated by one empty line
print(trainingSet.head(), testingSet.head(), sep="\n\n")
print()

# Summary statistics of the training frame's numeric columns
print(trainingSet.describe())

train.csv shape is  (370703, 24)
test.csv shape is  (92676, 23)

        id                         trans_num  trans_date trans_time  \
0   308467  26ad750c2ff71f32631b58913582d70a  2024-01-10   06:49:39   
1   261578  fea9c1efe3f2b97f27ad0ab5409ec861  2024-01-06   02:37:50   
2      341  2ae350b982be840f3666273e0c2f3a05  2024-01-18   21:40:21   
3  1147639  bbdd8adfc0a34ed0e817f809193c85c0  2024-01-21   16:20:15   
4   314152  fc7756004dc2a9bc450eb894a670b804  2024-01-21   19:36:26   

    unix_time        category     amt            cc_num    first     last  \
0  1704887379        misc_pos  188.38      676355457570   Andrea  Johnson   
1  1704526670     grocery_pos  102.63   377178373574671   Rhonda   Chavez   
2  1705632021   entertainment    1.62  3599292013370451  Stephen     Khan   
3  1705872015  health_fitness    5.64  3594292572430345   Justin   Reilly   
4  1705883786  health_fitness   97.09  4867547663675548    Alice   Duarte   

   ...    zip      lat      long city_pop    

# Adding Features

In [132]:
def add_features_to(df, category_levels=None):
    """Return a copy of ``df`` extended with the engineered model features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``category``, ``state``, ``merchant``,
        ``city``, ``job``, ``gender``, ``dob``, ``trans_date``,
        ``trans_time`` (``HH:MM:SS`` strings) and ``city_pop``.
    category_levels : dict, optional
        Maps a categorical column name to an ordered sequence of its levels.
        When a column is listed, its ``<column>_num`` code is the position of
        the value in that sequence (values not in the sequence get -1), which
        lets several frames (e.g. train and test) share one encoding.
        Columns that are not listed are encoded from the sorted distinct
        values of this frame alone, so codes are only comparable across
        frames whose value sets are identical.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``dob`` and ``trans_date`` are datetimes,
        ``trans_time`` holds ``datetime.time`` objects, and these columns are
        added: ``category_num``, ``state_num``, ``merchant_num``,
        ``city_num``, ``job_num``, ``gender_num``, ``age``, ``year``,
        ``month``, ``day``, ``hour``, ``minute``, ``second``, ``is_urban``
        and ``is_weekend``. Missing categorical values are encoded as -1.
    """
    # Copy first: the steps below overwrite dob / trans_date / trans_time, and
    # a trans_time column that already holds datetime.time objects cannot be
    # parsed again. Mutating the caller's frame would therefore make a second
    # call on the same raw frame fail.
    df = df.copy()
    category_levels = category_levels or {}

    # Integer-encode the categorical columns (sorted levels -> 0..n-1)
    for column in ('category', 'state', 'merchant', 'city', 'job', 'gender'):
        levels = category_levels.get(column)
        if levels is None:
            codes, _ = pd.factorize(df[column], sort=True)
        else:
            codes = pd.Categorical(df[column], categories=levels).codes
        df[column + '_num'] = codes.astype('int64')

    # Age in whole years on a fixed reference date, so that predictions do not
    # depend on the day the notebook is run
    df['dob'] = pd.to_datetime(df['dob'])
    reference_date = datetime(2024, 12, 4)
    birth_month = df['dob'].dt.month
    birth_day = df['dob'].dt.day
    # True where the birthday falls after the reference day within the year,
    # i.e. the person has not yet had their birthday in the reference year
    birthday_pending = (birth_month > reference_date.month) | (
        (birth_month == reference_date.month) & (birth_day > reference_date.day)
    )
    df['age'] = reference_date.year - df['dob'].dt.year - birthday_pending.astype('int64')

    # Calendar components of the transaction date
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['year'] = df['trans_date'].dt.year
    df['month'] = df['trans_date'].dt.month
    df['day'] = df['trans_date'].dt.day

    # Clock components of the transaction time; parse once, then read the
    # components through the vectorised .dt accessor
    parsed_time = pd.to_datetime(df['trans_time'], format='%H:%M:%S')
    df['trans_time'] = parsed_time.dt.time
    df['hour'] = parsed_time.dt.hour
    df['minute'] = parsed_time.dt.minute
    df['second'] = parsed_time.dt.second

    # Per-card aggregate features (unique merchants/categories per day, a
    # historical-similarity score, merchant co-occurrence counts and
    # high-value transaction counts) were previously sketched here but
    # disabled; they are intentionally not computed.

    # Urban flag: city population strictly above 50,000
    df['is_urban'] = df['city_pop'] > 50000
    # Weekend flag: dayofweek is 0 for Monday, so 5 and 6 are Saturday/Sunday
    df['is_weekend'] = (df['trans_date'].dt.dayofweek >= 5).astype('int64')

    return df


# Run feature engineering on both the training and the submission data
X_train = add_features_to(trainingSet)
X_submission = add_features_to(testingSet)

# Keep only the training rows that actually carry an is_fraud label
X_train = X_train.loc[X_train['is_fraud'].notna()]

# Persist both engineered frames as CSV (row index is not written)
X_submission.to_csv("./data/X_submission.csv", index=False)
X_train.to_csv("./data/X_train.csv", index=False)

# Split into Training & Testing Sets

In [133]:
# Hold out a quarter of the labelled rows for evaluation; the remaining
# three quarters are used to fit the model.
# NOTE: X_train is rebound here to the feature part of the training split
# (without 'is_fraud'), so re-running this cell requires re-running the
# feature-engineering cell first.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train.drop('is_fraud', axis=1),
    X_train['is_fraud'],
    random_state=0,
    test_size=0.25,
)

# Feature Selection

In [134]:
# Columns fed to the model
features = ['amt', 'category_num', 'unix_time', 'gender_num', 'age', 'hour', 'day', 'is_urban', 'is_weekend']

# Restrict the training, held-out and submission frames to those columns
X_train_select, X_test_select, X_submission_select = (
    frame[features] for frame in (X_train, X_test, X_submission)
)

# Model Creation

In [135]:
# Configure the histogram-based gradient boosting classifier and fit it on
# the selected training features (fit returns the fitted estimator itself)
model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=2000,
    max_leaf_nodes=120,
    min_samples_leaf=80,
    random_state=42,
).fit(X_train_select, Y_train)

# Score the fitted model on the held-out split
predictions = model.predict(X_test_select)
print("F1 Score:", f1_score(y_true=Y_test, y_pred=predictions))

F1 Score: 0.9788825757575758


# Saving The File

In [140]:
# Predict a label for every submission row and store it on the frame
X_submission['is_fraud'] = model.predict(X_submission_select)

# Keep only the id and the predicted label, then write the submission CSV
submission = X_submission.loc[:, ['id', 'is_fraud']]
submission.to_csv("./data/submission.csv", index=False)