**DATA LOADING**

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

try:
    df=pd.read_csv('../data/fraudTest.csv')
    print("Data loaded successfully!")
except FileNotFoundError:
    print("Error: 'fraudTest.csv' not found. Make sure the file is in the '/data' folder.")

print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nDataFrame Info:")
print(df.info())

print("\nClass Distribution:")
print(df['is_fraud'].value_counts(normalize=True) * 100)


Data loaded successfully!

First 5 rows of the dataset:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Wil

**FEATURE ENGINEERING AND DATA BALANCING**

In [24]:
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# --- 1. Preprocessing and Feature Engineering ---
# Assuming 'df' is your initial DataFrame

# Clean up column names by stripping leading/trailing spaces
df.columns = df.columns.str.strip()

df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df.rename(columns={'amt':'Amount'}, inplace=True)
df['cc_num'] = df['cc_num'].astype(str)

# Sort for time-based calculations
df.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)

# Feature: Time since last transaction
df['time_since_last_transaction'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60
df['time_since_last_transaction'].fillna(0, inplace=True) # Fill NaNs for first transactions

# Feature: Rolling transaction frequency (Corrected Method)
# Temporarily set index for rolling calculation
df_indexed = df.set_index('trans_date_trans_time')
# Calculate rolling count and merge it back
rolling_freq = df_indexed.groupby('cc_num')['cc_num'].rolling('3H').count().rename('transaction_frequency_3hr')
df = df.merge(rolling_freq, left_on=['cc_num', 'trans_date_trans_time'], right_index=True, how='left')

# Categorical encoding
df = pd.get_dummies(df, columns=['gender', 'job'], drop_first=True)

print("DataFrame Info after Preprocessing:")
df.info()
print("\nFinal DataFrame Head:")
print(df.head())

# Now drop columns that are no longer needed
df.drop(columns=['trans_date_trans_time', 'merchant', 'cc_num'], inplace=True)

# Drop all non-numeric columns (object dtype) before modeling
X = df.drop('is_fraud', axis=1)
X = X.select_dtypes(include=[np.number, 'bool']) # Keep only numeric and boolean columns
y = df['is_fraud']

# --- 2. Data Splitting (BEFORE SMOTE) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


# --- 3. Apply SMOTE only on the Training Data ---
print('\nOriginal training dataset shape %s' % Counter(y_train))

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('Resampled training dataset shape %s' % Counter(y_train_res))

# Now you're ready to train your model on (X_train_res, y_train_res)
# And evaluate it on the untouched (X_test, y_test)

  rolling_freq = df_indexed.groupby('cc_num')['cc_num'].rolling('3H').count().rename('transaction_frequency_3hr')


DataFrame Info after Preprocessing:
<class 'pandas.core.frame.DataFrame'>
Index: 555767 entries, 617 to 555203
Columns: 501 entries, Unnamed: 0 to job_Writer
dtypes: bool(478), datetime64[ns](1), float64(7), int64(5), object(10)
memory usage: 355.1+ MB

Final DataFrame Head:
      Unnamed: 0 trans_date_trans_time           cc_num  \
617          617   2020-06-21 15:41:32  180011453250192   
923          923   2020-06-21 17:33:11  180011453250192   
1065        1065   2020-06-21 18:15:55  180011453250192   
1491        1491   2020-06-21 20:49:21  180011453250192   
1692        1692   2020-06-21 22:08:56  180011453250192   

                                merchant     category  Amount  first  last  \
617                      fraud_Brown Inc    kids_pets   42.32  Craig  Dunn   
923   fraud_McDermott, Osinski and Morar         home   60.11  Craig  Dunn   
1065                    fraud_Turner LLC       travel  549.72  Craig  Dunn   
1491              fraud_Jakubowski Group  food_dining   5

**TRAINING AND TESTING MODELS**

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

# --- 1. Model Training ---
print("Starting model training...")

# Individual models
lr_model = LogisticRegression(solver='liblinear', random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Voting Classifier (combines both models)
# We'll use 'soft' voting to leverage the probability predictions
voting_model = VotingClassifier(
    estimators=[('lr', lr_model), ('rf', rf_model)],
    voting='soft',  # Use soft voting for better performance
    n_jobs=-1       # Use all available CPU cores for faster training
)

# Train all models
lr_model.fit(X_train_res, y_train_res)
rf_model.fit(X_train_res, y_train_res)
voting_model.fit(X_train_res, y_train_res)

print("Model training complete.")

# --- 2. Model Prediction and Evaluation ---
print("\nEvaluating model performance on the test set...")

# Make predictions on the original, untouched test set
lr_pred = lr_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
voting_pred = voting_model.predict(X_test)

# Evaluate Logistic Regression
print("\n--- Logistic Regression Results ---")
print("F1-score:", f1_score(y_test, lr_pred))
print("AUC-ROC:", roc_auc_score(y_test, lr_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_pred))

# Evaluate Random Forest
print("\n--- Random Forest Results ---")
print("F1-score:", f1_score(y_test, rf_pred))
print("AUC-ROC:", roc_auc_score(y_test, rf_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))

# Evaluate Voting Classifier
print("\n--- Voting Classifier Results ---")
print("F1-score:", f1_score(y_test, voting_pred))
print("AUC-ROC:", roc_auc_score(y_test, voting_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, voting_pred))

Starting model training...
Model training complete.

Evaluating model performance on the test set...

--- Logistic Regression Results ---
F1-score: 0.10884353741496598
AUC-ROC: 0.8552949518674395
Confusion Matrix:
 [[158252   7835]
 [   156    488]]

--- Random Forest Results ---
F1-score: 0.6368330464716007
AUC-ROC: 0.7868215311237577
Confusion Matrix:
 [[165939    148]
 [   274    370]]

--- Voting Classifier Results ---
F1-score: 0.2903225806451613
AUC-ROC: 0.8569959237482623
Confusion Matrix:
 [[163975   2112]
 [   176    468]]


**BEST THRESHOLD PREDICTION**

In [14]:
import numpy as np
from sklearn.metrics import f1_score

# Get the probability predictions for the fraudulent class (1)
lr_probs = lr_model.predict_proba(X_test)[:, 1]

# Create a list of possible thresholds to test
thresholds = np.arange(0.1, 1.0, 0.05)

best_f1 = 0
best_threshold = 0

# Loop through each threshold and find the best one
for threshold in thresholds:
    # Convert probabilities to a binary prediction based on the new threshold
    y_pred_threshold = (lr_probs >= threshold).astype(int)
    
    # Calculate the F1-score for this threshold
    f1 = f1_score(y_test, y_pred_threshold)
    
    # If this F1-score is the best we've seen, save it
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold
        
print(f"Optimal F1-score for Logistic Regression: {best_f1:.4f} at threshold: {best_threshold:.2f}")

Optimal F1-score for Logistic Regression: 0.2352 at threshold: 0.95


**HYPERTUNING LOGISTIC REGRESSION**


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

# Define a much smaller, more focused grid for faster results
param_grid = {
    'C': [0.1, 1],  # Test two key values for regularization strength
    'penalty': ['l1', 'l2']   # Test both L1 and L2 penalties
}

# Create a Logistic Regression classifier instance
lr_model = LogisticRegression(random_state=42, solver='liblinear')

# Set up GridSearchCV with the Logistic Regression model and the parameter grid
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, scoring='f1', n_jobs=6, verbose=1)

# Fit the grid search to your resampled training data
print("Starting GridSearchCV for Logistic Regression with a smaller grid...")
grid_search_lr.fit(X_train_res, y_train_res)
print("GridSearchCV complete.")

# Print the best hyperparameters and the corresponding F1-score
print(f"Best Hyperparameters for LR: {grid_search_lr.best_params_}")
print(f"Best F1-score from LR Grid Search: {grid_search_lr.best_score_}")

# Get the best model
best_lr_model = grid_search_lr.best_estimator_

# Evaluate the best model on the untouched test set
best_lr_pred = best_lr_model.predict(X_test)

# Evaluate the final, fine-tuned model
print("\n--- Fine-Tuned Logistic Regression Results ---")
print("F1-score:", f1_score(y_test, best_lr_pred))
print("AUC-ROC:", roc_auc_score(y_test, best_lr_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, best_lr_pred))

Starting GridSearchCV for Logistic Regression with a smaller grid...
Fitting 5 folds for each of 4 candidates, totalling 20 fits


3 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\divya\AppData\Local\Programs\Python\Python314\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\divya\AppData\Local\Programs\Python\Python314\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\divya\AppData\Local\Programs\Python\Python314\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1247, in fit
    X, y = validate_data(
      

GridSearchCV complete.
Best Hyperparameters for LR: {'C': 1, 'penalty': 'l1'}
Best F1-score from LR Grid Search: 0.9485727222369041

--- Fine-Tuned Logistic Regression Results ---
F1-score: 0.12684599156118143
AUC-ROC: 0.8540025765513076
Confusion Matrix:
 [[159628   6459]
 [   163    481]]
