In [1]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold, cross_val_score
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV

# Scikit-learn Metrics
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, accuracy_score, pairwise_distances, confusion_matrix, make_scorer, roc_curve, f1_score
import sklearn.metrics as skm

# Scikit-learn Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Combined from both lists
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neural_network import MLPClassifier

from imblearn.combine import SMOTEENN

In [2]:
data = pd.read_csv("fraud.csv")
data.shape

FileNotFoundError: [Errno 2] No such file or directory: 'fraud.csv'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(
    ['FraudFound_P'], axis=1), data['FraudFound_P'], test_size=0.2, random_state=42, stratify=data['FraudFound_P'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [None]:
import lightgbm as lgb

# LigthGBM
lightgbm_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, verbose=-1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv = RFECV(estimator=lightgbm_model, step=1, cv=skf, scoring='f1', n_jobs=-1)
rfecv.fit(X_train, y_train)

In [None]:
optimal_num_features =rfecv.n_features_
print(f"Optimal number of features: {optimal_num_features}")

Optimal number of features: 27


In [None]:
X_selected = rfecv.transform(X_train)

In [None]:
selected_feature_indices = np.where(rfecv.support_)[0]
selected_feature_names = X_train.columns[selected_feature_indices]
X_rfecv = pd.DataFrame(X_selected, columns=selected_feature_names)

In [None]:
X_rfecv.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,...,PastNumberOfClaims,AgeOfVehicle,AgentType,NumberOfSuppliments,AddressChange_Claim,BasePolicy,AgeVehicleRatio,NoPoliceReport,Make_label_encoded,Make_frequency_encoded
0,12.0,3.0,5.0,1.0,2.0,12.0,4.0,1.0,0.0,26.0,...,3.0,6.0,0.0,0.0,0.0,2.0,4.714286,1.0,13.0,0.248833
1,8.0,2.0,2.0,1.0,3.0,8.0,2.0,1.0,0.0,61.0,...,3.0,8.0,0.0,6.0,0.0,2.0,6.444444,1.0,6.0,0.181647
2,11.0,4.0,6.0,1.0,1.0,11.0,4.0,1.0,0.0,68.0,...,3.0,8.0,0.0,0.0,0.0,1.0,7.777778,1.0,13.0,0.248833
3,6.0,4.0,4.0,1.0,5.0,6.0,4.0,1.0,0.0,52.0,...,5.0,7.0,0.0,4.0,0.0,1.0,5.75,1.0,6.0,0.181647
4,4.0,2.0,6.0,1.0,3.0,4.0,2.0,0.0,0.0,42.0,...,3.0,6.0,0.0,6.0,0.0,1.0,5.428571,1.0,17.0,0.202399


In [None]:
model_SVC = SVC(cache_size = 500)

svc_model = Pipeline([
    ('smoteenn', SMOTEENN(random_state=42)),
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('svc', model_SVC)
])

cv_scores = cross_val_score(svc_model , X_rfecv, y_train, cv=skf, scoring='f1', n_jobs=-1)
print("Baseline CV F1-mean:", cv_scores.mean())
print("Baseline CV F1-std:", cv_scores.std())

Baseline CV F1-mean: 0.11283228769752549
Baseline CV F1-std: 2.6430288932761298e-05


In [None]:
!pip install optuna



In [None]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [None]:
def objective(trial):

  param_grid = [{'svc__kernel': ["linear"], 'svc__gamma': [0.1, 1, 10, 100], 'svc__C': [0.1, 1, 10, 100, 1000], 'svc__degree': [0, 1, 2, 3, 4, 5, 6],
                'svc__random_state': [24]}]

  params = {
      'gamma': trial.suggest_float('gamma', 0, 1),
      'C': trial.suggest_float('C', 10, 30),
      'degree': trial.suggest_int('degree', 50, 75),
      'cache_size' : 500,
      'random_state': 42
  }


  model = SVC(**params)
  pipeline = Pipeline([
      ('sampling', SMOTEENN(random_state=42)),
      ('pca', PCA(n_components=0.95, random_state=42)),
      ('classification', model)
  ])

  stratified_kfold = StratifiedKFold(n_splits=3,
                                  shuffle=True,
                                  random_state=42)
  f1_scorer = make_scorer(f1_score, average="macro")
  score = cross_val_score(pipeline, X_rfecv, y_train,
                          scoring=f1_scorer, cv=stratified_kfold, n_jobs=-1)
  auc_mean = np.mean(score)
  return auc_mean

study_SVC = optuna.create_study(direction='maximize', sampler=TPESampler())
study_SVC.optimize(objective, n_trials=100, show_progress_bar=True, n_jobs=-1)

[I 2025-11-09 09:16:55,138] A new study created in memory with name: no-name-f767bf79-549e-48b8-92dc-04abe74008bd


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-09 09:17:24,343] Trial 0 finished with value: 0.3008387317758531 and parameters: {'gamma': 0.47608188228683557, 'C': 23.134484285956525, 'degree': 51}. Best is trial 0 with value: 0.3008387317758531.
[I 2025-11-09 09:17:34,738] Trial 1 finished with value: 0.3361146961397055 and parameters: {'gamma': 0.2007198591101198, 'C': 29.712220027450645, 'degree': 50}. Best is trial 1 with value: 0.3361146961397055.
[I 2025-11-09 09:17:51,280] Trial 2 finished with value: 0.28414817084742544 and parameters: {'gamma': 0.5718611369197846, 'C': 18.068231810530385, 'degree': 51}. Best is trial 1 with value: 0.3361146961397055.
[I 2025-11-09 09:18:00,806] Trial 3 finished with value: 0.3035773715833703 and parameters: {'gamma': 0.43700482439154453, 'C': 26.175827266308417, 'degree': 55}. Best is trial 1 with value: 0.3361146961397055.
[I 2025-11-09 09:18:16,302] Trial 4 finished with value: 0.3295009463317982 and parameters: {'gamma': 0.25436976714689774, 'C': 19.748539679725575, 'degree':

In [None]:
sv = study_SVC.best_params
sv['probability'] = True
print(sv)

{'gamma': 0.01536264874815837, 'C': 18.340634305957956, 'degree': 68, 'probability': True}


In [None]:
best = SVC(**sv)
best.fit(X_train, y_train)

y_pred_svc = best.predict(X_val)
y_proba_svc = best.predict_proba(X_test)[:,1]

print("best params:", study_SVC.best_params)
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba_svc):.4f}")
print(f"PR-AUC Score: {average_precision_score(y_test, y_proba_svc):.4f}")
print("Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_val, y_pred_svc)))
print("Classification Report:")
pd.DataFrame(classification_report(y_val, y_pred_svc, output_dict=True)).transpose()

best params: {'gamma': 0.01536264874815837, 'C': 18.340634305957956, 'degree': 68}
ROC-AUC Score: 0.5936
PR-AUC Score: 0.1050
Confusion Matrix:
      0  1
0  2319  1
1   148  0
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.940008,0.999569,0.968874,2320.0
1,0.0,0.0,0.0,148.0
accuracy,0.939627,0.939627,0.939627,0.939627
macro avg,0.470004,0.499784,0.484437,2468.0
weighted avg,0.883638,0.939627,0.910773,2468.0


In [None]:
print("hello")

hello
