In [1]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold, cross_val_score
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV

# Scikit-learn Metrics
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, accuracy_score, pairwise_distances, confusion_matrix, make_scorer, roc_curve, f1_score
import sklearn.metrics as skm

# Scikit-learn Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Combined from both lists
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neural_network import MLPClassifier

from imblearn.combine import SMOTEENN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pwd

'/content'

In [3]:
data = pd.read_csv("fraud.csv")
data.shape

(15420, 45)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(
    ['FraudFound_P'], axis=1), data['FraudFound_P'], test_size=0.2, random_state=42, stratify=data['FraudFound_P'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [30]:
import lightgbm as lgb

# LigthGBM
lightgbm_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, verbose=-1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rfecv = RFECV(estimator=lightgbm_model, step=1, cv=skf, scoring='f1', n_jobs=-1)
rfecv.fit(X_train, y_train)



In [31]:
optimal_num_features =rfecv.n_features_
print(f"Optimal number of features: {optimal_num_features}")

Optimal number of features: 27


In [32]:
X_selected = rfecv.transform(X_train)

In [33]:
selected_feature_indices = np.where(rfecv.support_)[0]
selected_feature_names = X_train.columns[selected_feature_indices]
X_rfecv = pd.DataFrame(X_selected, columns=selected_feature_names)

In [34]:
X_rfecv.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,...,PastNumberOfClaims,AgeOfVehicle,AgentType,NumberOfSuppliments,AddressChange_Claim,BasePolicy,AgeVehicleRatio,NoPoliceReport,Make_label_encoded,Make_frequency_encoded
0,12.0,3.0,5.0,1.0,2.0,12.0,4.0,1.0,0.0,26.0,...,3.0,6.0,0.0,0.0,0.0,2.0,4.714286,1.0,13.0,0.248833
1,8.0,2.0,2.0,1.0,3.0,8.0,2.0,1.0,0.0,61.0,...,3.0,8.0,0.0,6.0,0.0,2.0,6.444444,1.0,6.0,0.181647
2,11.0,4.0,6.0,1.0,1.0,11.0,4.0,1.0,0.0,68.0,...,3.0,8.0,0.0,0.0,0.0,1.0,7.777778,1.0,13.0,0.248833
3,6.0,4.0,4.0,1.0,5.0,6.0,4.0,1.0,0.0,52.0,...,5.0,7.0,0.0,4.0,0.0,1.0,5.75,1.0,6.0,0.181647
4,4.0,2.0,6.0,1.0,3.0,4.0,2.0,0.0,0.0,42.0,...,3.0,6.0,0.0,6.0,0.0,1.0,5.428571,1.0,17.0,0.202399


In [35]:
model_SVC = SVC(cache_size = 500)

svc_model = Pipeline([
    ('smoteenn', SMOTEENN(random_state=42)),
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('svc', model_SVC)
])

cv_scores = cross_val_score(svc_model , X_rfecv, y_train, cv=skf, scoring='f1', n_jobs=-1)
print("Baseline CV F1-mean:", cv_scores.mean())
print("Baseline CV F1-std:", cv_scores.std())

Baseline CV F1-mean: 0.11283228769752549
Baseline CV F1-std: 2.6430288932761298e-05


In [36]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [37]:
def objective(trial):

  param_grid = [{'svc__kernel': ["linear"], 'svc__gamma': [0.1, 1, 10, 100], 'svc__C': [0.1, 1, 10, 100, 1000], 'svc__degree': [0, 1, 2, 3, 4, 5, 6],
                'svc__random_state': [24]}]

  params = {
      'gamma': trial.suggest_float('gamma', 0, 1),
      'C': trial.suggest_float('C', 10, 30),
      'degree': trial.suggest_int('degree', 50, 75),
      'cache_size' : 500,
      'random_state': 42
  }


  model = SVC(**params)
  pipeline = Pipeline([
      ('sampling', SMOTEENN(random_state=42)),
      ('pca', PCA(n_components=0.95, random_state=42)),
      ('classification', model)
  ])

  stratified_kfold = StratifiedKFold(n_splits=3,
                                  shuffle=True,
                                  random_state=42)
  f1_scorer = make_scorer(f1_score, average="macro")
  score = cross_val_score(pipeline, X_rfecv, y_train,
                          scoring=f1_scorer, cv=stratified_kfold, n_jobs=-1)
  auc_mean = np.mean(score)
  return auc_mean

study_SVC = optuna.create_study(direction='maximize', sampler=TPESampler())
study_SVC.optimize(objective, n_trials=100, show_progress_bar=True, n_jobs=-1)

[I 2025-10-29 15:28:43,439] A new study created in memory with name: no-name-6f31971f-f450-41df-8a94-784926e8ddad


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-29 15:29:00,435] Trial 1 finished with value: 0.2703981615350222 and parameters: {'gamma': 0.6882090754774116, 'C': 27.51492516925372, 'degree': 58}. Best is trial 1 with value: 0.2703981615350222.
[I 2025-10-29 15:29:10,387] Trial 0 finished with value: 0.24864798948515723 and parameters: {'gamma': 0.8910366358911546, 'C': 11.289125807253352, 'degree': 65}. Best is trial 1 with value: 0.2703981615350222.
[I 2025-10-29 15:29:26,931] Trial 2 finished with value: 0.26738727797436385 and parameters: {'gamma': 0.7183130426211466, 'C': 18.780569933507884, 'degree': 73}. Best is trial 1 with value: 0.2703981615350222.
[I 2025-10-29 15:29:36,315] Trial 3 finished with value: 0.33657528536774256 and parameters: {'gamma': 0.20989667675187973, 'C': 25.13838677260551, 'degree': 50}. Best is trial 3 with value: 0.33657528536774256.
[I 2025-10-29 15:29:54,033] Trial 4 finished with value: 0.34361751564966897 and parameters: {'gamma': 0.17524938790986733, 'C': 12.172406823607167, 'degree'

In [39]:
best = SVC(**study_SVC.best_params)
best.fit(X_train, y_train)

y_pred_svc = best.predict(X_test)
# y_proba_svc = best.predict_proba(X_test)[:,1]

print("best params:", study_SVC.best_params)
# print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba_svc):.4f}")
# print(f"PR-AUC Score: {average_precision_score(y_test, y_proba_svc):.4f}")
print("Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, y_pred_svc)))
print("Classification Report:")
pd.DataFrame(classification_report(y_test, y_pred_svc, output_dict=True)).transpose()

best params: {'gamma': 0.016505027847736282, 'C': 23.92426642456753, 'degree': 54}
Confusion Matrix:
      0  1
0  2898  1
1   184  1
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.940299,0.999655,0.969069,2899.0
1,0.5,0.005405,0.010695,185.0
accuracy,0.940013,0.940013,0.940013,0.940013
macro avg,0.720149,0.50253,0.489882,3084.0
weighted avg,0.913886,0.940013,0.911579,3084.0


In [40]:
print("hello")

hello
