In [3]:
import os
import pandas as pd

# Get the current working directory
current_dir = os.getcwd()

# Define the relative path to your CSV file
csv_filename = 'output3.csv'

# Construct the full path based on the operating system
csv_path = os.path.join(current_dir, csv_filename)

# Read the CSV file
dfEU = pd.read_csv(csv_path)


In [4]:
import pandas as pd


dfEU['target'] = dfEU['js_abr'] + dfEU['js_hom'] + dfEU['imp_imm']

# Apply the condition to label as 0 or 1
dfEU['target'] = dfEU['target'].apply(lambda x: 1 if x >= 12 else 0)

In [5]:
dfEU.drop(['js_abr', 'imp_imm', 'js_hom'], axis=1, inplace=True)

In [6]:
# Assuming dfEU is your DataFrame
value_counts = dfEU['target'].value_counts()
print(value_counts)


target
1    6997
0    2057
Name: count, dtype: int64


### RF - PARAM GRID - METHODS OK 

In [7]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Downsample the training data
X_train_down, y_train_down = downsample(X_train, y_train, 0.7)

# Train the classifier
classifier = RandomForestClassifier(n_estimators=50, max_depth=20, min_samples_leaf=4, min_samples_split=2, random_state=42)
classifier.fit(X_train_down, y_train_down)

# Evaluate classifier on the test set
y_pred = classifier.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate Sensitivity, Specificity, Precision, and F1 score
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
precision = tp / (tp + fp)
f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)

# Print classification report
print("Classification Report - Downsampled (Sampling Ratio 0.7):")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix - Downsampled (Sampling Ratio 0.7):")
print(confusion_matrix(y_test, y_pred))

# Print AUC, overall accuracy, Sensitivity, Specificity, Precision, and F1 score
print(f"\nAUC - Downsampled (Sampling Ratio 0.7): {auc_score}")
print(f"Overall Accuracy - Downsampled (Sampling Ratio 0.7): {accuracy}")
print(f"Sensitivity - Downsampled (Sampling Ratio 0.7): {sensitivity}")
print(f"Specificity - Downsampled (Sampling Ratio 0.7): {specificity}")
print(f"Precision - Downsampled (Sampling Ratio 0.7): {precision}")
print(f"F1 Score - Downsampled (Sampling Ratio 0.7): {f1_score}")


Classification Report - Downsampled (Sampling Ratio 0.7):
              precision    recall  f1-score   support

           0       0.69      0.77      0.73       444
           1       0.92      0.89      0.91      1367

    accuracy                           0.86      1811
   macro avg       0.81      0.83      0.82      1811
weighted avg       0.87      0.86      0.86      1811


Confusion Matrix - Downsampled (Sampling Ratio 0.7):
[[ 344  100]
 [ 151 1216]]

AUC - Downsampled (Sampling Ratio 0.7): 0.8321569557853392
Overall Accuracy - Downsampled (Sampling Ratio 0.7): 0.8614025400331309
Sensitivity - Downsampled (Sampling Ratio 0.7): 0.8895391367959035
Specificity - Downsampled (Sampling Ratio 0.7): 0.7747747747747747
Precision - Downsampled (Sampling Ratio 0.7): 0.9240121580547113
F1 Score - Downsampled (Sampling Ratio 0.7): 0.9064480059634737


### KNN - PARAM GRID - METHODS OK

In [8]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Downsample the training data with ratio=0.7
X_train_down, y_train_down = downsample(X_train, y_train, 0.7)

# Best parameters
best_params = {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}

# Create KNN classifier with best parameters
knn = KNeighborsClassifier(**best_params)

# Fit the model on the downsampled training data
knn.fit(X_train_down, y_train_down)

# Evaluate the model on the test set
y_pred = knn.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate Sensitivity, Specificity, Precision, and F1 score
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
precision = tp / (tp + fp)
f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print AUC, overall accuracy, Sensitivity, Specificity, Precision, and F1 score
print(f"\nAUC: {auc_score}")
print(f"Overall Accuracy: {accuracy}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1_score}")


Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.70      0.66       444
           1       0.90      0.87      0.88      1367

    accuracy                           0.83      1811
   macro avg       0.76      0.78      0.77      1811
weighted avg       0.83      0.83      0.83      1811


Confusion Matrix:
[[ 310  134]
 [ 181 1186]]

AUC: 0.7828957340661803
Overall Accuracy: 0.8260629486471562
Sensitivity: 0.8675932699341624
Specificity: 0.6981981981981982
Precision: 0.8984848484848484
F1 Score: 0.8827688872348344


### SVM - LINEAR KERNEL - PARAM GRID - METHODS OK

In [9]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform downsampling for ratio=0.7
ratio = 0.7
X_train_down, y_train_down = downsample(X_train, y_train, ratio)

# Train SVM classifier using downsampled data
classifier = SVC(C=0.1)  # Setting C=0.1
classifier.fit(X_train_down, y_train_down)

# Evaluate classifier on the test set
y_pred = classifier.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Calculate Sensitivity and Specificity
conf_matrix = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = conf_matrix.ravel()
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

# Print classification report
print(f"Classification Report - Downsampled (Sampling Ratio {ratio}):")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print(f"\nConfusion Matrix - Downsampled (Sampling Ratio {ratio}):")
print(conf_matrix)

# Print AUC, overall accuracy, precision, recall, F1 score, Sensitivity, and Specificity
print(f"\nAUC - Downsampled (Sampling Ratio {ratio}): {auc_score}")
print(f"Overall Accuracy - Downsampled (Sampling Ratio {ratio}): {accuracy}")
print(f"Precision - Downsampled (Sampling Ratio {ratio}): {precision}")
print(f"Recall/Sensitivity - Downsampled (Sampling Ratio {ratio}): {sensitivity}")
print(f"Specificity - Downsampled (Sampling Ratio {ratio}): {specificity}")
print(f"F1 Score - Downsampled (Sampling Ratio {ratio}): {f1}")


Classification Report - Downsampled (Sampling Ratio 0.7):
              precision    recall  f1-score   support

           0       0.67      0.77      0.72       444
           1       0.92      0.87      0.90      1367

    accuracy                           0.85      1811
   macro avg       0.79      0.82      0.81      1811
weighted avg       0.86      0.85      0.85      1811


Confusion Matrix - Downsampled (Sampling Ratio 0.7):
[[ 344  100]
 [ 172 1195]]

AUC - Downsampled (Sampling Ratio 0.7): 0.8244759023837297
Overall Accuracy - Downsampled (Sampling Ratio 0.7): 0.8498067366096079
Precision - Downsampled (Sampling Ratio 0.7): 0.9227799227799228
Recall/Sensitivity - Downsampled (Sampling Ratio 0.7): 0.8741770299926848
Specificity - Downsampled (Sampling Ratio 0.7): 0.7747747747747747
F1 Score - Downsampled (Sampling Ratio 0.7): 0.8978211870773855


### SVM - RBF KERNEL - PARAM GRID - METHODS OK

In [10]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

def calculate_metrics(y_true, y_pred):
    auc = roc_auc_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return auc, accuracy, precision, recall, f1, specificity

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Downsample the training data with ratio=0.7
X_train_down, y_train_down = downsample(X_train, y_train, 0.7)

# Define parameter grid for SVM with only RBF kernel
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 0.01, 0.001]
}

# Create SVM classifier with RBF kernel
svm = SVC(kernel='rbf')

# Train the SVM classifier with the provided parameters
svm.set_params(C=1, gamma=0.001)
svm.fit(X_train_down, y_train_down)

# Predict on the test set
y_pred = svm.predict(X_test)

# Calculate evaluation metrics
auc_score, accuracy, precision, recall, f1, specificity = calculate_metrics(y_test, y_pred)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print evaluation metrics
print("\nEvaluation Metrics:")
print(f"AUC: {auc_score}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall (Sensitivity): {recall}")
print(f"Specificity: {specificity}")
print(f"F1 Score: {f1}")


Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.80      0.74       444
           1       0.93      0.88      0.91      1367

    accuracy                           0.86      1811
   macro avg       0.81      0.84      0.82      1811
weighted avg       0.87      0.86      0.86      1811


Confusion Matrix:
[[ 353   91]
 [ 160 1207]]

Evaluation Metrics:
AUC: 0.8390002108912131
Accuracy: 0.8614025400331309
Precision: 0.9298921417565486
Recall (Sensitivity): 0.8829553767373811
Specificity: 0.795045045045045
F1 Score: 0.9058161350844278


In [13]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Downsample the training data with ratio=0.7
X_train_down, y_train_down = downsample(X_train, y_train, 0.7)

# Define parameter grid for SVM with only polynomial kernel
param_grid = {
    'C': [0.1],
    'degree': [3]
}

# Create SVM classifier with polynomial kernel
svm = SVC(kernel='poly')

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train_down, y_train_down)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best estimator on the test set
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred)

# Calculate Sensitivity (Recall)
sensitivity = recall_score(y_test, y_pred)

# Calculate Specificity
conf_matrix = confusion_matrix(y_test, y_pred)
specificity = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1])

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate Precision
precision = precision_score(y_test, y_pred)

# Calculate F1 Score
f1 = f1_score(y_test, y_pred)

# Print AUC, Sensitivity, Specificity, Accuracy, Precision, and F1 Score
print(f"AUC: {auc_score}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ....................................C=0.1, degree=3; total time=   0.3s
[CV] END ....................................C=0.1, degree=3; total time=   0.3s
[CV] END ....................................C=0.1, degree=3; total time=   0.3s
[CV] END ....................................C=0.1, degree=3; total time=   0.3s
[CV] END ....................................C=0.1, degree=3; total time=   0.3s
Best Parameters: {'C': 0.1, 'degree': 3}
Best Score: 0.8338002059061171
AUC: 0.8384993442601344
Sensitivity: 0.8639356254572056
Specificity: 0.8130630630630631
Accuracy: 0.8514632799558255
Precision: 0.9343354430379747
F1 Score: 0.8977575066514634


### LOGISTIC REGRESSION - PARAM GRID - METHODS OK

In [15]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Define the pipeline
pipeline = Pipeline([
    ('sampling', RandomUnderSampler(sampling_strategy=0.8)),
    ('classifier', LogisticRegression(C=0.01))
])

# Define the parameter grid
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2']
}

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_estimator = grid_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred = best_estimator.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate sensitivity (recall)
sensitivity = recall_score(y_test, y_pred)

# Calculate specificity
specificity = recall_score(y_test, y_pred, pos_label=0)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Print classification report
print("Best Estimator:")
print(best_estimator)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print AUC, overall accuracy, precision, sensitivity, specificity, and F1 score
print("\nAUC:", auc_score)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)
print("F1 Score:", f1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Estimator:
Pipeline(steps=[('sampling', RandomUnderSampler(sampling_strategy=0.8)),
                ('classifier', LogisticRegression(C=0.01))])

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.82      0.73       444
           1       0.94      0.86      0.90      1367

    accuracy                           0.85      1811
   macro avg       0.80      0.84      0.81      1811
weighted avg       0.87      0.85      0.86      1811


Confusion Matrix:
[[ 364   80]
 [ 190 1177]]

AUC: 0.84041466484773
Accuracy: 0.8509110988404197
Precision: 0.9363564041368337
Sensitivity: 0.8610095098756401
Specificity: 0.8198198198198198
F1 Score: 0.8971036585365854


### GRADIENT BOOSTING - PARAM GRID - METHODS OK

In [19]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

# Define a function for downsampling
def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 4, 5]
}

# Create a pipeline with the downsampling step and the classifier
pipeline = Pipeline([
    ('sampling', RandomUnderSampler(sampling_strategy=0.9)),
    ('classifier', GradientBoostingClassifier())
])

# Define the scorer
scorer = {
    'AUC': 'roc_auc',
    'Sensitivity': make_scorer(recall_score, pos_label=1),
    'Specificity': make_scorer(recall_score, pos_label=0),
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'F1 score': 'f1'
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5, refit='AUC')
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding AUC score
print("Best Parameters:", grid_search.best_params_)
print("Best AUC Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the specified metrics
print("AUC Score on Test Set:", roc_auc_score(y_test, y_pred))
print("Sensitivity on Test Set:", recall_score(y_test, y_pred))
print("Specificity on Test Set:", recall_score(y_test, y_pred, pos_label=0))
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("Precision on Test Set:", precision_score(y_test, y_pred))
print("F1 score on Test Set:", f1_score(y_test, y_pred))


KeyboardInterrupt: 

sopra stoppato io

In [21]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

# Define a function for downsampling
def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline with best parameters
pipeline = Pipeline([
    ('sampling', RandomUnderSampler(sampling_strategy=0.9)),  # Random undersampling with ratio 0.9
    ('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100))  # Gradient Boosting Classifier
])

# Define the scorer
scorer = {
    'AUC': 'roc_auc',
    'Sensitivity': make_scorer(recall_score, pos_label=1),
    'Specificity': make_scorer(recall_score, pos_label=0),
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'F1 score': 'f1'
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5, refit='AUC')
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding AUC score
print("Best Parameters:", grid_search.best_params_)
print("Best AUC Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the specified metrics
print("AUC Score on Test Set:", roc_auc_score(y_test, y_pred))
print("Sensitivity on Test Set:", recall_score(y_test, y_pred))
print("Specificity on Test Set:", recall_score(y_test, y_pred, pos_label=0))
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("Precision on Test Set:", precision_score(y_test, y_pred))
print("F1 score on Test Set:", f1_score(y_test, y_pred))


Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best AUC Score: 0.9153216771993173
AUC Score on Test Set: 0.8374020509170473
Sensitivity on Test Set: 0.8617410387710315
Specificity on Test Set: 0.8130630630630631
Accuracy on Test Set: 0.8498067366096079
Precision on Test Set: 0.9341792228390167
F1 score on Test Set: 0.8964992389649924


### LDAM - **NO** PARAM GRID - METHODS OK

In [22]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def downsample(X_train, y_train, sampling_ratio):
    undersample = RandomUnderSampler(sampling_strategy=sampling_ratio)
    X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

# Separate features and target variable
X = dfEU.drop('target', axis=1)  # Features
y = dfEU['target']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform downsampling for sampling ratio 0.7
ratio = 0.7
X_train_down, y_train_down = downsample(X_train, y_train, ratio)

# Initialize LDA classifier
lda = LinearDiscriminantAnalysis()

# Fit LDA classifier using downsampled data
lda.fit(X_train_down, y_train_down)

# Predict using LDA classifier
y_pred_down = lda.predict(X_test)

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred_down)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred_down)

# Calculate Sensitivity (Recall)
sensitivity = recall_score(y_test, y_pred_down)

# Calculate Specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_down).ravel()
specificity = tn / (tn + fp)

# Calculate Precision
precision = precision_score(y_test, y_pred_down)

# Calculate F1 score
f1 = f1_score(y_test, y_pred_down)

# Print classification report
print(f"Classification Report - Downsampled (Sampling Ratio {ratio}):")
print(classification_report(y_test, y_pred_down))

# Print confusion matrix
print(f"\nConfusion Matrix - Downsampled (Sampling Ratio {ratio}):")
print(confusion_matrix(y_test, y_pred_down))

# Print AUC, overall accuracy, Sensitivity, Specificity, Precision, and F1 score
print(f"\nAUC - Downsampled (Sampling Ratio {ratio}): {auc_score}")
print(f"Overall Accuracy - Downsampled (Sampling Ratio {ratio}): {accuracy}")
print(f"Sensitivity - Downsampled (Sampling Ratio {ratio}): {sensitivity}")
print(f"Specificity - Downsampled (Sampling Ratio {ratio}): {specificity}")
print(f"Precision - Downsampled (Sampling Ratio {ratio}): {precision}")
print(f"F1 Score - Downsampled (Sampling Ratio {ratio}): {f1}")

print()


Classification Report - Downsampled (Sampling Ratio 0.7):
              precision    recall  f1-score   support

           0       0.66      0.79      0.72       444
           1       0.93      0.86      0.90      1367

    accuracy                           0.85      1811
   macro avg       0.79      0.83      0.81      1811
weighted avg       0.86      0.85      0.85      1811


Confusion Matrix - Downsampled (Sampling Ratio 0.7):
[[ 352   92]
 [ 185 1182]]

AUC - Downsampled (Sampling Ratio 0.7): 0.8287299735726948
Overall Accuracy - Downsampled (Sampling Ratio 0.7): 0.8470458310325787
Sensitivity - Downsampled (Sampling Ratio 0.7): 0.8646671543525969
Specificity - Downsampled (Sampling Ratio 0.7): 0.7927927927927928
Precision - Downsampled (Sampling Ratio 0.7): 0.9277864992150706
F1 Score - Downsampled (Sampling Ratio 0.7): 0.8951154865581219

