In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import time
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn import svm
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler, SMOTE
from scipy.stats import uniform
import skops.io as sio
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Load the MIT-BIH train and test CSV files (the files carry no header row,
# so columns are addressed by integer position)
df_mitbih_train = pd.read_csv('../data/original/mitbih_train.csv', header=None)
df_mitbih_test = pd.read_csv('../data/original/mitbih_test.csv', header=None)

# Column 187 is used as the target; all remaining columns are the features
X_train, y_train = df_mitbih_train.drop(columns=187), df_mitbih_train[187]
X_test, y_test = df_mitbih_test.drop(columns=187), df_mitbih_test[187]

In [None]:
# DATA AUGMENTATION: SMOTE
#
# Candidate models and their search spaces. To run the same procedure with a
# different model, swap the 'clf' step of the pipeline and `parameters` below:
#
# linear_model.LogisticRegression(), {'clf__C': [0.01, 0.1, 1, 10, 100, 1000]}
# neighbors.KNeighborsClassifier(), {'clf__n_neighbors':range(2, 31), 'clf__metric': ['minkowski', 'manhattan']}
# svm.SVC(), {'clf__C':[0.1,1,10], 'clf__kernel':['rbf','linear', 'poly'], 'clf__gamma':[0.001, 0.1, 0.5]}
# MLPClassifier(max_iter=500, early_stopping=True), {'clf__hidden_layer_sizes': [(50,), (100,), (100,50)],'clf__activation': ['relu','tanh'],'clf__solver': ['adam','sgd'],'clf__alpha': [0.0001,0.001]}

# Wall-clock start of the whole run; the elapsed time is printed at the end of the cell
start = time.time()

# 1. Pipeline = oversampler + classifier.
#    Keeping SMOTE inside the pipeline prevents data leakage: during cross-validation
#    the oversampling is fitted on the training folds only, never on the validation fold.
pipeline = ImbPipeline(steps=[
    ('oversample', SMOTE(random_state=42)),
    ('clf', linear_model.LogisticRegression()),  # change for different model
])

# 2. Search space; the 'clf__' prefix routes each parameter to the 'clf' step
parameters = {'clf__C': [0.01, 0.1, 1, 10, 100, 1000]}  # change for different model

# 3. Cross-validation scheme: 5 stratified, shuffled folds
#    (one fold for validation, the remaining folds for training)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. Randomized hyperparameter search scored by macro-averaged F1
random_search = RandomizedSearchCV(
    pipeline,
    parameters,
    n_iter=6,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    return_train_score=True,  # keep the training-fold scores next to the validation scores
)

# 5. Run the search on the training data
random_search.fit(X_train, y_train)


# 6. Persist the cross-validation results of the random search as CSV
csv_results_cv_name = 'lin_cv_nsplits5_randomsearch_niter6_sm.csv'  # change name of csv file for different model
results = pd.DataFrame(random_search.cv_results_)
results.to_csv(csv_results_cv_name, index=False)
print('RandomSearch results saved as: ', csv_results_cv_name)

# Summary of the winning configuration; best_score_ is the mean validation F1 across the CV folds
print(f"Best parameters: {random_search.best_params_}")
print(f"Average F1 on validation data over all 5 CV folds for best parameters: {random_search.best_score_:.4f}")


# ------------------------
# 7. Application of best model on X_test and save results
# ------------------------
y_pred = random_search.best_estimator_.predict(X_test)

# Build the report once so the printed and the saved version are identical
# (previously the saved report used the default precision, the printed one 6 digits).
test_report = classification_report(y_test, y_pred, digits=6)
print('\nClassification Report for X_test/y_test:')
print(test_report)

txt_results_xtest_name = 'lin_cv_nsplits5_randomsearch_niter6_xtest_sm.txt' # change name of txt file for different model

# Read the classifier and the oversampler from the fitted pipeline instead of
# hand-typed labels: the file previously claimed "RandomOversampler" although the
# pipeline uses SMOTE, and the labels went stale whenever the model was swapped.
# Relies on the pipeline step names 'clf' and 'oversample'.
fitted_steps = random_search.best_estimator_.named_steps

with open(txt_results_xtest_name, 'w', encoding='utf-8') as file:
    file.write(f"Classifier: {fitted_steps['clf']!r}\n")
    file.write(f"Data augmentation: {type(fitted_steps['oversample']).__name__}\n")

    file.write("\nConfusion Matrix:\n")
    file.write(str(pd.crosstab(y_test, y_pred, colnames=['Predictions'])))

    file.write("\n\nClassification Report for X_test/y_test:\n")
    file.write(test_report)


# 8. Persist the best estimator found by the search in skops format
skops_best_model_name = 'lin_best_model_cv_nsplits5_randomsearch_niter6_sm.skops'  # change name of skops file for different model
best_model = random_search.best_estimator_
sio.dump(best_model, skops_best_model_name)
print('Best model saved as: ', skops_best_model_name)



# ------------------------
# 9. Plot learning curve (Average Training vs Validation F1 from cross-validation)
# ------------------------
# One point per sampled parameter combination, in the row order of `results`.
train_scores = results['mean_train_score'].values
validation_scores = results['mean_test_score'].values
params = [str(p) for p in results['params'].values]
param_indices = np.arange(len(train_scores))
png_name_learning_curve_train_val = 'lin_cv_nsplits5_randomsearch_niter6_learning_curve_train_val_sm.png' # change name of png file for different model

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(param_indices + 1, train_scores, marker='o', label='Train F1')
ax.plot(param_indices + 1, validation_scores, marker='x', label='Validation F1')

# Label every tick with the parameter combination it stands for; `params` was
# computed but never used before, leaving the x axis as anonymous indices.
ax.set_xticks(param_indices + 1)
ax.set_xticklabels(params, rotation=45, ha='right')

ax.set_xlabel("Parameter combination")
ax.set_ylabel("F1 Score")
ax.set_title("Learning curve: Average Training and Validation F1 during cross-validation")
ax.legend()
ax.grid(True)
fig.tight_layout()  # also makes room for the rotated tick labels
fig.savefig(png_name_learning_curve_train_val, dpi=300)
plt.show()


# Elapsed wall-clock time since `start` was taken at the top of the cell
end = time.time()
print(f"\nTotal time: {end - start:.2f} s")
