# Index

1. [Load the data](#1)
1. [Preprocessing the data](#2)
1. [GridSearch for different models](#6)
    1. [Logistic Regression](#8)
    1. [KNN](#9)
    1. [Decision Tree](#10)
    1. [Random Forest](#11)
    1. [MLP](#11)
1. [Comparison of the results](#6)
1. [Test confusion matrix for the best model](#6)

<a id="1"></a>
# 1. Load the data

In [None]:
import pandas as pd

# Path of the raw dataset, resolved against the current working directory.
file_path = 'dataset1.csv'

# Read the CSV into `data`, the raw and unmodified table.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows of the raw table.
data.head()

In [None]:
# Drop the 'Unnamed: 0' column; `data_cleaned` is the working table from here on.
data_cleaned = data.drop('Unnamed: 0', axis=1)

# Show the remaining column names to check that they all make sense.
column_names = data_cleaned.columns
column_names

In [None]:
# Count duplicates on the cleaned frame rather than the raw one: the raw frame
# still carries the 'Unnamed: 0' column, and if that column is a per-row index
# (it looks like a saved index -- confirm) no two raw rows can ever compare
# equal, so real duplicates would always be reported as 0.
print(f"Number of duplicate rows: {data_cleaned.duplicated().sum()}")

In [None]:
# Per-column count of missing entries in the cleaned table.
missing_values = data_cleaned.isna().sum()

missing_values

In [None]:
# Descriptive statistics of the cleaned table, with the default quartiles
# spelled out explicitly.
summary_statistics = data_cleaned.describe(percentiles=[0.25, 0.5, 0.75])
summary_statistics

In [None]:
# Collect the basic structural facts about the cleaned table in one dict:
# its dimensions, the column names and the dtype of each column.
n_rows, n_columns = data_cleaned.shape
data_cleaned_info = {
    "Number of Rows": n_rows,
    "Number of Columns": n_columns,
    "Column Names": data_cleaned.columns.tolist(),
    "data_cleaned Types": data_cleaned.dtypes.to_dict(),
}

data_cleaned_info

In [None]:
# Number of rows per class label in the 'Target' column.
target_column = data_cleaned['Target']
target_distribution = target_column.value_counts()
target_distribution

# 2. Preprocessing the data

In [None]:
# Split data into features (X) and target variable (y)
X = data_cleaned.drop(columns=['Target'])
y = data_cleaned['Target']

# Encode target variable: 'R' is the positive class (1), 'NR' the negative (0).
target_encoding = {'R': 1, 'NR': 0}
y = y.map(target_encoding)

# Series.map turns every value that is not a key of the mapping (including
# missing values) into NaN. Stop here with an explicit message instead of
# letting NaN labels surface later as an obscure error during model fitting.
unmapped = y.isna()
if unmapped.any():
    unexpected_labels = data_cleaned.loc[unmapped, 'Target'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Target' column (expected 'R' or 'NR'): {unexpected_labels}"
    )

In [None]:
from sklearn.model_selection import train_test_split

# Split into training (70%) and test (30%) sets.
# stratify=y keeps the R/NR proportion the same in both splits; without it a
# random split of a small, imbalanced dataset can leave one split with very
# few samples of the minority class, which distorts every metric below.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training data size: {X_train.shape}")
print(f"Test data size: {X_test.shape}")

<a id="2"></a>
# 3. GridSearch for the different models

<a id="8"></a>
## 3.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.metrics import accuracy_score

# Define Logistic Regression model.
# class_weight='balanced' weights the classes inversely to their frequency.
# random_state pins the data shuffling done by the 'sag' and 'liblinear'
# solvers, so the search result is the same on every run.
# NOTE(review): 'sag' only converges quickly on features of similar scale;
# no scaling step is visible in this notebook -- confirm the features are
# already standardized.
model = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=42)

# Define parameter grid for Logistic Regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 10],      # Inverse regularization strength
    'solver': ['sag', 'liblinear']    # Optimization algorithm
}

# Leave-One-Out Cross-Validation (LOOCV): one fit per training sample
loo = LeaveOneOut()

# Perform Grid Search using LOOCV.
# n_jobs=-1 spreads the many LOOCV fits over all CPU cores, the same setting
# the MLP search in this notebook already uses; it does not change the scores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=loo,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters, the best LOOCV score and the model refitted on the
# whole training set with those parameters
best_params_lr = grid_search.best_params_
best_score_lr = grid_search.best_score_
best_model_lr = grid_search.best_estimator_

print(f"Best Parameters: {best_params_lr}")
print(f"Best accuracy score: {best_score_lr}")


In [None]:
import matplotlib.pyplot as plt

# Predict probabilities and keep column 1, the probability of the positive class
class_probabilities = best_model_lr.predict_proba(X_train)
y_train_prob = class_probabilities[:, 1]

# One point per training sample: x is the sample position, y the predicted
# probability, and the colour encodes the true label.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(range(len(y_train)), y_train_prob, c=y_train, cmap='coolwarm', marker='o')
ax.set_title('Logistic Regression Predicted Probabilities vs True Labels')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Predicted Probability')
fig.colorbar(points, ax=ax, label='True Labels')
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Fit the tuned logistic regression on the whole training split and predict
# that same split to obtain the training-set performance.
y_train_pred_lr = best_model_lr.fit(X_train, y_train).predict(X_train)
train_accuracy_lr = accuracy_score(y_true=y_train, y_pred=y_train_pred_lr)

print("Accuracy:", train_accuracy_lr)
print(classification_report(y_true=y_train, y_pred=y_train_pred_lr))

In [None]:
from sklearn.metrics import classification_report

# Score the tuned logistic regression on the held-out test split.
y_test_pred_lr = best_model_lr.predict(X_test)
test_accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_test_pred_lr)

print("Accuracy:", test_accuracy_lr)
print(classification_report(y_true=y_test, y_pred=y_test_pred_lr))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the logistic regression on the training split.
# Tick labels follow the target encoding used above (0 = 'NR', 1 = 'R').
class_labels = ['NR', 'R']
train_cm_lr = confusion_matrix(y_true=y_train, y_pred=y_train_pred_lr)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train_cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix for Training Set')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

<a id="9"></a>
## 3.2 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut, GridSearchCV

# Create the KNN model
# NOTE(review): KNN is distance based, so features on different scales
# dominate the neighbour search; no scaling step is visible in this notebook
# -- confirm the features are already on comparable scales.
model = KNeighborsClassifier()

# Defining the hyperparameter search space
param_grid = {
    'n_neighbors': range(2, 11),            # k from 2 to 10
    'weights': ['uniform', 'distance'],     # Equal votes vs. votes weighted by 1/distance
    # 'minkowski' is not searched: with the default p=2 it is exactly the
    # Euclidean distance, so it only repeated every 'euclidean' LOOCV fit.
    'metric': ['euclidean', 'manhattan']
}

# Leave-One-Out Cross-Validation: one fit per training sample
loo = LeaveOneOut()

# n_jobs=-1 spreads the LOOCV fits over all CPU cores, the same setting the
# MLP search in this notebook already uses; it does not change the scores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=loo,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters, best LOOCV score and the model refitted with them
best_params_knn = grid_search.best_params_
best_score_knn = grid_search.best_score_
best_model_knn = grid_search.best_estimator_

print(f"Best parameters: {best_params_knn}")
print(f"Best accuracy score: {best_score_knn:.4f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Fit the tuned KNN on the whole training split and predict that same split
# to obtain the training-set performance.
y_train_pred_knn = best_model_knn.fit(X_train, y_train).predict(X_train)
train_accuracy_knn = accuracy_score(y_true=y_train, y_pred=y_train_pred_knn)

print("Accuracy:", train_accuracy_knn)
print(classification_report(y_true=y_train, y_pred=y_train_pred_knn))

In [None]:
from sklearn.metrics import classification_report

# Score the tuned KNN on the held-out test split.
y_test_pred_knn = best_model_knn.predict(X_test)
test_accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_test_pred_knn)

print("Accuracy:", test_accuracy_knn)
print(classification_report(y_true=y_test, y_pred=y_test_pred_knn))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the KNN model on the training split.
# Tick labels follow the target encoding used above (0 = 'NR', 1 = 'R').
class_labels = ['NR', 'R']
train_cm_knn = confusion_matrix(y_true=y_train, y_pred=y_train_pred_knn)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train_cm_knn, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix for Training Set')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

<a id="10"></a>
## 3.3 Decision Tree

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.tree import plot_tree

# Create the decision tree model.
# random_state makes the search reproducible: the 'random' splitter (and the
# feature permutation done at every split) otherwise changes from run to run.
model = DecisionTreeClassifier(random_state=42)

# Defining the hyperparameter search space
param_grid = {
    # Split quality measure. 'log_loss' is not searched: in scikit-learn it is
    # the same Shannon information gain as 'entropy', so it only repeated
    # every 'entropy' LOOCV fit.
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],         # Strategy for splitting nodes
    'max_depth': [5, 10, 20],               # Maximum tree depth
    'min_samples_split': [4, 7, 10],        # Minimum samples needed to split a node
    'min_samples_leaf': [4, 7, 10],         # Minimum samples needed in a leaf
}

# Leave-One-Out Cross-Validation: one fit per training sample
loo = LeaveOneOut()

# n_jobs=-1 spreads the LOOCV fits over all CPU cores, the same setting the
# MLP search in this notebook already uses; it does not change the scores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=loo,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters, best LOOCV score and the tree refitted with them
best_params_dt = grid_search.best_params_
best_score_dt = grid_search.best_score_
best_model_dt = grid_search.best_estimator_

print(f"Best parameters: {best_params_dt}")
print(f"Best accuracy score (LOOCV): {best_score_dt:.4f}")


# Draw the selected tree. plot_tree expects the class names in ascending
# class order; classes_ is already sorted, whereas iterating a set gives no
# ordering guarantee.
plt.figure(figsize=(20, 10))
plot_tree(best_model_dt,
          filled=True,
          feature_names=X_train.columns.tolist(),
          class_names=[str(cls) for cls in best_model_dt.classes_])
plt.title("Decision Tree")
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Fit the tuned decision tree on the whole training split and predict that
# same split to obtain the training-set performance.
y_train_pred_dt = best_model_dt.fit(X_train, y_train).predict(X_train)
train_accuracy_dt = accuracy_score(y_true=y_train, y_pred=y_train_pred_dt)

print("Accuracy:", train_accuracy_dt)
print(classification_report(y_true=y_train, y_pred=y_train_pred_dt))

In [None]:
from sklearn.metrics import classification_report

# Score the tuned decision tree on the held-out test split.
y_test_pred_dt = best_model_dt.predict(X_test)
test_accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_test_pred_dt)

print("Accuracy:", test_accuracy_dt)
print(classification_report(y_true=y_test, y_pred=y_test_pred_dt))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the decision tree on the training split.
# Tick labels follow the target encoding used above (0 = 'NR', 1 = 'R').
class_labels = ['NR', 'R']
train_cm_dt = confusion_matrix(y_true=y_train, y_pred=y_train_pred_dt)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train_cm_dt, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix for Training Set')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

<a id="11"></a>
## 3.4 Random Forest

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Create the RandomForest model.
# random_state fixes the bootstrap samples and the per-split feature sampling;
# without it both the LOOCV scores and the selected forest change on every
# run, so the reported "best" parameters are not reproducible.
model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15],  # Number of trees in the forest
    'max_depth': [3, 5, 7],  # Maximum depth of the tree
    'min_samples_split': [2, 4, 6],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 4, 6],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for the best split
    'bootstrap': [True]  # Each tree is trained on a bootstrap sample
}

# Leave-One-Out Cross-Validation: one fit per training sample
loo = LeaveOneOut()

# n_jobs=-1 spreads the LOOCV fits over all CPU cores, the same setting the
# MLP search in this notebook already uses; it does not change the scores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=loo,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters, best LOOCV score and the forest refitted with them
best_params_rf = grid_search.best_params_
best_score_rf = grid_search.best_score_
best_model_rf = grid_search.best_estimator_

print(f"Best parameters: {best_params_rf}")
print(f"Best accuracy score: {best_score_rf:.4f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Fit the tuned random forest on the whole training split and predict that
# same split to obtain the training-set performance.
y_train_pred_rf = best_model_rf.fit(X_train, y_train).predict(X_train)
train_accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_train_pred_rf)

print("Accuracy:", train_accuracy_rf)
print(classification_report(y_true=y_train, y_pred=y_train_pred_rf))

In [None]:
from sklearn.metrics import classification_report

# Score the tuned random forest on the held-out test split.
y_test_pred_rf = best_model_rf.predict(X_test)
test_accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_test_pred_rf)

print("Accuracy:", test_accuracy_rf)
print(classification_report(y_true=y_test, y_pred=y_test_pred_rf))

<a id="9"></a>
## 3.5 MLP

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import LeaveOneOut, GridSearchCV

# Define the Multilayer Perceptron model first.
# random_state fixes the weight initialization and the mini-batch sampling;
# without it the LOOCV scores and the selected network change on every run.
mlp_model = MLPClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(5,), (10,), (15,), (20,)],  # One hidden layer of 5, 10, 15 or 20 units
    'activation': ['relu', 'tanh', 'logistic'],  # Activation functions
    'solver': ['adam'],  # Weight optimization
    # scikit-learn only uses `learning_rate` when solver='sgd'. With 'adam',
    # searching 'adaptive' as well trained every configuration twice without
    # changing the model, so only the default schedule is kept.
    'learning_rate': ['constant'],
    'alpha': [0.0001, 0.001],  # L2 regularization strength
}

# Leave-One-Out Cross-Validation: one fit per training sample
loo = LeaveOneOut()

# n_jobs=-1 runs the fits on all CPU cores; verbose=1 prints search progress
grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid, cv=loo,
                           scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters, best LOOCV score and the network refitted with them
best_params_mlp = grid_search.best_params_
best_score_mlp = grid_search.best_score_
best_model_mlp = grid_search.best_estimator_

print("Best Hyperparameters:", best_params_mlp)
print("Best Cross-validated Accuracy:", best_score_mlp)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Fit the tuned MLP on the whole training split and predict that same split
# to obtain the training-set performance.
y_train_pred_mlp = best_model_mlp.fit(X_train, y_train).predict(X_train)
train_accuracy_mlp = accuracy_score(y_true=y_train, y_pred=y_train_pred_mlp)

print("Accuracy:", train_accuracy_mlp)
print(classification_report(y_true=y_train, y_pred=y_train_pred_mlp))

In [None]:
from sklearn.metrics import classification_report

# Score the tuned MLP on the held-out test split.
y_test_pred_mlp = best_model_mlp.predict(X_test)
test_accuracy_mlp = accuracy_score(y_true=y_test, y_pred=y_test_pred_mlp)

print("Accuracy:", test_accuracy_mlp)
print(classification_report(y_true=y_test, y_pred=y_test_pred_mlp))

# 4. Comparison of the results

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
import warnings

# Define the LeaveOneOut cross-validation object
loo = LeaveOneOut()

# One entry per tuned model:
# (display name, fitted estimator, test predictions, best params, train accuracy)
models = [
    ('Logistic Regression', best_model_lr, y_test_pred_lr, best_params_lr, train_accuracy_lr),
    ('Decision Trees', best_model_dt, y_test_pred_dt, best_params_dt, train_accuracy_dt),
    ('KNN', best_model_knn, y_test_pred_knn, best_params_knn, train_accuracy_knn),
    ('Random Forest', best_model_rf, y_test_pred_rf, best_params_rf, train_accuracy_rf),
    ('MLP', best_model_mlp, y_test_pred_mlp, best_params_mlp, train_accuracy_mlp)
]

# List to store the results
results = []

# Warnings are silenced only while the models are re-fitted for the LOOCV
# score. The context manager restores the previous filters afterwards, so
# warnings raised by later cells are not hidden for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Iterate through the models and calculate the metrics
    for model_name, model, y_test_pred, best_params, train_accuracy in models:
        test_accuracy = accuracy_score(y_test, y_test_pred)
        f1 = f1_score(y_test, y_test_pred)  # F1 score of the positive class (label 1)
        cv_score = cross_val_score(model, X_train, y_train, cv=loo).mean()  # Mean LOOCV score on the training set
        validation_error = 1 - cv_score  # Validation error is the complement of CV score

        # Append results to the list
        results.append({
            'Model': model_name,
            'Best Parameters': best_params,
            'Train Accuracy': train_accuracy,
            'Test Accuracy': test_accuracy,
            'F1 Score': f1,
            'CV Score': cv_score,
            'Validation Error': validation_error
        })

# Convert the results to a pandas DataFrame for better readability
results_df = pd.DataFrame(results)

# Grouped bar plot: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(12, 6))

# (results_df column, horizontal offset inside the group, bar colour).
# The column name doubles as the legend label.
metric_bars = [
    ('Train Accuracy', 0.0, 'purple'),
    ('Test Accuracy', 0.2, 'salmon'),
    ('F1 Score', 0.4, 'skyblue'),
    ('CV Score', 0.6, 'orange'),
    ('Validation Error', 0.8, 'green'),
]
bar_width = 0.16
group_positions = range(len(results_df))  # Left edge position of each model's group

# Plotting the bars
for metric, offset, color in metric_bars:
    ax.bar([x + offset for x in group_positions], results_df[metric],
           width=bar_width, label=metric, color=color)

# Adding labels and title
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Comparison of Train, Test Accuracy, F1 Score, CV Score, and Validation Error for Each Model')
ax.set_xticks([x + 0.4 for x in group_positions])  # Set x-axis ticks at the center of the grouped bars
ax.set_xticklabels(results_df['Model'])

# Adding a legend
ax.legend()

# Show the plot
plt.tight_layout()
plt.show()

# Display the results in a DataFrame
results_df


# 5. Test confusion matrix for the best model

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the KNN model on the held-out test split.
# Tick labels follow the target encoding used above (0 = 'NR', 1 = 'R').
class_labels = ['NR', 'R']
test_cm_knn = confusion_matrix(y_true=y_test, y_pred=y_test_pred_knn)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(test_cm_knn, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix for Testing Set')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()