In [1]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GroupKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [6]:
def evaluate_with_scalers(model, df_data, cv_splits=5, random_seed=42):
    """
    Evaluate a given model using different scalers.

    Parameters:
    - model: The machine learning model to evaluate.
    - df_data: The dataframe containing the data.
    - cv_splits: Number of splits for GroupKFold cross-validation. Default is 20.
    - random_seed: Random seed for shuffling. Default is 42.

    Returns:
    - None. Prints out the evaluation metrics for each scaler.
    """
    
    features = df_data.columns[3:-1]
    label = df_data.columns[-1]

    scalers = {
        "StandardScaler": StandardScaler(),
        "MinMaxScaler": MinMaxScaler(),
        "RobustScaler": RobustScaler()
    }

    cv = GroupKFold(n_splits=cv_splits)

    for name, scaler in scalers.items():

        X_data = df_data[features].values
        y_data = df_data[label].values
        groups = df_data["image_id"].values

        X_scaled = scaler.fit_transform(X_data)

        X_scaled, y_data, groups = shuffle(X_scaled, y_data, groups, random_state=random_seed)

        scores = cross_validate(model, X_scaled, y_data, groups=groups, cv=cv, 
                                scoring=['precision', 'recall', 'f1', 'accuracy'], n_jobs=-1)

        avg_precision = np.mean(scores['test_precision'])
        avg_recall = np.mean(scores['test_recall'])
        avg_fscore = np.mean(scores['test_f1'])
        avg_accuracy = np.mean(scores['test_accuracy'])

        print(f"Using {name}")
        print(f"Average Precision: {avg_precision:.4f}")
        print(f"Average Recall: {avg_recall:.4f}")
        print(f"Average F1-Score: {avg_fscore:.4f}")
        print(f"Average Accuracy: {avg_accuracy:.4f}")
        print("----------------------------")

In [3]:
df_data = pd.read_pickle('df_data.pkl')

In [4]:
feature_columns = list(range(0, 438))
duplicates = df_data.duplicated(subset=feature_columns, keep=False)
num_duplicates = duplicates.sum()
print(f'Number of duplicate rows: {num_duplicates}')

missing_count = df_data.isnull().sum().sum()
print(f'Number of missing values: {missing_count}')

Number of duplicate rows: 46
Number of missing values: 0


In [5]:
df_data = df_data[~duplicates]
duplicates = df_data.duplicated(subset=feature_columns, keep=False)
num_duplicates = duplicates.sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [7]:
lr = LogisticRegression(solver='newton-cholesky', max_iter=200, n_jobs=-1)
evaluate_with_scalers(lr, df_data)

Using StandardScaler
Average Precision: 0.8120
Average Recall: 0.9056
Average F1-Score: 0.8560
Average Accuracy: 0.8917
----------------------------
Using MinMaxScaler
Average Precision: 0.8121
Average Recall: 0.8889
Average F1-Score: 0.8484
Average Accuracy: 0.8869
----------------------------
Using RobustScaler
Average Precision: 0.8123
Average Recall: 0.9053
Average F1-Score: 0.8560
Average Accuracy: 0.8918
----------------------------


In [7]:
rf = RandomForestClassifier(n_jobs=-1)
evaluate_with_scalers(rf, df_data)

Using StandardScaler
Average Precision: 0.5260
Average Recall: 0.9685
Average F1-Score: 0.6625
Average Accuracy: 0.6255
----------------------------
Using MinMaxScaler
Average Precision: 0.5215
Average Recall: 0.9689
Average F1-Score: 0.6597
Average Accuracy: 0.6223
----------------------------
Using RobustScaler
Average Precision: 0.5218
Average Recall: 0.9680
Average F1-Score: 0.6594
Average Accuracy: 0.6212
----------------------------


In [11]:
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
evaluate_with_scalers(knn, df_data)

Using StandardScaler
Average Precision: 0.5680
Average Recall: 0.8559
Average F1-Score: 0.6658
Average Accuracy: 0.6933
----------------------------
Using MinMaxScaler
Average Precision: 0.5779
Average Recall: 0.8787
Average F1-Score: 0.6778
Average Accuracy: 0.6979
----------------------------
Using RobustScaler
Average Precision: 0.5524
Average Recall: 0.8497
Average F1-Score: 0.6528
Average Accuracy: 0.6778
----------------------------
