In [1]:
import pandas as pd
import numpy as np
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
import os
# Metrics
from sklearn.metrics import accuracy_score
# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
# Preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
def benchmark_model(train, test, model, class_col, dataset_name, i):
    """Fit ``model`` on standardized features and report train/test accuracy.

    Every column except ``class_col`` is treated as a feature. A
    ``StandardScaler`` is fitted on the training features only and then used
    to transform both splits. When ``i == 0`` the scaled splits are also
    written to ``split_data/<dataset_name>_train.csv`` and
    ``split_data/<dataset_name>_test.csv`` with ``class_col`` as first column.

    Args:
        train: DataFrame with the training rows, including ``class_col``.
        test: DataFrame with the test rows, including ``class_col``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        class_col: Name of the label column.
        dataset_name: Prefix of the CSV file names.
        i: Counter supplied by the caller; the CSVs are written only when it is 0.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    # Separate features from labels and standardize with train statistics only
    features_train = train.drop(columns=class_col)
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    labels_train = train[class_col].values

    features_test = test.drop(columns=class_col)
    scaled_test = scaler.transform(features_test)
    labels_test = test[class_col].values

    if i == 0:
        def _labelled_frame(labels, scaled, feature_names):
            # Label column first, followed by the scaled feature columns
            label_part = pd.DataFrame(labels, columns=[class_col])
            feature_part = pd.DataFrame(scaled, columns=feature_names)
            return pd.concat([label_part, feature_part], axis=1)

        df_train_scaled = _labelled_frame(labels_train, scaled_train, features_train.columns)
        df_test_scaled = _labelled_frame(labels_test, scaled_test, features_test.columns)

        out_dir = "split_data"
        os.makedirs(out_dir, exist_ok=True)
        df_train_scaled.to_csv(os.path.join(out_dir, f"{dataset_name}_train.csv"), index=False)
        df_test_scaled.to_csv(os.path.join(out_dir, f"{dataset_name}_test.csv"), index=False)

    model.fit(scaled_train, labels_train)

    # Score the held-out split first, then the data the model was fitted on
    test_accuracy = accuracy_score(labels_test, model.predict(scaled_test))
    train_accuracy = accuracy_score(labels_train, model.predict(scaled_train))
    return train_accuracy, test_accuracy

def stratified_split(df, class_col, test_size=0.3, random_state=42):
    """Split ``df`` into train and test frames one class at a time.

    The rows of each distinct ``class_col`` value are split independently
    with ``train_test_split`` (same ``test_size`` and ``random_state`` for
    every class). The per-class pieces are then concatenated, shuffled with
    ``sample(frac=1, random_state=random_state)`` and re-indexed from 0.

    Args:
        df: DataFrame containing ``class_col``.
        class_col: Name of the label column to split by.
        test_size: Passed through to ``train_test_split`` for every class.
        random_state: Seed used for both the per-class splits and the shuffles.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    # One (train, test) pair per class, in order of first appearance
    per_class_splits = [
        train_test_split(
            df[df[class_col] == label],
            test_size=test_size,
            random_state=random_state,
        )
        for label in df[class_col].unique()
    ]

    def _merge_and_shuffle(parts):
        merged = pd.concat(parts, axis=0)
        return merged.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_df = _merge_and_shuffle([pair[0] for pair in per_class_splits])
    test_df = _merge_and_shuffle([pair[1] for pair in per_class_splits])
    return train_df, test_df

def stratified_well_split(df, class_col, well_col, test_size=0.3, random_state=None):
    """Split ``df`` per class so that whole wells go to either train or test.

    Special case used for the LIVECell dataset. For every distinct
    ``class_col`` value, the unique ``well_col`` values of that class are
    split with ``train_test_split``; each row then follows its well. The
    split is done independently for each class. The per-class pieces are
    concatenated, shuffled with ``sample(frac=1, random_state=random_state)``
    and re-indexed from 0.

    Args:
        df: DataFrame containing ``class_col`` and ``well_col``.
        class_col: Name of the label column.
        well_col: Name of the column identifying the well of each row.
        test_size: Passed through to ``train_test_split`` (applied to wells).
        random_state: Seed for the well splits and the final shuffles.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    train_parts, test_parts = [], []

    for label in df[class_col].unique():
        rows_of_class = df[df[class_col] == label]

        # Partition the wells of this class, not its individual rows
        wells_train, wells_test = train_test_split(
            rows_of_class[well_col].unique(),
            test_size=test_size,
            random_state=random_state,
        )

        # Each row is assigned to the side its well landed on
        row_wells = rows_of_class[well_col]
        train_parts.append(rows_of_class[row_wells.isin(wells_train)])
        test_parts.append(rows_of_class[row_wells.isin(wells_test)])

    def _merge_and_shuffle(parts):
        merged = pd.concat(parts, axis=0)
        return merged.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_df = _merge_and_shuffle(train_parts)
    test_df = _merge_and_shuffle(test_parts)
    return train_df, test_df

def get_df_info(df, class_col):
    """Summarize the size of ``df`` and the number of distinct labels.

    Args:
        df: DataFrame to describe.
        class_col: Name of the label column.

    Returns:
        Dict with the keys ``"Number of Rows"``, ``"Number of Columns"`` and
        ``"Number of Unique Classes"`` (the latter from ``Series.nunique()``).
    """
    row_count, column_count = df.shape
    return {
        "Number of Rows": row_count,
        "Number of Columns": column_count,
        "Number of Unique Classes": df[class_col].nunique(),
    }

In [None]:
# Parallel lists: one entry per Excel-based dataset
dataset_names = ['Iris', 'Whitefish', 'JUMP_CP1', 'HumanActivity']
paths = [
    os.path.join("data", workbook)
    for workbook in (
        "IRIS_DATA.xlsx",
        "Fish_12classes.xlsx",
        "jump_data_benchmark.xlsx",
        "Human_activity.xlsx",
    )
]

# Column removed from each dataset before splitting, and its label column
drop_list = ['Dataset order', 'Primary ID', 'Primary ID', 'Primary ID']
class_cols = ['Species', '$ClassID', '$ClassID', '$ClassID']

# Classifiers evaluated on every dataset (also reused by the later cells)
models = [
    ('SVM', SVC(random_state=42, decision_function_shape='ovr')),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('MLP Classifier', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier())
]
model_names = [label for label, _ in models]

# Result tables: one row per model, one column per dataset
acc_score_train_df = pd.DataFrame(columns=dataset_names, index=model_names)
acc_score_test_df = pd.DataFrame(columns=dataset_names, index=model_names)

for dataset_name, path, drop, class_col in zip(dataset_names, paths, drop_list, class_cols):
    # sheet_name=None loads every sheet of the workbook; stack them into one frame
    xls = pd.read_excel(path, sheet_name=None, engine='openpyxl')
    df = pd.concat(xls.values(), ignore_index=True).drop(columns=drop)
    train, test = stratified_split(df, class_col, test_size=0.30, random_state=42)
    print(dataset_name)

    lst_train_acc, lst_test_acc = [], []
    # benchmark_model writes the scaled split to disk only when i == 0,
    # i.e. once per dataset
    i = 0
    for name, model in models:
        train_acc, test_acc = benchmark_model(
            train, test, model,
            class_col=class_col, dataset_name=dataset_name, i=i,
        )
        lst_train_acc.append(train_acc)
        lst_test_acc.append(test_acc)
        print(
            f"{name}",
            f"Training Accuracy: {train_acc:.4f}",
            f"Test Accuracy: {test_acc:.4f}",
            sep="\n",
        )
        i += 1

    acc_score_train_df[dataset_name] = lst_train_acc
    acc_score_test_df[dataset_name] = lst_test_acc

In [None]:
# Build the SalinasA train/test split from the ground-truth and data .mat files.
# (The former `df = pd.DataFrame()` here was never used and only clobbered the
# `df` left by the previous cell, so it has been removed.)
gt_mat_data = loadmat(os.path.join('data', 'SalinasA_gt.mat'))
mat_data = loadmat(os.path.join('data', 'SalinasA_corrected.mat'))

gt_data = gt_mat_data['salinasA_gt']
data = mat_data['salinasA_corrected']

# Keep one pixel per distinct band vector: np.unique over the flattened
# (pixels, bands) view returns the flat index of each first occurrence.
unique_pixels, indices = np.unique(data.reshape(-1, data.shape[2]), axis=0, return_index=True)

# Convert the flat indices back to (row, column) image coordinates
y, x = np.divmod(indices, data.shape[1])

# Boolean image masks selecting the pixels of each split
train_mask = np.zeros_like(data[:, :, 0], dtype=bool)
test_mask = np.zeros_like(data[:, :, 0], dtype=bool)

# Pixels strictly below the diagonal (column < row) go to the test set;
# everything else, including the diagonal itself, goes to the training set.
below_diagonal = x < y
test_mask[y[below_diagonal], x[below_diagonal]] = True
train_mask[y[~below_diagonal], x[~below_diagonal]] = True

# Stack the label as column 0 in front of the band values of each selected pixel
train_set = np.concatenate([np.expand_dims(gt_data[train_mask], 1), data[train_mask]], axis=1)
test_set = np.concatenate([np.expand_dims(gt_data[test_mask], 1), data[test_mask]], axis=1)
print(train_set.shape)

class_col = '$ClassID'
df_train = pd.DataFrame(train_set).rename(columns={0: class_col})
df_test = pd.DataFrame(test_set).rename(columns={0: class_col})

# Discard rows labelled 0 (presumably unlabelled/background pixels in the
# ground truth - TODO confirm) and report the size of each split.
df_test = df_test[df_test[class_col] != 0]
df_info = get_df_info(df_test, class_col)
print(df_info)
df_train = df_train[df_train[class_col] != 0]
df_info = get_df_info(df_train, class_col)
print(df_info)

In [None]:
# Benchmark every model on the SalinasA split built in the previous cell
class_col = '$ClassID'
dataset_name="SalinasA"
print(dataset_name)

# i counts the models evaluated so far; benchmark_model writes the
# train/test CSV files only for the first one (i == 0)
i = 0
lst_train_acc, lst_test_acc = [], []
for name, model in models:
    train_acc, test_acc = benchmark_model(
        df_train, df_test, model,
        class_col=class_col, dataset_name=dataset_name, i=i,
    )
    print(
        f"{name}",
        f"Training Accuracy: {train_acc:.4f}",
        f"Test Accuracy: {test_acc:.4f}",
        sep="\n",
    )
    i += 1
    lst_train_acc.append(train_acc)
    lst_test_acc.append(test_acc)

# Store the scores as a new column of the shared result tables
acc_score_test_df['Salinas_A'] = lst_test_acc
acc_score_train_df['Salinas_A'] = lst_train_acc



In [None]:
## Download from: https://www.kaggle.com/datasets/brunogrisci/breast-cancer-gene-expression-cumida
dataset_names = ['BreastCancer']
paths = [os.path.join("data", "Breast_GSE45827.csv")]
drop_list = ['samples']
class_cols = ['type']
lst_test_acc = []
lst_train_acc = []
for dataset_name, path, drop, class_col in zip(dataset_names, paths, drop_list, class_cols):
    df = pd.read_csv(path)
    df.drop(drop, axis=1, inplace=True)
    train, test = stratified_split(df, class_col, test_size=0.30, random_state=42)
    print(dataset_name)
    # Restart the counter for this dataset. benchmark_model only saves the
    # scaled train/test CSVs when i == 0; without this reset the cell inherited
    # the counter left by an earlier cell, so the BreastCancer split was never
    # written (and the cell failed with NameError on a fresh kernel).
    i = 0
    for name, model in models:
        train_acc, test_acc = benchmark_model(train, test, model, class_col=class_col, dataset_name=dataset_name, i=i)
        print(f"{name}")
        print(f"Training Accuracy: {train_acc:.4f}")
        print(f"Test Accuracy: {test_acc:.4f}")
        i += 1
        lst_train_acc.append(train_acc)
        lst_test_acc.append(test_acc)
# Store the scores as a new column of the shared result tables
acc_score_test_df['Breast_Cancer'] = lst_test_acc
acc_score_train_df['Breast_Cancer'] = lst_train_acc

In [None]:
dataset_names = ['LIVECell']
paths = [os.path.join("data", "livecell_medians.csv")]
# Columns removed from both splits AFTER splitting ('Well' and 'Timepoint' are
# still needed before that). Unlike the other cells this is one list for the
# single dataset, not one entry per dataset, so it must not be zipped below.
drop_list = ['Plate', 'Timepoint', 'Replicate_ID', 'Well', 'Unnamed: 0']
class_cols = ['Cell_type']
well_col = 'Well'
for dataset_name, path, class_col in zip(dataset_names, paths, class_cols):
    df = pd.read_csv(path)
    # Exclude the rows recorded at timepoint '00d00h00m'
    df = df[df['Timepoint'] != '00d00h00m']
    # Split by well so that all rows of a well stay on the same side
    train, test = stratified_well_split(df, class_col, well_col=well_col, test_size=0.30, random_state=42)
    train = train.drop(columns=drop_list)
    test = test.drop(columns=drop_list)
    print(dataset_name)
    # benchmark_model writes the scaled split to disk only when i == 0
    i = 0
    lst_train_acc = []
    lst_test_acc = []
    for name, model in models:
        train_acc, test_acc = benchmark_model(train, test, model, class_col=class_col, dataset_name=dataset_name, i=i)
        print(f"{name}")
        print(f"Training Accuracy: {train_acc:.4f}")
        print(f"Test Accuracy: {test_acc:.4f}")
        i += 1
        lst_train_acc.append(train_acc)
        lst_test_acc.append(test_acc)
# Store the scores as a new column of the shared result tables
acc_score_test_df['LIVECell'] = lst_test_acc
acc_score_train_df['LIVECell'] = lst_train_acc

In [None]:
def convert_to_percentage(val):
    """Scale a fraction to a percentage rounded to one decimal place.

    Returns the integer ``100`` when the rounded value equals ``100.0``;
    otherwise returns the rounded value unchanged.
    """
    scaled = round(val * 100, 1)
    if scaled == 100.0:
        return int(scaled)
    return scaled
# Convert every accuracy in the test-score table to a percentage for display.
# NOTE: this rebinds df_test, which earlier held the SalinasA test split, so
# the SalinasA benchmark cell cannot be re-run after this one without first
# rebuilding that split.
df_test = acc_score_test_df.copy()
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when available, the old one otherwise.
if hasattr(df_test, "map"):
    df_test = df_test.map(convert_to_percentage)
else:
    df_test = df_test.applymap(convert_to_percentage)
df_test