In [45]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

# Mount Google Drive into the Colab filesystem at /content/drive.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# Project folder on the mounted Google Drive that holds the input CSV files.
data_path = '/content/drive/My Drive/266 Assignments/266 Final Project'

# CSV files in the project folder. Match on the extension rather than on a
# substring so that names such as 'notes.csv.bak' are not picked up.
files = [x for x in os.listdir(data_path) if x.endswith('.csv')]

# Load the cleaned dataset, failing with an explicit message if it is missing
# instead of the opaque "... is not in list" error raised by list.index().
cleaned_file = 'cleaned_data.csv'
if cleaned_file not in files:
    raise FileNotFoundError(f"{cleaned_file!r} not found in {data_path!r}")

filt_df = pd.read_csv(os.path.join(data_path, cleaned_file))

In [47]:
# Resample the data to balance it exactly 50-50 HC-PD.

# Separate the classes by diagnosis label.
class_1 = filt_df[filt_df['diagnosis'] == 1]
class_0 = filt_df[filt_df['diagnosis'] == 0]

# Downsample whichever class is larger to the size of the smaller one.
# Sampling without replacement cannot return more rows than it is given, so
# the direction must follow the actual class sizes rather than assume that
# class 0 is always the majority.
if len(class_0) >= len(class_1):
    class_0_down = resample(class_0, replace=False, n_samples=len(class_1), random_state=35)
    balanced_data = pd.concat([class_1, class_0_down])
else:
    class_1_down = resample(class_1, replace=False, n_samples=len(class_0), random_state=35)
    balanced_data = pd.concat([class_1_down, class_0])

# Display the size of the balanced dataset.
len(balanced_data)

2940

In [48]:
# Hold out 30% of the balanced data, then halve that hold-out into validation
# and test sets (roughly 70/15/15 overall).
# NOTE: no `stratify` argument is passed, so the class proportions inside each
# split are not forced to match the 50-50 balance of the full dataset.
X = balanced_data['key_sequence']
y = balanced_data['diagnosis']

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=35
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=35
)

# Convert every pandas Series into a plain Python list.
X_train, X_val, X_test = (list(split) for split in (X_train, X_val, X_test))
y_train, y_val, y_test = (split.to_list() for split in (y_train, y_val, y_test))

In [49]:
# Baseline model: always predicts the constant label 0, whatever the input.
baseline_model = DummyClassifier(strategy='constant', constant=0)

# Fit the model to the training data
baseline_model.fit(X_train, y_train)

# Hard predictions on the test set
y_pred = baseline_model.predict(X_test)

# Predicted probabilities; column 1 is taken as the score for label 1
# (assumes the training labels are exactly {0, 1}).
y_scores = baseline_model.predict_proba(X_test)[:, 1]

# Evaluate the model.
# The baseline never predicts label 1, so precision (and therefore F1) is 0/0.
# zero_division=0 reports these as 0.0 explicitly, which is the same value the
# default produces, but without emitting an undefined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_scores)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-Score: ", f1)
print("AUC: ", auc)

Accuracy:  0.5306122448979592
Precision:  0.0
Recall:  0.0
F1-Score:  0.0
AUC:  0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
import csv
from datetime import datetime

# Label stored under the "model name" key of the metrics row built below.
name = "baseline"

def write_metrics_to_csv(filename, metrics):
    """Append one row of metrics to a CSV file, creating the file if needed.

    Args:
        filename: Path of the CSV file to append to.
        metrics: Mapping of column name to value. Its keys define the column
            order and, when the file is still empty, the header row.

    Returns:
        None. I/O failures are reported on stdout instead of being raised, so
        a failed write does not abort the rest of the notebook.
    """
    try:
        with open(filename, 'a', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=list(metrics))
            # In append mode the stream starts at the end of the file, so a
            # position of 0 means the file is new or empty and needs a header.
            if file.tell() == 0:
                writer.writeheader()

            writer.writerow(metrics)
    except OSError as err:
        # Keep the best-effort behaviour, but say what actually went wrong.
        print(f"Error writing to file: {err}")

# Time at which this run's metrics are recorded.
timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

# One row for the metrics log: run identification first, then the scores.
metrics = dict(zip(
    ("model name", "timestamp", "accuracy", "precision", "recall", "f1", "auc"),
    (name, timestamp, accuracy, precision, recall, f1, auc),
))

# Append the row to the metrics CSV kept in the project folder.
write_metrics_to_csv(os.path.join(data_path, "metrics.csv"), metrics)

# why the accuracy is not exactly 0.5:


The accuracy of a classifier is the ratio of correctly predicted samples to the total number of samples. `DummyClassifier(strategy='constant', constant=0)` predicts class 0 for every sample, so its accuracy on the test set is exactly the fraction of test samples whose true label is 0. Because the dataset was balanced to a 50-50 split between the two classes before splitting, you would expect that fraction, and hence the accuracy, to be 0.5.

However, an accuracy of exactly 0.5 is not guaranteed, because only the full dataset is balanced, not each individual split. `train_test_split` shuffles the data and, since no `stratify` argument is passed, assigns samples to the training, validation, and test sets without regard to their class. The `DummyClassifier` itself involves no randomness and no tie-breaking: it always predicts 0. Here the test set happens to contain slightly more class-0 than class-1 samples (234 of 441), which gives the reported accuracy of 234/441 ≈ 0.531.

Additionally, the smaller a split is, the larger this effect becomes. With only a few samples, the class proportions in a random subset can differ noticeably from the 50% in the full dataset, purely because of random sampling.

To confirm this behavior and better understand why you're observing an accuracy of 0.531 instead of the expected 0.5, you can compare the class counts in `y_test`, or perform multiple runs of the experiment, changing the random seed or using cross-validation to get a more robust estimate of the accuracy. This can help you see the variance in the accuracy scores and confirm that the classifier is indeed predicting the constant label '0' as expected, with the deviation from 0.5 coming only from random fluctuations in the dataset splits. If the splits should preserve the 50-50 balance, pass `stratify=y` (and `stratify=y_temp` for the second split) to `train_test_split`; each split is then kept as close to 50-50 as its sample count allows.