In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

In [3]:
df = pd.read_csv('../data/PPMI_sncRNAcounts/counts/ML.csv')
df = df[df['COHORT'] != 1]

In [4]:
X = df.drop(['PATNO', 'COHORT'], axis=1)
y = df['COHORT']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
y_test = y_test.values.ravel()
y_train = y_train.values.ravel()

In [13]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier()
gb_clf = GradientBoostingClassifier()
rf_clf = RandomForestClassifier(class_weight='balanced')
knn_clf = KNeighborsClassifier()
svm_clf = LinearSVC(class_weight='balanced')

In [8]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

Results:
AdaBoostClassifier
Accuracy: 0.8458036984352774
Precision: 0.8517529740132485
Recall: 0.8458036984352774
F1: 0.8447193092429351

GradientBoostingClassifier
Accuracy: 0.843172119487909
Precision: 0.8484974093697003
Recall: 0.843172119487909
F1: 0.842611571186433

RandomForestClassifier
Accuracy: 0.808534850640114
Precision: 0.8165832422322599
Recall: 0.808534850640114
F1: 0.8010557772798276

KNeighborsClassifier
Accuracy: 0.6810810810810811
Precision: 0.7295368427423943
Recall: 0.6810810810810811
F1: 0.6248847286202863

LinearSVC
Accuracy: 0.7660028449502134
Precision: 0.808962329076409
Recall: 0.7660028449502134
F1: 0.7671816614499541

Best classifier: AdaBoostClassifier


In [9]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/prodromal_healthy_default_classifier_comparison.csv')

In [10]:
result_df

Unnamed: 0,accuracy,precision,recall,f1,roc auc
AdaBoostClassifier,0.845804,0.851753,0.845804,0.844719,0.896554
GradientBoostingClassifier,0.843172,0.848497,0.843172,0.842612,0.925082
RandomForestClassifier,0.808535,0.816583,0.808535,0.801056,0.88722
KNeighborsClassifier,0.681081,0.729537,0.681081,0.624885,0.68947
LinearSVC,0.766003,0.808962,0.766003,0.767182,0.852112


In [14]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier(n_estimators=200)
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=1)
params = {
    'class_weight': 'balanced',
    'n_estimators': 450,
    'max_depth': 134,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'bootstrap': False,
}
rf_clf = RandomForestClassifier(**params)
knn_clf = KNeighborsClassifier(n_neighbors=7)
svm_clf = LinearSVC(C=0.001, class_weight='balanced')

In [15]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

Results:
AdaBoostClassifier
Accuracy: 0.854054054054054
Precision: 0.8568081086815775
Recall: 0.854054054054054
F1: 0.8534190022010322

GradientBoostingClassifier
Accuracy: 0.8591749644381224
Precision: 0.8607960122738356
Recall: 0.8591749644381224
F1: 0.8591875612082394

RandomForestClassifier
Accuracy: 0.8432432432432432
Precision: 0.8472134073587163
Recall: 0.8432432432432432
F1: 0.8420417483688833

KNeighborsClassifier
Accuracy: 0.680796586059744
Precision: 0.7163802089458735
Recall: 0.680796586059744
F1: 0.6179875621749155

LinearSVC
Accuracy: 0.8140825035561878
Precision: 0.8323417653648223
Recall: 0.8140825035561878
F1: 0.8150878342293618

Best classifier: GradientBoostingClassifier


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

In [16]:
result_df_2 = pd.DataFrame(results).T
result_df_2.to_csv('../results/prodromal_healthy_tuned_classifier_comparison.csv')
result_df_2

Unnamed: 0,accuracy,precision,recall,f1,roc auc
AdaBoostClassifier,0.854054,0.856808,0.854054,0.853419,0.929186
GradientBoostingClassifier,0.859175,0.860796,0.859175,0.859188,0.921856
RandomForestClassifier,0.843243,0.847213,0.843243,0.842042,0.906856
KNeighborsClassifier,0.680797,0.71638,0.680797,0.617988,0.752027
LinearSVC,0.814083,0.832342,0.814083,0.815088,
