In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

In [2]:
df = pd.read_csv('../data/PPMI_sncRNAcounts/counts/ML.csv')
sig_transcripts = pd.read_csv('../data/PPMI_sncRNAcounts/counts/sig_transcripts_df.csv')
df = df[['PATNO', 'COHORT'] + sig_transcripts['transcript'].tolist()]
df = df[df['COHORT'] != 1]

In [3]:
X = df.drop(['PATNO', 'COHORT'], axis=1)
y = df['COHORT']

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
y_test = y_test.values.ravel()
y_train = y_train.values.ravel()

In [6]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier()
gb_clf = GradientBoostingClassifier()
rf_clf = RandomForestClassifier(class_weight='balanced')
knn_clf = KNeighborsClassifier()
svm_clf = LinearSVC(class_weight='balanced')

In [8]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

Results:
AdaBoostClassifier
Accuracy: 0.8086770981507824
Precision: 0.8124943764652212
Recall: 0.8086770981507824
F1: 0.8051230964284988

GradientBoostingClassifier
Accuracy: 0.8082503556187767
Precision: 0.8096300641243432
Recall: 0.8082503556187767
F1: 0.8073416393394875

RandomForestClassifier
Accuracy: 0.8086059743954481
Precision: 0.8119377742021946
Recall: 0.8086059743954481
F1: 0.8056815999379454

KNeighborsClassifier
Accuracy: 0.7874110953058321
Precision: 0.7901321448375225
Recall: 0.7874110953058321
F1: 0.784758327375466

LinearSVC
Accuracy: 0.8220483641536273
Precision: 0.8312638574638171
Recall: 0.8220483641536273
F1: 0.8225460527046824

Best classifier: LinearSVC




In [9]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/sig_prodromal_healthy_default_classifier_comparison.csv')

In [10]:
result_df

Unnamed: 0,accuracy,precision,recall,f1,roc auc
AdaBoostClassifier,0.808677,0.812494,0.808677,0.805123,0.8519
GradientBoostingClassifier,0.80825,0.80963,0.80825,0.807342,0.852862
RandomForestClassifier,0.808606,0.811938,0.808606,0.805682,0.884089
KNeighborsClassifier,0.787411,0.790132,0.787411,0.784758,0.822264
LinearSVC,0.822048,0.831264,0.822048,0.822546,0.865705


In [11]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier(n_estimators=200)
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=1)
params = {
    'class_weight': 'balanced',
    'n_estimators': 450,
    'max_depth': 134,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'bootstrap': False,
}
rf_clf = RandomForestClassifier(**params)
knn_clf = KNeighborsClassifier(n_neighbors=7)
svm_clf = LinearSVC(C=0.001, class_weight='balanced')

In [12]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

Results:
AdaBoostClassifier
Accuracy: 0.779018492176387
Precision: 0.784038924339604
Recall: 0.779018492176387
F1: 0.7759079644741329

GradientBoostingClassifier
Accuracy: 0.8058321479374111
Precision: 0.8102014944883434
Recall: 0.8058321479374111
F1: 0.8034584367954661

RandomForestClassifier
Accuracy: 0.808819345661451
Precision: 0.8177167220638077
Recall: 0.808819345661451
F1: 0.8093146709507751

KNeighborsClassifier
Accuracy: 0.7926742532005691
Precision: 0.794907271052008
Recall: 0.7926742532005691
F1: 0.7894887151564411

LinearSVC
Accuracy: 0.8061877667140825
Precision: 0.8121219349747415
Recall: 0.8061877667140825
F1: 0.806536281435901

Best classifier: RandomForestClassifier


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

In [13]:
result_df_2 = pd.DataFrame(results).T
result_df_2.to_csv('../results/sig_prodromal_healthy_tuned_classifier_comparison.csv')
result_df_2

Unnamed: 0,accuracy,precision,recall,f1,roc auc
AdaBoostClassifier,0.779018,0.784039,0.779018,0.775908,0.825366
GradientBoostingClassifier,0.805832,0.810201,0.805832,0.803458,0.8606
RandomForestClassifier,0.808819,0.817717,0.808819,0.809315,0.888484
KNeighborsClassifier,0.792674,0.794907,0.792674,0.789489,0.831296
LinearSVC,0.806188,0.812122,0.806188,0.806536,
