In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

In [2]:
df = pd.read_csv('../data/PPMI_sncRNAcounts/counts/ML.csv')
sig_transcripts = pd.read_csv('../data/PPMI_sncRNAcounts/counts/sig_transcripts_df_v2.csv')

In [3]:
df = df[['PATNO', 'COHORT'] + sig_transcripts['transcript'].tolist()]

In [13]:
X = df.drop(['PATNO', 'COHORT'], axis=1)
y = df['COHORT']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
y_test = y_test.values.ravel()
y_train = y_train.values.ravel()

In [15]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier()
gb_clf = GradientBoostingClassifier()
rf_clf = RandomForestClassifier(class_weight='balanced')
knn_clf = KNeighborsClassifier()
svm_clf = LinearSVC(class_weight='balanced')

In [17]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}

# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = me

Results:
AdaBoostClassifier
Accuracy: 0.5391176470588235
Precision: 0.5230703286013615
Recall: 0.5391176470588235
F1: 0.5266611438047731

GradientBoostingClassifier
Accuracy: 0.588893557422969
Precision: 0.5734191081578798
Recall: 0.588893557422969
F1: 0.556228177394468

RandomForestClassifier
Accuracy: 0.5934733893557422
Precision: 0.5410127526622349
Recall: 0.5934733893557422
F1: 0.5174006262637791

KNeighborsClassifier
Accuracy: 0.5319747899159665
Precision: 0.5140255642898351
Recall: 0.5319747899159665
F1: 0.5125894302914034

LinearSVC
Accuracy: 0.48457983193277315
Precision: 0.49728192979444225
Recall: 0.48457983193277315
F1: 0.4875607538371628

Best classifier: GradientBoostingClassifier


In [11]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/sig_default_classifier_comparison.csv')

In [20]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier(n_estimators=200)
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=1)
params = {
    'class_weight': 'balanced',
    'n_estimators': 450,
    'max_depth': 134,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'bootstrap': False,
}
rf_clf = RandomForestClassifier(**params)
knn_clf = KNeighborsClassifier(n_neighbors=7)
svm_clf = LinearSVC(C=0.001, class_weight='balanced')

In [21]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

Results:
AdaBoostClassifier
Accuracy: 0.5592138630600169
Precision: 0.5499969819013368
Recall: 0.5592138630600169
F1: 0.5442191032584646

GradientBoostingClassifier
Accuracy: 0.5994928148774303
Precision: 0.582670641922436
Recall: 0.5994928148774303
F1: 0.5677348958993366

RandomForestClassifier
Accuracy: 0.5663285432516201
Precision: 0.5682847810513522
Recall: 0.5663285432516203
F1: 0.564183911611513

KNeighborsClassifier
Accuracy: 0.534411101718794
Precision: 0.4944836748738278
Recall: 0.534411101718794
F1: 0.49841662873444487

LinearSVC
Accuracy: 0.5248732037193575
Precision: 0.5498785494449502
Recall: 0.5248732037193575
F1: 0.5233044277273813

Best classifier: GradientBoostingClassifier


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

In [22]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/sig_tuned_classifier_comparison.csv')