In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

In [2]:
X = pd.read_csv('../data/PPMI_sncRNAcounts/counts/X.csv')
y = pd.read_csv('../data/PPMI_sncRNAcounts/counts/y.csv')

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
y_test = y_test.values.ravel()
y_train = y_train.values.ravel()

In [5]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier()
gb_clf = GradientBoostingClassifier()
rf_clf = RandomForestClassifier(class_weight='balanced')
knn_clf = KNeighborsClassifier()
svm_clf = LinearSVC(class_weight='balanced')

In [7]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs

Results:
AdaBoostClassifier
Accuracy: 0.5579551820728291
Precision: 0.5366836037128373
Recall: 0.5579551820728291
F1: 0.5383995500654042

GradientBoostingClassifier
Accuracy: 0.5983613445378151
Precision: 0.5763559085885872
Recall: 0.5983613445378151
F1: 0.5489807887982193

RandomForestClassifier
Accuracy: 0.5758403361344537
Precision: 0.5096146573589831
Recall: 0.5758403361344537
F1: 0.4791817773592452

KNeighborsClassifier
Accuracy: 0.5094817927170868
Precision: 0.4722114835182567
Recall: 0.5094817927170868
F1: 0.46581584095411566

LinearSVC
Accuracy: 0.33057422969187683
Precision: 0.45951247804248013
Recall: 0.33057422969187683
F1: 0.2479573971098044

Best classifier: GradientBoostingClassifier


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

In [8]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/default_classifier_comparison.csv')

In [9]:
# Initialize the classifiers
ada_clf = AdaBoostClassifier(n_estimators=200)
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=1)
params = {
    'class_weight': 'balanced',
    'n_estimators': 450,
    'max_depth': 134,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'bootstrap': False,
}
rf_clf = RandomForestClassifier(**params)
knn_clf = KNeighborsClassifier(n_neighbors=7)
svm_clf = LinearSVC(C=0.001, class_weight='balanced')

In [10]:
# Evaluate the classifiers using cross-validation
classifiers = [ada_clf, gb_clf, rf_clf, knn_clf, svm_clf]
scoring = {'accuracy': 'accuracy', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'f1': 'f1_weighted', 'roc auc': 'roc_auc_ovr_weighted'}
results = {}
for clf in classifiers:
    scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)
    results[clf.__class__.__name__] = {'accuracy': scores['test_accuracy'].mean(),
                                       'precision': scores['test_precision'].mean(),
                                       'recall': scores['test_recall'].mean(),
                                       'f1': scores['test_f1'].mean(),
                                       'roc auc': scores['test_roc auc'].mean()}
    
# Find the best classifier based on F1 score
best_clf = max(results, key=lambda k: results[k]['f1'])
print('Results:')
for clf_name, clf_results in results.items():
    print(clf_name)
    print('Accuracy:', clf_results['accuracy'])
    print('Precision:', clf_results['precision'])
    print('Recall:', clf_results['recall'])
    print('F1:', clf_results['f1'])
    print()
print('Best classifier:', best_clf)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew

Results:
AdaBoostClassifier
Accuracy: 0.5983446041138349
Precision: 0.5919771816285962
Recall: 0.5983446041138349
F1: 0.5504018570418608

GradientBoostingClassifier
Accuracy: 0.6054240631163709
Precision: 0.5619198668085102
Recall: 0.6054240631163709
F1: 0.5560309735652421

RandomForestClassifier
Accuracy: 0.5817413355874894
Precision: 0.5577300961198787
Recall: 0.5817413355874894
F1: 0.5573920972944484

KNeighborsClassifier
Accuracy: 0.5580515638207946
Precision: 0.5101277725373364
Recall: 0.5580515638207946
F1: 0.45720365196796064

LinearSVC
Accuracy: 0.3577979712595097
Precision: 0.5153823067253851
Recall: 0.3577979712595097
F1: 0.2793449777017998

Best classifier: RandomForestClassifier


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 76, in _cached_call
    return cache[method]
           ~~~~~^^^^^^^^
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/amp_pd/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 78, in _ca

In [11]:
result_df = pd.DataFrame(results).T
result_df.to_csv('../results/tuned_classifier_comparison.csv')