In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc

# Set the global font to be DejaVu Sans, size 10 (or any other sans-serif font of your choice!)
rc('font',**{'family':'sans-serif','sans-serif':['DejaVu Sans'],'size':20})

# Set the font used for MathJax - more on this later
rc('mathtext',**{'default':'regular'})


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score,ConfusionMatrixDisplay, balanced_accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, RocCurveDisplay
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, LeaveOneOut, GridSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from numpy import sort

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE,BorderlineSMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler


In [2]:
# load data
DataControls = pd.read_csv('DataFromControls.csv')
DataParkinsons = pd.read_csv('DataFromParkinsons.csv')
DataRBD = pd.read_csv('DataFromRBD.csv')

In [3]:
# define data groups

selectedCols = ['MeanAmplitude','MeanSpeed', 'amplitudeDecay','velocityDecay']

X1 = DataControls[selectedCols].values.astype(float)
y1 = [0]*len(X1)

X2 = DataRBD[selectedCols].values.astype(float)
y2 = [1]*len(X2)


X3 = DataParkinsons[selectedCols].values.astype(float)
y3 = [1]*len(X3)


XCR = np.concatenate([X1, X2], axis=0)
yCR = np.array([0]*len(X1)+[1]*len(X2))

XCP = np.concatenate([X1,X3], axis=0)
yCP = np.array([0]*len(X1)+[1]*len(X3))


XRP = np.concatenate([X2, X3], axis=0)
yRP = np.array([0]*len(X2)+[1]*len(X3))

X = np.concatenate([X1, X2, X3], axis=0)
y = np.array(y1+y2+y3)


unique, counts = np.unique(y, return_counts=True)
print('original',dict(zip(unique, counts)))

original {np.int64(0): np.int64(48), np.int64(1): np.int64(39)}


We train different binary models to classify the groups. We use the following models:
- Logistic Regression
- Support Vector Machines
- Random Forest 
- Gradient Boosting


The results of this analysis might vary from trial to trial as the SMOTE algorithm adds randomness to the data. Repeat the analysis many times to obtain an average. 

In [34]:
# Controls vs. PD

# define pipeline
over = SMOTE()
under = RandomUnderSampler()
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
Xaug, yaug = pipeline.fit_resample(XCP, yCP)
unique, counts = np.unique(yaug, return_counts=True)
print('augmented',dict(zip(unique, counts)))   

#defining the cross-validation
cv = LeaveOneOut()


# Select a model, you can use any model you like. We present results with Random Forest

# model, scale = LogisticRegression(), True  ##need to scale data 
# model, scale = svm.SVC(), True  ##need to scale data 
model, scale = RandomForestClassifier(), False ##no need to scale data 
# model, scale =   XGBClassifier(objective= 'binary:logistic',seed=42,n_jobs=-1,nthread=1,early_stopping_rounds=None,eval_metric='logloss',use_label_encoder=False,verbosity=0), False

# Initialize list to store predictions
predictions = []
true_labels = []

# Perform LOO CV
for train_index, test_index in cv.split(Xaug):
    X_train, X_test = Xaug[train_index], Xaug[test_index]
    y_train, y_test = yaug[train_index], yaug[test_index]

     ## Scale the data
    if scale:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    
        # Fit the model
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    
    predictions.append(y_pred[0])
    true_labels.append(y_test[0])

# Calculate the accuracy
accuracy = balanced_accuracy_score(true_labels, predictions)
f1score = f1_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()
sensitivity = tp /(tp+fn)
specificity = tn / (tn+fp)
print(f"LOO CV Accuracy: {accuracy:.2f}")
print(f"LOO CV F1-score: {f1score:.2f}")
print(f"LOO CV recall: {recall:.2f}")
print(f"LOO CV precision: {precision:.2f}")
print(f"LOO CV sensitivity: {sensitivity:.2f}")
print(f"LOO CV specificity: {specificity :.2f}")
print( confusion_matrix(true_labels, predictions))


augmented {np.int64(0): np.int64(48), np.int64(1): np.int64(48)}
LOO CV Accuracy: 0.88
LOO CV F1-score: 0.88
LOO CV recall: 0.92
LOO CV precision: 0.85
LOO CV sensitivity: 0.92
LOO CV specificity: 0.83
[[40  8]
 [ 4 44]]


In [72]:
# Controls vs. RBD

# define pipeline
over = SMOTE()
under = RandomUnderSampler()
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
Xaug, yaug = pipeline.fit_resample(XCR, yCR)
unique, counts = np.unique(yaug, return_counts=True)
print('augmented',dict(zip(unique, counts)))   

#defining the cross-validation
cv = LeaveOneOut()


# Select a model, you can use any model you like. We present results with Random Forest

# model, scale = LogisticRegression(), True  ##need to scale data 
# model, scale = svm.SVC(), True  ##need to scale data 
model, scale = RandomForestClassifier(), False ##no need to scale data 
# model, scale =   XGBClassifier(objective= 'binary:logistic',seed=42,n_jobs=-1,nthread=1,early_stopping_rounds=None,eval_metric='logloss',use_label_encoder=False,verbosity=0), False

# Initialize list to store predictions
predictions = []
true_labels = []

# Perform LOO CV
for train_index, test_index in cv.split(Xaug):
    X_train, X_test = Xaug[train_index], Xaug[test_index]
    y_train, y_test = yaug[train_index], yaug[test_index]

    ## Scale the data or no
    if scale:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    
        # Fit the model
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    
    predictions.append(y_pred[0])
    true_labels.append(y_test[0])

# Calculate the accuracy
accuracy = balanced_accuracy_score(true_labels, predictions)
f1score = f1_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()
sensitivity = tp /(tp+fn)
specificity = tn / (tn+fp)
print(f"LOO CV Accuracy: {accuracy:.2f}")
print(f"LOO CV F1-score: {f1score:.2f}")
print(f"LOO CV recall: {recall:.2f}")
print(f"LOO CV precision: {precision:.2f}")
print(f"LOO CV sensitivity: {sensitivity:.2f}")
print(f"LOO CV specificity: {specificity :.2f}")
print( confusion_matrix(true_labels, predictions))


augmented {np.int64(0): np.int64(48), np.int64(1): np.int64(48)}
LOO CV Accuracy: 0.80
LOO CV F1-score: 0.81
LOO CV recall: 0.83
LOO CV precision: 0.78
LOO CV sensitivity: 0.83
LOO CV specificity: 0.77
[[37 11]
 [ 8 40]]


In [102]:
# RBD vs. PD

# define pipeline
over = SMOTE()
under = RandomUnderSampler()
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
Xaug, yaug = pipeline.fit_resample(XRP, yRP)
unique, counts = np.unique(yaug, return_counts=True)
print('augmented',dict(zip(unique, counts)))   

#defining the cross-validation
cv = LeaveOneOut()


# Select a model, you can use any model you like. We present results with Random Forest

# model, scale = LogisticRegression(), True  ##need to scale data 
# model, scale = svm.SVC(), True  ##need to scale data 
model, scale = RandomForestClassifier(), False ##no need to scale data 
# model, scale =   XGBClassifier(objective= 'binary:logistic',seed=42,n_jobs=-1,nthread=1,early_stopping_rounds=None,eval_metric='logloss',use_label_encoder=False,verbosity=0), False

# Initialize list to store predictions
predictions = []
true_labels = []

# Perform LOO CV
for train_index, test_index in cv.split(Xaug):
    X_train, X_test = Xaug[train_index], Xaug[test_index]
    y_train, y_test = yaug[train_index], yaug[test_index]

    ## Scale the data or no
    if scale:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    
        # Fit the model
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    
    predictions.append(y_pred[0])
    true_labels.append(y_test[0])

# Calculate the accuracy
accuracy = balanced_accuracy_score(true_labels, predictions)
f1score = f1_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()
sensitivity = tp /(tp+fn)
specificity = tn / (tn+fp)
print(f"LOO CV Accuracy: {accuracy:.2f}")
print(f"LOO CV F1-score: {f1score:.2f}")
print(f"LOO CV recall: {recall:.2f}")
print(f"LOO CV precision: {precision:.2f}")
print(f"LOO CV sensitivity: {sensitivity:.2f}")
print(f"LOO CV specificity: {specificity :.2f}")
print( confusion_matrix(true_labels, predictions))


augmented {np.int64(0): np.int64(21), np.int64(1): np.int64(21)}
LOO CV Accuracy: 0.74
LOO CV F1-score: 0.74
LOO CV recall: 0.76
LOO CV precision: 0.73
LOO CV sensitivity: 0.76
LOO CV specificity: 0.71
[[15  6]
 [ 5 16]]
