In [1]:
# imports 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Assuming you have your data in a Pandas DataFrame called 'data'
# where the first 4 columns are your features and the last column is the target variable.

# reg_input_allconns == every single connectivity outputted by DCM (64) + anxiety / depression scores 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_allconns_4node')

# Step 1: Import data and separate features (X) and target (y)
X = data.iloc[:, 1:-1]  # First 4 columns are features
y = data.iloc[:, -1]   # Last column is the target variable

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the logistic regression model
logreg = LogisticRegression(random_state = 42)
logreg.fit(X_train, y_train)

# Evaluate the model performance using k-fold cross-validation
num_folds = 10  # Change this number to modify the number of folds
cv_accuracy = cross_val_score(logreg, X_train, y_train, cv=num_folds)
average_accuracy = np.mean(cv_accuracy)

# Evaluate prediction accuracy 
y_pred = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test,y_pred)

print("Prediction Accuracy using all connectivities + MH data:", accuracy_logreg)
print(f"\nAccuracy using {num_folds}-fold Cross-Validation: {average_accuracy:.2f}")

Prediction Accuracy using all connectivities + MH data: 0.8571428571428571

Accuracy using 10-fold Cross-Validation: 0.94


In [3]:
## feature selection -- reducing number of features to top importance 

# Step 4: Get the coefficients (feature importances) of the model
feature_importances = np.abs(logreg.coef_[0])  # Take the absolute values to handle negative coefficients

# Step 5: Select the top features with the highest importance/amount of covariance explained 
num_top_features = 10 # Change this number to select a different number of top features
top_feature_indices = np.argsort(feature_importances)[::-1][:num_top_features]
top_features = X.columns[top_feature_indices]

# Print the selected top features and covariance explained by feature 
print("Selected Top Features and Importance Scores:")
for feature, importance in zip(top_features, feature_importances[top_feature_indices]):
    print(f"{feature}: {importance}")

Selected Top Features and Importance Scores:
depression: 0.7864334575815304
anxiety: 0.6726835747510992
Pul-FFA: 0.6432661090019542
Amy-FFA: 0.4192452453353923
mOFC-Amy: 0.3554604262920525
FFA-Pul: 0.34396275785121844
mOFC-mOFC: 0.2498117977323696
Amy-Pul: 0.23980172672521025
Pul-Pul: 0.10875531445540156
Pul-Amy: 0.10188201994018085


In [4]:
# Step 6: Retrain the logistic regression model using only the selected top features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

logreg_top_features = LogisticRegression()
logreg_top_features.fit(X_train_top, y_train)

# Evaluate the model performance using k-fold cross-validation
num_folds = 10  # Change this number to modify the number of folds
cv_accuracy = cross_val_score(logreg_top_features, X_train_top, y_train, cv=num_folds)
average_accuracy = np.mean(cv_accuracy)

# Evaluate prediction accuracy 
y_pred_top = logreg_top_features.predict(X_test_top)
accuracy_logreg = accuracy_score(y_test,y_pred_top)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_top_features.score(X_test_top, y_test)))
print(f"\nAccuracy using {num_folds}-fold Cross-Validation: {average_accuracy:.2f}")

Accuracy of logistic regression classifier on test set: 0.86

Accuracy using 10-fold Cross-Validation: 0.94


In [6]:
# testing model accuracy of just Mental Health data (anxiety / depression ratings )
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_mhonly')

X_train_mh, X_test_mh, y_train_mh, y_test_mh = train_test_split(X, y, test_size=0.2, random_state=42)

logreg_mh = LogisticRegression()
logreg_mh.fit(X_train_mh, y_train_mh)

# Evaluate the model performance using k-fold cross-validation
num_folds = 10  # Change this number to modify the number of folds
cv_accuracy = cross_val_score(logreg_mh, X_train_mh, y_train_mh, cv=num_folds)
average_accuracy_mh = np.mean(cv_accuracy)

# Evaluate prediction accuracy 
y_pred_mh = logreg_mh.predict(X_test_mh)
accuracy_logreg_mh = accuracy_score(y_test_mh,y_pred_mh)

## feature selection -- reducing number of features to top importance 

# Step 4: Get the coefficients (feature importances) of the model
feature_importances_mh = np.abs(logreg_mh.coef_[0])  # Take the absolute values to handle negative coefficients

# Step 5: Select the top features with the highest importance/amount of covariance explained 
num_top_features = 2 # Change this number to select a different number of top features
top_feature_indices_mh = np.argsort(feature_importances_mh)[::-1][:num_top_features]
top_features_mh = X.columns[top_feature_indices_mh]

# Print the selected top features and covariance explained by feature 
print("Covariance explained by each feature:")
for feature, importance in zip(top_features_mh, feature_importances_mh[top_feature_indices_mh]):
    print(f"{feature}: {importance}")

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_mh.score(X_test_mh, y_test_mh)))
print(f"\nAccuracy using {num_folds}-fold Cross-Validation: {average_accuracy_mh:.2f}")

Covariance explained by each feature:
depression: 0.7864334575815304
anxiety: 0.6726835747510992
Accuracy of logistic regression classifier on test set: 0.86

Accuracy using 10-fold Cross-Validation: 0.94


In [7]:
# testing model accuracy of mh data (anx/depression) + significantly different 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_limitedconns_4node')

X_train_mhconn2, X_test_mhconn2, y_train_mhconn2, y_test_mhconn2 = train_test_split(X, y, test_size=0.2, random_state=42)

logreg_mhconn2 = LogisticRegression()
logreg_mhconn2.fit(X_train_mhconn2, y_train_mhconn2)

# Evaluate the model performance using k-fold cross-validation
num_folds = 10  # Change this number to modify the number of folds
cv_accuracy = cross_val_score(logreg_mhconn2, X_train_mhconn2, y_train_mhconn2, cv=num_folds)
average_accuracy_mhconn2 = np.mean(cv_accuracy)

# Evaluate prediction accuracy 
y_pred_mhconn2 = logreg_mhconn2.predict(X_test_mhconn2)
accuracy_logreg_mhconn2 = accuracy_score(y_test_mhconn2,y_pred_mhconn2)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_mhconn2.score(X_test_mhconn2, y_test_mhconn2)))
print(f"\nAccuracy using {num_folds}-fold Cross-Validation: {average_accuracy_mhconn2:.2f}")

Accuracy of logistic regression classifier on test set: 0.86

Accuracy using 10-fold Cross-Validation: 0.94


# Method 2 -- recursive feature elimination w k-fold CV

In [8]:
# Recursive feature elimination with cross-validation 
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# reg_input_allconns == every single connectivity outputted by DCM (64) + anxiety / depression scores 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_allconns_4node')

feature_names = X.columns.values

# Step 1: Import data and separate features (X) and target (y)
X = data.iloc[:, 1:-1]  # All but first / last column are features
y = data.iloc[:, -1]   # Last column is the target variable

min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")
print(
    "Features selected by forward sequential selection: "
    f"{feature_names[rfecv.get_support()]}"
)


Optimal number of features: 2
Features selected by forward sequential selection: ['Pul-FFA' 'depression']


In [9]:
# Recursive feature elimination with cross-validation -- mh only  
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# reg_input_allconns == every single connectivity outputted by DCM (64) + anxiety / depression scores 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_mhonly')


# Step 1: Import data and separate features (X) and target (y)
X = data.iloc[:, 1:-1]  # All but first / last column are features
y = data.iloc[:, -1]   # Last column is the target variable
feature_names = X.columns.values


min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")
print(
    "Features selected by forward sequential selection: "
    f"{feature_names[rfecv.get_support()]}"
)

Optimal number of features: 1
Features selected by forward sequential selection: ['depression']


In [10]:
# Recursive feature elimination with cross-validation -- mh + significantly diff connectivies   
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# reg_input_allconns == every single connectivity outputted by DCM (64) + anxiety / depression scores 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_limitedconns_4node')

# Step 1: Import data and separate features (X) and target (y)
X = data.iloc[:, 1:-1]  # All but first / last column are features
y = data.iloc[:, -1]   # Last column is the target variable

feature_names = X.columns.values

min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")
print(
    "Features selected by forward sequential selection: "
    f"{feature_names[rfecv.get_support()]}"
)

Optimal number of features: 1
Features selected by forward sequential selection: ['depression']


# Method 3: sequential feature selection (both forward / backward)

In [11]:
## # sequential feature selector 

from sklearn.feature_selection import SequentialFeatureSelector
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeCV

# reg_input_allconns == every single connectivity outputted by DCM (64) + anxiety / depression scores 
data = pd.read_excel('data.xlsx',sheet_name = 'reg_input_allconns_4node')
X = data.iloc[:, 1:-1]  # 2nd - 2nd to end columns are features (ignoring subjectname + group)
y = data.iloc[:, -1]   # Last column is the target variable

feature_names = X.columns.values

logreg1 = LogisticRegression()
#logreg_ridge = RidgeCV(logreg1.fit(X,y))
# starts with no features and adds one by one 
tic_fwd = time()
sfs_forward = SequentialFeatureSelector(
    logreg1, n_features_to_select=2, direction="forward"
).fit(X, y)
toc_fwd = time()

# backwards -- starts w all features and slowly adds each one 
tic_bwd = time()
sfs_backward = SequentialFeatureSelector(
    logreg1, n_features_to_select=2, direction="backward"
).fit(X, y)
toc_bwd = time()

print(
    "Features selected by forward sequential selection: "
    f"{feature_names[sfs_forward.get_support()]}"
)
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print(
    "Features selected by backward sequential selection: "
    f"{feature_names[sfs_backward.get_support()]}"
)
print(f"Done in {toc_bwd - tic_bwd:.3f}s")


Features selected by forward sequential selection: ['Amy-Amy' 'depression']
Done in 0.443s
Features selected by backward sequential selection: ['Pul-Pul' 'depression']
Done in 2.449s
