# Imports and configuration

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Resolve the project layout relative to the current working directory: the
# working directory is treated as the code directory and its parent as the
# project root.
code_dir = Path.cwd()
project_dir = code_dir.parent
input_dir = project_dir / "input"
output_dir = project_dir / "output/ml_prediction"
tmp_dir = project_dir / "tmp"

# output_dir sits two levels below the project root. Without parents=True the
# call raises FileNotFoundError whenever the intermediate "output" directory
# does not exist yet (e.g. on a fresh checkout).
output_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Read the two input tables from input_dir.
#   prediction_df: the first CSV column becomes the row index.
#   roi_df_ct:     read with the default integer index.
prediction_df = pd.read_csv(Path(input_dir, "prediction_df.csv"), index_col=0)
roi_df_ct = pd.read_csv(Path(input_dir, "ROI_catROIs_aparc_DK40_thickness.csv"))

In [10]:
# Group the columns of prediction_df into one list of column names per metric.
#
# The previous version of this cell filtered `roi_df.columns`, but no `roi_df`
# is created anywhere in this notebook, so the cell raised NameError on a fresh
# kernel. The lists are only ever used to index prediction_df, so the names are
# now taken from prediction_df itself.
# NOTE(review): if `roi_df` was a separate ROI table, confirm that matching on
# prediction_df.columns selects the same columns.
feature_columns = prediction_df.columns


def _columns_matching(columns, marker, exclude=None):
    """Return the names in `columns` that contain `marker` but not `exclude`.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names.
    marker : str
        Substring a column name must contain to be selected.
    exclude : str, optional
        Substring that disqualifies a column name. Used where one marker is a
        prefix of another (e.g. "_FA" vs "_FAt").

    Returns
    -------
    list of str
        Matching names, in the order they appear in `columns`.
    """
    return [
        col for col in columns
        if marker in col and (exclude is None or exclude not in col)
    ]


# Columns whose names also appear in the ROI thickness table (roi_df_ct).
ct_columns = feature_columns[feature_columns.isin(roi_df_ct.columns)].tolist()

# Remaining groups are identified by a substring of the column name.
fa_columns = _columns_matching(feature_columns, "_FA", exclude="_FAt")
fat_columns = _columns_matching(feature_columns, "_FAt")
md_columns = _columns_matching(feature_columns, "_MD", exclude="_MDt")
fw_columns = _columns_matching(feature_columns, "_FW")
fd_columns = _columns_matching(feature_columns, "_fd", exclude="_fdc")
fc_columns = _columns_matching(feature_columns, "_logfc")
fdc_columns = _columns_matching(feature_columns, "_fdc")
complexity_columns = _columns_matching(feature_columns, "_complexity")

In [11]:
# One list of column names per metric. Order matters: the modelling cells pair
# each entry with the metric label at the same position in X_metrics.
X_columns = [
    ct_columns, fa_columns, fat_columns,
    md_columns, fw_columns, fd_columns,
    fc_columns, fdc_columns, complexity_columns,
]

# Prediction

In [None]:
import os
# Set through the environment rather than warnings.filterwarnings; presumably so
# that the filter is also inherited by worker processes -- TODO confirm.
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pingouin as pg

# Metric labels, positionally aligned with X_columns. They must be defined
# before cv_df is created: the previous ordering referenced X_metrics one
# statement too early and raised NameError on a fresh kernel.
X_metrics = ["ct","fa","fat","md","fw","fd","fc","fdc","complexity"]
if len(X_metrics) != len(X_columns):
    raise ValueError(
        f"X_metrics has {len(X_metrics)} labels but X_columns has "
        f"{len(X_columns)} column groups; they are paired by position."
    )

cv_df = pd.DataFrame(columns=X_metrics)               # Dataframe to store CV scores
iterations = 100

y_metric = "cohort"
y = prediction_df[y_metric]

# Repeated nested cross-validation: each repetition `i` reshuffles the folds
# (random_state=i) and scores every metric's feature set separately.
for i in range(iterations):

    for metric, X_cols in zip(X_metrics, X_columns):

        # Define input data
        X = prediction_df[X_cols]

        # Declare the inner and outer cross-validation strategies
        inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        # Inner cross-validation for parameter search. Scaling is part of the
        # pipeline, so it is re-fitted on each training split.
        model = Pipeline(steps=[
            ("scaling", StandardScaler()),
            ("logreg", LogisticRegressionCV(cv=inner_cv, penalty="elasticnet", solver="saga", l1_ratios=[.1, .5, .7, .9, .95, .99, 1], n_jobs=16)),
        ])

        # Outer cross-validation to compute the testing score
        cv_results = cross_validate(model, X, y, cv=outer_cv, n_jobs=16, return_estimator=True)
        fold_scores = cv_results["test_score"]

        # Store the mean outer-fold score for this repetition and metric
        cv_df.loc[f"accuracy_mean_{i}", metric] = fold_scores.mean()

        # The stored score is an accuracy (see the row label); the message
        # previously called it "R2".
        print(f"{metric}_{i}: the accuracy using nested cross-validation is: " +
              f"{fold_scores.mean():.3f} +/- {fold_scores.std():.3f}")

# Persist the per-repetition scores. The file is space-separated; the plotting
# section reads it back with the same separator.
cv_df.to_csv(output_dir/"cv_df.csv", sep=" ")

# Null predictions

In [None]:
from numpy.random import permutation

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score
simplefilter("ignore", category=ConvergenceWarning)

permutations = 1000

# Resume from a previous (partial) run when a results file already exists.
permutation_path = output_dir / "permutation_df.csv"
if permutation_path.exists():
    permutation_df = pd.read_csv(permutation_path, index_col=0)
else:
    permutation_df = pd.DataFrame(columns=X_metrics)

# Progress is measured on the last metric, because it is the last one filled in
# for each permutation. The previous version read a `metric` variable that only
# existed if it had leaked from the loop of an earlier cell.
progress_metric = X_metrics[-1]
i = int(permutation_df[progress_metric].notna().sum())

y_metric = "cohort"
y = prediction_df[y_metric]

# `<` rather than `<=`: the loop must stop once `permutations` rows are
# complete; `<=` produced one permutation too many.
while i < permutations:

    # Shuffle the labels once per permutation so that every metric is scored
    # against the same permuted target.
    y_permute = permutation(y)

    for metric, X_cols in zip(X_metrics, X_columns):

        # Define input data
        X = prediction_df[X_cols]

        # Declare the inner and outer cross-validation strategies
        inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        # Inner cross-validation for parameter search
        model = Pipeline(steps=[
            ("scaling", StandardScaler()),
            ("logreg", LogisticRegressionCV(cv=inner_cv, penalty="elasticnet", solver="saga", l1_ratios=[.1, .5, .7, .9, .95, .99, 1], n_jobs=16)),
        ])

        # Outer cross-validation to compute the testing score
        test_score = cross_val_score(model, X, y_permute, cv=outer_cv, n_jobs=16)

        # A failed fit is reported as NaN (cross_val_score's default
        # error_score), never as None, so the old `== None` test could not
        # trigger. Skip the metric so the NaN is not stored as a score.
        if np.isnan(test_score.mean()):
            continue

        permutation_df.loc[f"accuracy_permutation_{i}", metric] = test_score.mean()

        # Report this permutation's own spread; the message used to print the
        # standard deviation of `cv_results` from the non-permuted run.
        print(f"{metric}_{i}: the accuracy using nested cross-validation is: " +
              f"{test_score.mean():.3f} +/- {test_score.std():.3f}")

    i = int(permutation_df[progress_metric].notna().sum())

    if i % 100 == 0:
        print(i)
        # Checkpoint so that an interrupted run can actually be resumed.
        permutation_df.to_csv(permutation_path)

permutation_df.to_csv(permutation_path)

# Plotting

In [15]:
# Reload the saved score tables so the plotting section can run without
# repeating the modelling cells. cv_df.csv is space-separated;
# permutation_df.csv uses the default comma separator.
#
# This cell used to run `<frame>.index.rename = "iteration"` after each read.
# That statement does not name the index: it overwrites the Index object's
# `rename` method with a string. It has been removed, and the index is left
# unnamed so that reset_index() keeps producing the default "index" column.
cv_df = pd.read_csv(output_dir/"cv_df.csv", sep=" ", index_col=0)
permutation_df = pd.read_csv(output_dir/"permutation_df.csv", index_col=0)

In [16]:
# Column-wise median of the stored CV scores: one value per column of cv_df,
# kept both as a Series and as a plain {column: median} dict.
cv_median = cv_df.median(axis=0)
cv_median_dict = cv_median.to_dict()

In [17]:
from scipy.stats import percentileofscore
from pingouin import multicomp
n_tests = len(X_metrics)

# Permutation p-value per metric: where the observed median CV score falls
# within that metric's null (label-permuted) score distribution.
permutation_pval_dict = dict()
for col in permutation_df.columns:

    # Look the observed score up by column label. The previous positional
    # lookup (cv_median[idx]) is deprecated for label-indexed Series and
    # silently assumed that both tables list their columns in the same order.
    observed = cv_median[col]

    # Drop missing null scores; a single NaN would turn the percentile into NaN.
    null_scores = permutation_df[col].dropna()
    percentile = percentileofscore(null_scores, observed)

    # Upper tail for non-negative scores, lower tail for negative ones. A score
    # of exactly 0 used to match neither branch and produced no p-value, which
    # shifted every later entry when the results were zipped with metric names.
    if observed >= 0:
        pval = (100 - percentile) / 100
    else:
        pval = percentile / 100
    permutation_pval_dict[col] = pval

# Bonferroni correction across the tested metrics. multicomp returns
# (reject flags, corrected p-values) in input order.
tested_metrics = list(permutation_pval_dict)
multicomp_results = multicomp(list(permutation_pval_dict.values()), method="bonf")

# Zip with the metrics that were actually tested, in the order their p-values
# were passed to multicomp, rather than with X_metrics.
permutation_sig_dict = dict(zip(tested_metrics, multicomp_results[0]))
# Despite the "fdr" in its name, this holds the Bonferroni-corrected p-values.
permutation_pval_fdr_dict = dict(zip(tested_metrics, multicomp_results[1]))
# NOTE(review): this dict is keyed by the median values, not by metric name, and
# nothing in this notebook reads it -- confirm whether the keys were meant to be
# cv_median_dict.keys().
cv_median_height_dict = dict(zip(cv_median_dict.values(),[0.86] * 10))
# y-position (data coordinates) at which each metric's significance marker is drawn.
permutation_pval_height_dict = dict(zip(list(permutation_sig_dict.keys()),[0.1] * len(permutation_sig_dict.keys())))

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style

In [19]:
# (cv_df column, tick label) pairs, listed in left-to-right plotting order.
_metric_labels = (
    ("ct", "CT"),
    ("fd", "FD"),
    ("fc", "Log. FC"),
    ("fdc", "FDC"),
    ("complexity", "CX"),
    ("fa", "FA"),
    ("fat", "$FA_{T}$"),
    ("md", "MD"),
    ("fw", "FW"),
)
X_metrics_plot = [column for column, _ in _metric_labels]
X_metrics_plot_styled = [label for _, label in _metric_labels]

In [None]:
style.use("ggplot")

# Long format for seaborn: one row per (metric, score) pair. Melting the
# selected columns directly avoids depending on the name of the row index.
data = cv_df[X_metrics_plot].melt(var_name="variable", value_name="value")

xlabel = ""
ylabel = "Accuracy"
palette = "Blues_r"
fig, ax = plt.subplots(1, figsize=(10,6))

# Column names are passed as literals instead of through globals named `x` and
# `y`: `y` holds the target labels in the modelling cells and should not be
# rebound to a string here.
plot = sns.boxplot(data=data, x="variable", y="value", order=X_metrics_plot,
                   palette=palette, ax=ax, width=.6, linewidth=1.5, fliersize=0, zorder=1)

ax.set_ylabel(ylabel, size=15)
ax.set_xlabel(xlabel, size=15)

plot.annotate("*: $p$<0.05", (8,0.105), ha="right", va="center", size=12)
ax.set_xticks(np.arange(0, len(X_metrics_plot_styled)))
ax.set_xticklabels(X_metrics_plot_styled)

# Mark significant metrics. Boxes are drawn in X_metrics_plot order, so the
# x-position must come from that list. Enumerating permutation_sig_dict (which
# follows a different metric order) placed the stars above the wrong boxes.
for position, metric in enumerate(X_metrics_plot):
    if not permutation_sig_dict[metric]:
        continue
    plot.annotate("*",
                  (position, permutation_pval_height_dict[metric]),
                  ha='center', va='center',
                  size=15,
                  xytext=(1, 1),
                  textcoords='offset points')

# tight_layout must run on the populated figure; calling plt.tight_layout()
# before plt.subplots() only created an extra empty figure.
fig.tight_layout()
fig.savefig(output_dir/"prediction_results.png", dpi=300, bbox_inches="tight")