In [None]:
from platform import python_version

# Print the version of the interpreter this kernel is running on.
interpreter_version = python_version()
print(interpreter_version)

In [None]:
# load library
import numpy as np
import pandas as pd
# Print the installed library versions: pandas first, then NumPy.
for library in (pd, np):
    print(library.__version__)

In [None]:
# Directory that holds the input CSV. The file name is appended to this string
# by plain concatenation when the data is loaded, so the value must end with a
# path separator.
input_dir = '...'

In [None]:
# Read pheno_met.csv from input_dir into a DataFrame, then preview its first three rows.
pheno_met_path = input_dir + 'pheno_met.csv'
data = pd.read_csv(pheno_met_path)
data.head(3)

In [None]:
# Binary classification target: True where Cortisol is strictly greater than the cut-off.
# NOTE(review): a missing Cortisol value compares as False and would therefore be
# counted in the False group - confirm the column contains no NaNs.
CORTISOL_CUTOFF = 18
data['Min_Cortisol_Group'] = data['Cortisol'] > CORTISOL_CUTOFF

# Class balance of the new target (last expression of the cell, so it is displayed).
data['Min_Cortisol_Group'].value_counts()

In [None]:
# list the input columns in the list input_cols:
# M100022013: tetrahydrocortisol glucuronide; M100022127: tetrahydrocortisone glucuronide (5); M100000963: homocitrulline
# The last entry, 'all', is a sentinel: it requests one model fitted on the three metabolites together.
input_cols = ['M100022013','M100022127','M100000963','all']

# put target name in target variable
target = 'Min_Cortisol_Group'

####run models
#import
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Seeds for the 10 repeated train/test splits.
seeds = [111, 222, 333, 444, 555, 666, 777, 888, 999, 101]

# The SVC grid search is fitted on at most this many leading rows of the training
# split; every other model is fitted on the full training split.
svc_max_train_rows = 2000

# Real metabolite columns, i.e. input_cols without the 'all' sentinel.
all_metabolites = [col for col in input_cols if col != 'all']

# ---- hyper-parameter grids: identical for every seed and metabolite, so built once ----
logistic_grid = [{'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1 , 5, 10, 50, 100]}]

svc_grid = [{
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel' : ['rbf'],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100]
}]

# Shared by the random forest and the gradient boosting searches.
tree_grid = [{
    'max_depth': [3, 4, 5, 6],
    'min_samples_split' : [0.05, 0.1, 0.2],
    'min_samples_leaf' : [0.05, 0.1, 0.2],
    'n_estimators': [10, 20, 50, 100]
}]

# NOTE(review): this is the length of input_cols (4, counting the 'all' sentinel),
# not the number of columns a given model actually sees (1 or 3), so every MLP
# search uses the same layer widths. Left unchanged so results stay the same -
# confirm whether len(input_list) was intended.
n_features = len(input_cols)

mlp_grid = [{
    'hidden_layer_sizes' : [[n_features // 2, n_features // 2],
                            [n_features // 2, n_features // 2, n_features // 2],
                            [n_features, n_features],
                            [n_features, n_features, n_features],
                            [n_features*2, n_features*2],
                            [n_features*2, n_features*2, n_features*2]],
    'alpha' : [0.001, 0.01, 0.1, 1, 10]                                    #regularization terms
}]


def _fit_and_score(estimator, param_grid, selector, fit_data, test_data, target, cv=5):
    """Grid-search one estimator on selected columns and score it on held-out data.

    Parameters
    ----------
    estimator : unfitted scikit-learn classifier to tune.
    param_grid : parameter grid handed to GridSearchCV.
    selector : ColumnTransformer that keeps only the model's input columns.
    fit_data : DataFrame used for the grid search; must contain ``target``.
    test_data : DataFrame used for the final score; must contain ``target``.
    target : name of the label column.
    cv : number of cross-validation folds.

    Returns
    -------
    (cv_auc, test_auc)
        ``cv_auc`` is GridSearchCV's ``best_score_``, i.e. the mean
        cross-validated ROC AUC of the best parameter setting (this is the
        value stored under the ``*_train_auc`` result columns). ``test_auc``
        is the ROC AUC of the refitted best model on ``test_data``.
    """
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring='roc_auc', return_train_score=True)
    pipeline = Pipeline([
        ('processing', selector),
        ('modeling', search)
    ])
    pipeline.fit(fit_data, fit_data[target])
    cv_auc = pipeline['modeling'].best_score_
    # Pipeline.score delegates to GridSearchCV.score, which uses the 'roc_auc' scorer.
    test_auc = pipeline.score(test_data, test_data[target])
    return cv_auc, test_auc


#result list: one row per (seed, metabolite) holding the two AUCs of each model
results = []

#loop 10 runs with set seeds
for seed in seeds:
    # Seed NumPy's global generator. The split and the estimators below are created
    # without random_state, so they all draw from this generator; the order in
    # which the models are fitted is therefore part of what makes a run reproducible.
    np.random.seed(seed)

    #train test split
    train_data, test_data = train_test_split(data, test_size = 0.2)

    #loop through metabolite
    for met in input_cols:
        input_list = [met] if met != 'all' else list(all_metabolites)
        processing_pipeline = ColumnTransformer([("selector", "passthrough", input_list)], remainder="drop")

        # (estimator, grid, rows used for fitting), in the order the result columns expect:
        # logistic, SVM, random forest, gradient boosting, MLP.
        model_runs = [
            (LogisticRegression(max_iter=10000), logistic_grid, train_data),
            (SVC(), svc_grid, train_data[:svc_max_train_rows]),
            (RandomForestClassifier(), tree_grid, train_data),
            (GradientBoostingClassifier(), tree_grid, train_data),
            (MLPClassifier(max_iter=10000), mlp_grid, train_data),
        ]

        row = [seed, met]
        for estimator, grid, fit_data in model_runs:
            row.extend(_fit_and_score(estimator, grid, processing_pipeline,
                                      fit_data, test_data, target))
        results.append(row)

In [None]:
# Column names, in the order the values were appended to each result row:
# two identifier columns followed by a (train, test) AUC pair per model.
# The *_train_auc values are the grid search's best_score_ for that model.
id_columns = ['seed', 'metabolite']
auc_columns = ['logistic_train_auc',
               'logistic_test_auc',
               'svc_train_auc',
               'svc_test_auc',
               'forest_train_auc',
               'forest_test_auc',
               'gbc_train_auc',
               'gbc_test_auc',
               'mlp_train_auc',
               'mlp_test_auc']

results_df = pd.DataFrame(results)
results_df.columns = id_columns + auc_columns
results_df

In [None]:
# Directory the result table is written to. The file name is appended by plain
# string concatenation, so the value must end with a path separator.
output_dir = '...'
output_path = output_dir + 'Prediction_model_result_of_validated_sig_metabolites.csv'

# index=False is the documented way to leave the row index out of the file.
results_df.to_csv(output_path, index=False)

In [None]:
# Completion marker: reaching this cell means every cell above ran without raising.
print('done')