Import biblios

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import RepeatedStratifiedKFold, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report

Load and verify dataset

In [5]:
data=pd.read_excel('/Users/ctc/Desktop/ccl/paper/Pasta4.xlsx')
data.shape

(45, 12)

In [6]:
data['groups'].unique()

array(['PIB+', 'PIB-', 'control'], dtype=object)

data description and export

In [7]:
table = pd.DataFrame(data.groupby('groups').describe())
table.to_excel('table.xlsx')
print(table)

        l_hipp_meanbold                                                       \
                  count      mean        std    min      25%     50%     75%   
groups                                                                         
PIB+               17.0  7.962471  13.137149 -24.35  2.58500  6.3290  15.970   
PIB-               14.0  7.043714   9.763331 -14.03  0.93165  9.1330  10.885   
control            14.0  8.339780  10.182454 -11.30  3.03800  9.1085  14.045   

               r_hipp_meanbold            ... l_hipp_meanbold_NvsL         \
           max           count      mean  ...                  75%    max   
groups                                    ...                               
PIB+     34.02            17.0  7.571412  ...             12.45000  24.79   
PIB-     22.98            14.0  6.138643  ...              8.82425  21.58   
control  29.70            14.0  4.328343  ...             14.82750  23.79   

        r_hipp_meanbold_NvsL                            

full model variables set and print

In [8]:
X = data.iloc[:,1:9]
y = data['groups']
print(data.iloc[0,1:9])
X.shape

l_hipp_meanbold    -3.987
r_hipp_meanbold    -1.732
l_hipp_volume      4149.3
r_hipp_volume      4285.0
TSD                     7
THD                     3
logic_mem_II           24
RAVLTII                13
Name: 0, dtype: object


(45, 8)

init modeling

In [14]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=10)

nesting a pipeline for uniformization and analysis

In [15]:
pipeline = Pipeline([
    ('normalizer', StandardScaler()), #Step1 - normalize data, warm_start=True))
    ('clf', LogisticRegressionCV(cv=cv, max_iter=10000, random_state=10)) #step2 - classifier
])
pipeline.steps

[('normalizer', StandardScaler()),
 ('clf',
  LogisticRegressionCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=10),
                       max_iter=10000, random_state=10))]

init model

In [16]:
model = pipeline.fit(X, y)
y_pred = model.predict(X)
print(classification_report(y, y_pred))
print(pipeline.named_steps['clf'].coef_)
np.exp(pipeline.named_steps['clf'].coef_)

              precision    recall  f1-score   support

        PIB+       0.71      0.88      0.79        17
        PIB-       0.78      0.50      0.61        14
     control       0.80      0.86      0.83        14

    accuracy                           0.76        45
   macro avg       0.76      0.75      0.74        45
weighted avg       0.76      0.76      0.75        45

[[-0.02550747  0.04008336 -0.19145622 -0.0902727  -0.19101756 -0.15014764
  -0.18358699 -0.0793919 ]
 [-0.01740301  0.00495634  0.16061191 -0.00379267 -0.05362268 -0.0836022
   0.09970925 -0.14826845]
 [ 0.04291048 -0.0450397   0.03084431  0.09406538  0.24464024  0.23374983
   0.08387774  0.22766035]]


array([[0.9748151 , 1.04089754, 0.82575578, 0.91368199, 0.82611808,
        0.86058091, 0.83227947, 0.92367786],
       [0.98274755, 1.00496864, 1.17422918, 0.99621451, 0.94778966,
        0.91979708, 1.10484963, 0.86219963],
       [1.04384445, 0.95595953, 1.03132492, 1.09863157, 1.27716176,
        1.26332841, 1.08749593, 1.25565877]])



---



hyperparameters tunning model with set separation and gridsearch cross-validation 

In [None]:
# define search space
from scipy.stats import loguniform
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline([
    ('normalizer', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', max_iter=10000, random_state=10, warm_start=True))])
space = dict()
space ['multi_class'] = ['multinomial', 'ovr']
space['solver'] = ['liblinear','lbfgs', 'newton-cg', 'sag', 'saga']
space['penalty'] = [ 'none', 'l1', 'l2']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
cv = RepeatedStratifiedKFold(random_state=1)
search = GridSearchCV(pipeline.named_steps['clf'], space, cv=cv, n_jobs=-1)
result = search.fit(X, y)

In [None]:
y_pred = search.predict(X)
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.6415961199294533
Best Hyperparameters: {'C': 0.01, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'newton-cg'}


In [None]:
print(classification_report(y, y_pred))
metrics = pd.DataFrame()
metrics.to_csv('metrics.csv')
print(metrics)

              precision    recall  f1-score   support

        PIB+       0.75      0.88      0.81        17
        PIB-       0.75      0.64      0.69        14
     control       0.77      0.71      0.74        14

    accuracy                           0.76        45
   macro avg       0.76      0.75      0.75        45
weighted avg       0.76      0.76      0.75        45

Empty DataFrame
Columns: []
Index: []


obtaining and exporting coeficients

In [None]:
coefs = pd.DataFrame(result.best_estimator_.coef_)
coefs.to_excel('coefsccl.xlsx')
print(coefs)


          0         1         2  ...         5         6         7
0 -0.040260  0.026463 -0.004082  ... -0.044157 -0.113139 -0.015201
1  0.006441  0.026547  0.006134  ... -0.018244  0.088427 -0.106843
2  0.040796 -0.066044 -0.002174  ...  0.058361  0.020619  0.109684

[3 rows x 8 columns]


obtaining and exporting odds-ratios

In [None]:
odds = np.exp(pd.DataFrame(result.best_estimator_.coef_))
odds.to_excel('oddsccl.xlsx')
print(odds)

          0         1         2  ...         5         6         7
0  0.960539  1.026816  0.995926  ...  0.956803  0.893027  0.984914
1  1.006462  1.026902  1.006153  ...  0.981922  1.092455  0.898666
2  1.041639  0.936089  0.997828  ...  1.060097  1.020833  1.115925

[3 rows x 8 columns]


 finding p-value for the model




In [None]:
permutation_test_score(model, X, y)

(0.6222222222222221,
 array([0.33333333, 0.28888889, 0.42222222, 0.4       , 0.4       ,
        0.33333333, 0.35555556, 0.26666667, 0.33333333, 0.35555556,
        0.44444444, 0.33333333, 0.4       , 0.46666667, 0.31111111,
        0.33333333, 0.31111111, 0.28888889, 0.44444444, 0.37777778,
        0.44444444, 0.37777778, 0.37777778, 0.33333333, 0.31111111,
        0.42222222, 0.35555556, 0.35555556, 0.35555556, 0.4       ,
        0.4       , 0.37777778, 0.4       , 0.35555556, 0.37777778,
        0.28888889, 0.35555556, 0.4       , 0.44444444, 0.4       ,
        0.35555556, 0.33333333, 0.44444444, 0.46666667, 0.42222222,
        0.4       , 0.33333333, 0.35555556, 0.28888889, 0.31111111,
        0.33333333, 0.35555556, 0.26666667, 0.33333333, 0.44444444,
        0.33333333, 0.37777778, 0.33333333, 0.4       , 0.28888889,
        0.44444444, 0.33333333, 0.31111111, 0.48888889, 0.28888889,
        0.4       , 0.35555556, 0.28888889, 0.31111111, 0.28888889,
        0.26666667, 0.35555