In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#use seaborn's 'poster' context and override the base font, axes title and axes label sizes
sns.set_context('poster', rc={'font.size':35,
                              'axes.titlesize':50,
                              'axes.labelsize':35})

#machine learning
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC 
#import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score, classification_report

#raise the pandas display limits so wide/long frames are shown without truncation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#NOTE(review): this ignores every warning category for the rest of the session,
#which also hides pandas/sklearn warnings raised by the cells below
import warnings; warnings.simplefilter('ignore')
#print small numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

### potential questions

- can Meso scale predict TBI vs DC? 

- does Meso at visit 1 predict outcome x at visit 2 (psych and imaging outcomes) -- raw visit 2 value or visit 2 vs visit 1 difference?

- network analysis with PET data?


- role for ApoE type
- role for prazosin
- role for drinking type? MIP1a/b - VP connection?


- plasma: bFGF, VEGFA, MDC, IL1a, IL7
- CSF: CRP, MCP-1, IL1a, IFNy

In [None]:
#location of the source csv; set the PESKIND_TBI_DATA_PATH environment variable to run the
#notebook on another machine without editing it (falls back to the original local path)
data_path = os.environ.get(
    'PESKIND_TBI_DATA_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/first_visits_short.csv',
)

In [None]:
#read in csv containing data from all surveys and all visitseqs
#(the first csv column becomes the row index; read_csv already returns a DataFrame)
data = pd.read_csv(data_path, index_col=0)

print('Original data shape:\n', data.shape, '\n')

#create class variable: 0 where Group is 'C', 1 for every other value
data['class'] = (data['Group'] != 'C').astype(int)

#info() prints its own summary and returns None, so it is not wrapped in print()
data.info()
data.tail(1)

In [None]:
#participants determined to be outliers for auditc and QBlstExp (outlier = >3 SD from mean)
outlier_ids = ['C010', 'T080']

#keep every row whose TBIID is not one of the flagged participants
data = data[~data['TBIID'].isin(outlier_ids)]

### Can MESO predict C or T?

In [None]:
#restrict the data to rows from the first visit sequence
first_visit_mask = data['VisitSeq'] == 1
data_v1 = data[first_visit_mask]

In [None]:
#explore meso data differences between groups

#plasma analyte columns plotted below
meso_columns = ['Plasma1_bFGF',
       'Plasma1_CRP', 'Plasma1_Eotaxin', 'Plasma1_Eotaxin3',
       'Plasma1_Flt1', 'Plasma1_ICAM1', 'Plasma1_IFNγ', 'Plasma1_IL10',
       'Plasma1_IL12_IL23p40', 'Plasma1_IL12p70', 'Plasma1_IL15',
       'Plasma1_IL16', 'Plasma1_IL17A', 'Plasma1_IL1α', 'Plasma1_IL6',
       'Plasma1_IL7', 'Plasma1_IL8', 'Plasma1_IP10', 'Plasma1_MCP1',
       'Plasma1_MCP4', 'Plasma1_MDC', 'Plasma1_MIP1α', 'Plasma1_MIP1β',
       'Plasma1_PlGF', 'Plasma1_SAA', 'Plasma1_TARC', 'Plasma1_Tie2',
       'Plasma1_TNFα', 'Plasma1_TNFβ', 'Plasma1_VCAM1', 'Plasma1_VEGF',
       'Plasma1_VEGFC', 'Plasma1_VEGFD']

#one bar plot per analyte, split by Group, using first-visit rows only
for param in meso_columns:
    print(param)

    try:
        g = sns.catplot(x='Group', y=param, kind='bar', data=data_v1, ci=68, height=5, aspect=4)
        plt.show()

        print('\n')

    except Exception as err:
        #best effort: keep plotting the remaining analytes, but say which one failed and why
        #(the previous bare except hid every error and also swallowed KeyboardInterrupt)
        print(f'Could not plot {param}: {type(err).__name__}: {err}\n')

In [None]:
#class label plus every analyte in meso_columns; the list is reused rather than
#retyped so the plotted and the correlated columns cannot drift apart
data_v1_meso = data_v1[['class'] + meso_columns]

#correlation of each column with the class label, sorted ascending
data_v1_meso.corr()['class'].sort_values()

In [None]:
#prepare features and remove na's
#dropna returns a new frame here instead of modifying a selection of data_v1_meso in place
#(the in-place form raises pandas' SettingWithCopyWarning and is not safe to re-run)
data_v1_meso_features = data_v1_meso[['Plasma1_VEGF', 'Plasma1_bFGF', 'Plasma1_MDC', 'Plasma1_IL7', 'Plasma1_IL1α', 'class']].dropna(axis=0)
data_v1_meso_features.info()


In [None]:
#prepare train and test data
MESO_features = ['Plasma1_VEGF', 'Plasma1_bFGF', 'Plasma1_MDC', 'Plasma1_IL7', 'Plasma1_IL1α']

#70/30 split with a fixed seed, stratified on the class label
train, test = train_test_split(
    data_v1_meso_features,
    test_size = .3,
    random_state=1,
    stratify = data_v1_meso_features['class'],
)

#feature matrices
X_train_full, X_test_full = train[MESO_features], test[MESO_features]

#class labels
Y_train_class, Y_test_class = train['class'], test['class']

In [None]:
#scale data algo
scaler = StandardScaler()

#k fold algo
strat_k_fold = StratifiedKFold(n_splits=10)

#classifier algos
dm_cv = DummyClassifier(strategy='stratified', random_state=39)
lr_cv = LogisticRegression(random_state=39, class_weight='balanced')
rf_cv = RandomForestClassifier(random_state=39, class_weight='balanced')
#random_state seeds the data shuffling SVC performs for its probability estimates
#(probability=True); without it predict_proba results differ from run to run
svm_cv = SVC(kernel='linear', probability=True, class_weight='balanced', random_state=39)
knn_cv = KNeighborsClassifier()
#gb_cv = GradientBoostingClassifier(random_state=39)
ab_cv = AdaBoostClassifier(random_state=39)

#dic with classifier and feature importance attribute name
#(the second tuple entry is the fi_name understood by feature_importance)
models_dic = {'dm_cv': (dm_cv, 'none'), 
              'lr_cv': (lr_cv, 'coef'), 
              'rf_cv': (rf_cv, 'feature_importance'), 
              'svm_cv':(svm_cv, 'coef'), 
              'knn_cv': (knn_cv, 'none'),  
              'ab_cv': (ab_cv, 'feature_importance')}

#'gb_cv': (gb_cv, 'feature_importance'),

In [None]:
def feature_importance(X, y, model_instance, feature_names, fi_name):
    """Fit the model on (X, y) and map each feature name to its importance value.

    X, y          -- features and classes the model is fitted on
    model_instance -- estimator exposing fit(); it is fitted in place
    feature_names -- column names for the features in X, in column order
    fi_name       -- which attribute holds the importances:
                     'coef'               -> first row of model_instance.coef_
                     'feature_importance' -> model_instance.feature_importances_
                     'none'               -> zeros (model exposes no importances)

    Returns a dict of feature name -> value; an unrecognised fi_name gives an empty dict.
    """
    #the model is always fitted, whichever attribute is read afterwards
    model_instance.fit(X, y)

    if fi_name == 'coef':
        weights = model_instance.coef_[0]
    elif fi_name == 'feature_importance':
        weights = model_instance.feature_importances_
    elif fi_name == 'none':
        weights = np.zeros(len(feature_names))
    else:
        return {}

    return dict(zip(feature_names, weights))

In [None]:
def classification_pipeline(X, y, cv_instance, model_instance, feature_names, fi_name):
    """Cross-validate one classifier, print its metrics and return them with feature importances.

    Scaling is done inside every cross-validation fold: the module-level ``scaler`` and
    the classifier are chained into one pipeline, so the scaler is fitted on the
    training part of a fold only. Fitting the scaler on all of X before splitting
    lets the held-out rows influence the scaling and biases the scores.

    Parameters
    ----------
    X : features, one row per sample
    y : class labels for the rows of X (the metrics below use their binary defaults)
    cv_instance : splitter exposing split(X, y), e.g. a StratifiedKFold
    model_instance : classifier supporting predict and predict_proba
    feature_names : names of the columns of X, in column order
    fi_name : 'coef', 'feature_importance' or 'none'; passed on to feature_importance

    Returns
    -------
    dict with the keys 'y_pred', 'y_pred_prob', 'conf_mat', 'ROC_AUC', 'recall',
    'precision', 'accuracy' and 'F1', plus one entry per feature name.
    """
    #local import: the notebook's import cell does not bring in sklearn.pipeline
    from sklearn.pipeline import make_pipeline

    #scaler + classifier as one estimator; cross_val_predict clones and refits it per fold
    cv_model = make_pipeline(scaler, model_instance)

    #generate cross-val sets once so both prediction runs use the same folds
    cv = list(cv_instance.split(X, y))

    #predict class and predict probability 
    y_pred = cross_val_predict(cv_model, X, y, cv=cv, method='predict')
    y_pred_prob = cross_val_predict(cv_model, X, y, cv=cv, method='predict_proba')

    #generate confusion matrix
    conf_mat = confusion_matrix(y, y_pred)
    print('Confusion matrix:', conf_mat)

    #generate ROC_AUC from the probability of the second class (column 1)
    ROC_AUC = metrics.roc_auc_score(y, y_pred_prob[:,1])
    print("ROC_AUC: ", ROC_AUC)

    # generate additional metrics
    recall = metrics.recall_score(y,y_pred)
    precision = metrics.precision_score(y,y_pred)
    accuracy = metrics.accuracy_score(y,y_pred)
    F1 = metrics.f1_score(y,y_pred)
    print("Sensitivity/Recall (TPR): ",recall)
    print("Precision (PPV): ", precision)
    print("Accuracy: ", accuracy)
    print("F1:", F1)

    #determine feature importance from a single fit on all of X, scaled as a whole
    #(no held-out data is scored here, so scaling everything at once is fine)
    data_scaled = scaler.fit_transform(X)
    feature_dic = feature_importance(data_scaled, y, model_instance, feature_names, fi_name)

    #create dic
    data_dic = {
        'y_pred': y_pred,
        'y_pred_prob': y_pred_prob,
        'conf_mat': conf_mat,
        'ROC_AUC': ROC_AUC,
        'recall': recall,
        'precision': precision,
        'accuracy': accuracy,
        'F1': F1,
    }

    #feature entries are merged last, so a feature named like a metric key would win
    data_dic = {**data_dic, **feature_dic}

    return data_dic

In [None]:
feature_set = 'MESO_features'
feature_names = MESO_features

#results keyed by '<model name>_<feature set>'
data_full_features = {}

#run every classifier in models_dic through the cross-validated pipeline on the training split
for name, model in models_dic.items():
    estimator, fi_name = model
    result_key = name + '_' + feature_set

    print(f'{name} model with {feature_set} features:')
    data_full_features[result_key] = classification_pipeline(
        X_train_full, Y_train_class, strat_k_fold, estimator, feature_names, fi_name)
    print('\n')

In [None]:
#put dics in pandas df: one row per '<model>_<feature set>' key, one column per result entry
final_dic = dict(data_full_features)
data_pandas = pd.DataFrame.from_dict(final_dic, orient='index')

#show the first rows after sorting by F1, highest first
data_pandas.sort_values(by='F1', ascending=False).head()

### Does MESO at visit 1 predict the recovery (or non-recovery) trajectory at visit 2?

In [None]:
#use only TBI participants (Group == 'T')
is_tbi = data['Group'] == 'T'
data_TBI = data[is_tbi]
print(data_TBI.shape)

#only interested in first and second visit
visits = [1,2]
in_first_two_visits = data_TBI['VisitSeq'].isin(visits)
data_TBI_v12_all = data_TBI[in_first_two_visits]
print(data_TBI_v12_all.shape)

#get TBIIDs of participants with a second visit and use to filter
second_visit_rows = data_TBI_v12_all[data_TBI_v12_all['VisitSeq'] == 2]
TBIID_v2 = second_visit_rows['TBIID'].unique()
print(len(TBIID_v2))

#make new df with only TBI participants who returned for visit seq 2
returned_for_v2 = data_TBI_v12_all['TBIID'].isin(TBIID_v2)
data_TBI_v12_only = data_TBI_v12_all[returned_for_v2]
print(data_TBI_v12_only.shape)
print(data_TBI_v12_only['VisitSeq'].value_counts())

data_TBI_v12_only.head()

In [None]:
#make new df with param score diff from v1 to v2 - need to drop na's and find TBIIDs in common

#identifier columns plus the parameters whose visit-to-visit change is computed
keep_columns = ['TBIID', 'VisitSeq',
                'BMI', 'BPSYS', 'BPDIAS', 'HRATE',
                'PSQItot', 'PCLTot', 'PHQTot',
                'NSITot', 'TBITot', 'auditc']
data_TBI_v12_only = data_TBI_v12_only[keep_columns]

print(data_TBI_v12_only.shape)
print(data_TBI_v12_only['VisitSeq'].value_counts())

#rows of each visit that have no missing value in any kept column
is_visit_1 = data_TBI_v12_only['VisitSeq'] == 1
data_TBI_v1 = data_TBI_v12_only[is_visit_1].dropna(axis=0)
print(data_TBI_v1.shape)

is_visit_2 = data_TBI_v12_only['VisitSeq'] == 2
data_TBI_v2 = data_TBI_v12_only[is_visit_2].dropna(axis=0)
print(data_TBI_v2.shape)

#match TBIIDs with no nas: participants with a complete row at both visits
TBIID_keep = list(set(data_TBI_v2['TBIID']).intersection(set(data_TBI_v1['TBIID'])))

data_TBI_v1 = (data_TBI_v1[data_TBI_v1['TBIID'].isin(TBIID_keep)]
               .drop(columns='VisitSeq')
               .set_index('TBIID'))
print(data_TBI_v1.shape)

data_TBI_v2 = (data_TBI_v2[data_TBI_v2['TBIID'].isin(TBIID_keep)]
               .drop(columns='VisitSeq')
               .set_index('TBIID'))
print(data_TBI_v2.shape)

#visit 2 minus visit 1, aligned on the TBIID index
data_TBI_diff = data_TBI_v2.sub(data_TBI_v1)
data_TBI_diff.head()

In [None]:
#add MESO data: first-visit analyte values for the participants kept in TBIID_keep
meso_v1_columns = ['TBIID',
                   'Plasma1_bFGF',
                   'Plasma1_VEGF',
                   'Plasma1_MDC',
                   'Plasma1_IL1α']
kept_first_visit = (data_TBI['TBIID'].isin(TBIID_keep)) & (data_TBI['VisitSeq'] == 1)
data_TBI_MESO = data_TBI.loc[kept_first_visit, meso_v1_columns]
print(data_TBI_MESO.shape)

#combine dfs: left-join the analytes onto the visit differences by TBIID
data_TBI_diff_MESO = data_TBI_diff.merge(data_TBI_MESO, how='left', on=['TBIID'], sort=False)
print(data_TBI_diff_MESO.shape)

#drop rows (participants) that have any missing value
data_TBI_diff_MESO = data_TBI_diff_MESO.dropna(axis=0)
print(data_TBI_diff_MESO.shape)
data_TBI_diff_MESO.head()

In [None]:
#explore corr with MESO analytes of interest: keep only the analyte columns of the correlation matrix
meso_of_interest = ['Plasma1_bFGF', 'Plasma1_VEGF', 'Plasma1_MDC','Plasma1_IL1α']
corr_matrix = data_TBI_diff_MESO.corr()
corr_matrix[meso_of_interest]

In [None]:
#scatter plot with fitted regression line: first-visit Plasma1_IL1α (x) against the
#visit 2 minus visit 1 change in NSITot (y)
sns.regplot(x='Plasma1_IL1α', y='NSITot', data=data_TBI_diff_MESO)