In [8]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [9]:
# Importing adnimerge data
adnimerge = pd.read_csv('SourceData/adnimerge_dataR.csv', low_memory=False)

## Filtering the data using 2 conditions.
# {a} Baseline protein expression data
adnimerge = adnimerge[adnimerge['VISCODE'].str.contains('bl', case=False, na=False)]
print(adnimerge.shape)

# {b} Only control and AD data, removing other conditions
adnimerge = adnimerge[adnimerge['DX.bl'].str.contains('CN|AD|MCI', case=False, na=False)]
print(adnimerge.shape)

adnimerge

(2432, 115)
(2069, 115)


Unnamed: 0,RID,COLPROT,ORIGPROT,PTID,SITE,VISCODE,EXAMDATE,DX.bl,AGE,PTGENDER,...,TAU.bl,PTAU.bl,FDG.bl,PIB.bl,AV45.bl,FBB.bl,Years.bl,Month.bl,Month,M
0,2.0,ADNI1,ADNI1,011_S_0002,11.0,bl,9/8/05,CN,74.3,Male,...,,,1.336154,,,,0.0,0.0,0.0,0.0
16,3.0,ADNI1,ADNI1,011_S_0003,11.0,bl,9/12/05,AD,81.3,Male,...,239.7,22.83,1.108605,,,,0.0,0.0,0.0,0.0
21,4.0,ADNI1,ADNI1,022_S_0004,22.0,bl,11/8/05,LMCI,67.5,Male,...,153.1,13.29,,,,,0.0,0.0,0.0,0.0
27,5.0,ADNI1,ADNI1,011_S_0005,11.0,bl,9/7/05,CN,73.7,Male,...,337,33.43,1.259559,,,,0.0,0.0,0.0,0.0
34,6.0,ADNI1,ADNI1,100_S_0006,100.0,bl,11/29/05,LMCI,80.4,Female,...,,,,,,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16448,7105.0,ADNI3,ADNI3,035_S_7105,35.0,bl,10/20/22,LMCI,77.5,Male,...,,,,,,,0.0,0.0,0.0,0.0
16451,7117.0,ADNI3,ADNI3,082_S_7117,82.0,bl,1/12/23,EMCI,68.9,Female,...,,,,,,,0.0,0.0,0.0,0.0
16452,7121.0,ADNI3,ADNI3,035_S_7121,35.0,bl,3/29/23,LMCI,62.1,Male,...,,,,,,,0.0,0.0,0.0,0.0
16453,7122.0,ADNI3,ADNI3,082_S_7122,82.0,bl,4/10/23,CN,67.3,Female,...,,,,,,,0.0,0.0,0.0,0.0


In [10]:
# Import and preprocess analyte expression data
ana_exp = pd.read_csv('SourceData/ADNI_Analyte_expression_data.csv', low_memory=False)

# Only considering baseline data
ana_exp = ana_exp[ana_exp['VISCODE2'].str.contains('bl', na=False)]

# Merge expression data with baseline ADNI data
ana_exp = pd.merge(ana_exp,adnimerge[['RID','DX.bl','AGE','PTGENDER','PTEDUCAT','APOE4']])

ana_exp

Unnamed: 0,RID,EXAMDATE,GUSPECID,Somalogic_Barcode_A,VISCODE2,ExtIdentifier,PlateId,X10000.28,X10001.7,X10003.15,...,X9993.11,X9994.217,X9995.6,X9997.12,X9999.1,DX.bl,AGE,PTGENDER,PTEDUCAT,APOE4
0,4521,3/2/2012,GA808GNH-05,383704459,bl,EXID40000005903598,P0029625,155.6,215.5,155.1,...,167.5,322.2,565.3,343.8,198.7,LMCI,70.0,Male,18.0,1.0
1,4488,2/23/2012,EA808GPZ-05,383704460,bl,EXID40000005905024,P0029626,147.8,171.7,150.1,...,185.1,330.9,484.5,243.6,197.7,CN,72.6,Male,18.0,0.0
2,4516,2/28/2012,AA808GRC-05,383704461,bl,EXID40000005944506,P0029652,,181.6,139.8,...,216.0,328.7,509.9,241.0,208.0,CN,71.4,Male,20.0,0.0
3,4530,2/22/2012,FA808GSR-05,383704462,bl,EXID40000005906429,P0029631,159.1,174.0,159.7,...,184.8,347.7,560.3,402.7,161.7,EMCI,75.9,Male,18.0,1.0
4,4462,2/28/2012,JA808GV2-04,383704463,bl,EXID40000005905670,P0029632,143.6,179.1,150.7,...,184.0,340.4,494.4,266.1,198.5,LMCI,71.3,Male,18.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,886,10/6/2006,FA801RWB-10,383730654,bl,EXID40000005910413,P0029648,146.5,180.1,151.8,...,197.2,331.4,432.2,195.7,161.5,CN,71.3,Female,15.0,0.0
704,336,6/1/2006,GA801F0Q-11,383730655,bl,EXID40000005905548,P0029633,148.2,189.4,149.7,...,183.2,315.9,426.9,222.3,148.7,LMCI,76.1,Male,17.0,1.0
705,588,12/19/2006,FA801STM-14,383730656,bl,EXID40000005941074,P0029660,160.0,184.6,155.2,...,182.1,337.0,473.1,233.9,160.2,LMCI,64.6,Male,14.0,2.0
706,973,12/11/2006,JA8019L2-11,383730658,bl,EXID40000005943506,P0029659,142.5,166.9,143.1,...,194.1,338.9,516.2,369.4,189.3,LMCI,76.8,Male,20.0,0.0


In [11]:
# Check the unique values in diagnosis column
set(list(ana_exp['DX.bl']))

# Replace EMCI and LMCI with MCI
ana_exp['DX.bl'].replace({'EMCI': 'MCI','LMCI': 'MCI'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ana_exp['DX.bl'].replace({'EMCI': 'MCI','LMCI': 'MCI'}, inplace=True)


In [12]:
# Remove the proteins with more than 10% missing expression values
ana_exp = ana_exp.dropna(thresh=len(ana_exp) * 0.9, axis=1)
ana_exp

Unnamed: 0,RID,EXAMDATE,GUSPECID,Somalogic_Barcode_A,VISCODE2,ExtIdentifier,PlateId,X10000.28,X10001.7,X10003.15,...,X9993.11,X9994.217,X9995.6,X9997.12,X9999.1,DX.bl,AGE,PTGENDER,PTEDUCAT,APOE4
0,4521,3/2/2012,GA808GNH-05,383704459,bl,EXID40000005903598,P0029625,155.6,215.5,155.1,...,167.5,322.2,565.3,343.8,198.7,MCI,70.0,Male,18.0,1.0
1,4488,2/23/2012,EA808GPZ-05,383704460,bl,EXID40000005905024,P0029626,147.8,171.7,150.1,...,185.1,330.9,484.5,243.6,197.7,CN,72.6,Male,18.0,0.0
2,4516,2/28/2012,AA808GRC-05,383704461,bl,EXID40000005944506,P0029652,,181.6,139.8,...,216.0,328.7,509.9,241.0,208.0,CN,71.4,Male,20.0,0.0
3,4530,2/22/2012,FA808GSR-05,383704462,bl,EXID40000005906429,P0029631,159.1,174.0,159.7,...,184.8,347.7,560.3,402.7,161.7,MCI,75.9,Male,18.0,1.0
4,4462,2/28/2012,JA808GV2-04,383704463,bl,EXID40000005905670,P0029632,143.6,179.1,150.7,...,184.0,340.4,494.4,266.1,198.5,MCI,71.3,Male,18.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,886,10/6/2006,FA801RWB-10,383730654,bl,EXID40000005910413,P0029648,146.5,180.1,151.8,...,197.2,331.4,432.2,195.7,161.5,CN,71.3,Female,15.0,0.0
704,336,6/1/2006,GA801F0Q-11,383730655,bl,EXID40000005905548,P0029633,148.2,189.4,149.7,...,183.2,315.9,426.9,222.3,148.7,MCI,76.1,Male,17.0,1.0
705,588,12/19/2006,FA801STM-14,383730656,bl,EXID40000005941074,P0029660,160.0,184.6,155.2,...,182.1,337.0,473.1,233.9,160.2,MCI,64.6,Male,14.0,2.0
706,973,12/11/2006,JA8019L2-11,383730658,bl,EXID40000005943506,P0029659,142.5,166.9,143.1,...,194.1,338.9,516.2,369.4,189.3,MCI,76.8,Male,20.0,0.0


In [13]:
## Transforming, normalizing and standardizing the data
protein_cols = [col for col in ana_exp.columns if col.startswith('X')]

# Apply natural log transformation first
ana_exp[protein_cols] = ana_exp[protein_cols].apply(np.log)

# Z-transform after log transformation
ana_exp[protein_cols] = ana_exp[protein_cols].apply(lambda x: (x - x.mean()) / x.std())

# Drop non-essential columns
exp_data_final = ana_exp.drop(['EXAMDATE', 'GUSPECID', 'Somalogic_Barcode_A',
                                'VISCODE2', 'ExtIdentifier', 'PlateId'], axis=1)
exp_data_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ana_exp[protein_cols] = ana_exp[protein_cols].apply(np.log)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ana_exp[protein_cols] = ana_exp[protein_cols].apply(lambda x: (x - x.mean()) / x.std())


Unnamed: 0,RID,X10000.28,X10001.7,X10003.15,X10006.25,X10008.43,X10010.10,X10011.65,X10012.5,X10014.31,...,X9993.11,X9994.217,X9995.6,X9997.12,X9999.1,DX.bl,AGE,PTGENDER,PTEDUCAT,APOE4
0,4521,1.310421,1.564750,-0.029270,0.338239,0.172844,-0.062360,0.199556,-0.595943,-1.014925,...,-1.244610,-1.221278,1.194591,1.431490,0.633302,MCI,70.0,Male,18.0,1.0
1,4488,-0.053928,-0.733068,-0.759819,1.549274,0.377533,1.200031,0.822422,-0.083854,0.388234,...,0.087784,-0.332293,-0.217651,-0.599938,0.599571,CN,72.6,Male,18.0,0.0
2,4516,,-0.166151,-2.344702,0.689727,-2.226109,1.157316,-0.185695,1.190070,-2.001990,...,2.146556,-0.554866,0.250204,-0.663208,0.939106,CN,71.4,Male,20.0,0.0
3,4530,1.900540,-0.598498,0.622327,-0.639343,1.158421,-0.577022,-0.071078,-0.414963,-1.403547,...,0.066153,1.320099,1.113245,2.363864,-0.744252,MCI,75.9,Male,18.0,1.0
4,4462,-0.818714,-0.306341,-0.670879,1.981747,-0.977259,-2.196091,0.060342,-0.721918,-0.857816,...,0.008298,0.612127,-0.032445,-0.079042,0.626569,MCI,71.3,Male,18.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,886,-0.288300,-0.250031,-0.508737,0.207056,1.307359,-1.145944,0.311941,1.953338,0.716846,...,0.932221,-0.281914,-1.263553,-1.890876,-0.752526,CN,71.3,Female,15.0,0.0
704,336,0.017772,0.259152,-0.819311,1.267136,-0.318669,0.202924,1.499183,,1.190650,...,-0.049809,-1.880140,-1.376528,-1.139437,-1.304570,MCI,76.1,Male,17.0,1.0
705,588,2.050187,-0.000450,-0.014900,0.689727,-1.110639,-0.017994,0.199556,0.900113,0.098166,...,-0.130122,0.277187,-0.435666,-0.839523,-0.806559,MCI,64.6,Male,14.0,2.0
706,973,-1.022713,-1.019814,-1.824555,0.708044,0.555556,1.959357,0.457937,1.346986,-0.222290,...,0.720920,0.464774,0.362639,1.854953,0.309305,MCI,76.8,Male,20.0,0.0


In [14]:
# Replacing the remaining NaN expression values with the average of the column
exp_data_final = exp_data_final.fillna(exp_data_final.drop(['RID','DX.bl','AGE','PTGENDER','PTEDUCAT','APOE4'], axis=1).mean())

exp_data_final

Unnamed: 0,RID,X10000.28,X10001.7,X10003.15,X10006.25,X10008.43,X10010.10,X10011.65,X10012.5,X10014.31,...,X9993.11,X9994.217,X9995.6,X9997.12,X9999.1,DX.bl,AGE,PTGENDER,PTEDUCAT,APOE4
0,4521,1.310421e+00,1.564750,-2.927004e-02,0.338239,0.172844,-0.062360,0.199556,-5.959430e-01,-1.014925,...,-1.244610,-1.221278,1.194591,1.431490,0.633302,MCI,70.0,Male,18.0,1.0
1,4488,-5.392761e-02,-0.733068,-7.598195e-01,1.549274,0.377533,1.200031,0.822422,-8.385415e-02,0.388234,...,0.087784,-0.332293,-0.217651,-0.599938,0.599571,CN,72.6,Male,18.0,0.0
2,4516,-5.883357e-15,-0.166151,-2.344702e+00,0.689727,-2.226109,1.157316,-0.185695,1.190070e+00,-2.001990,...,2.146556,-0.554866,0.250204,-0.663208,0.939106,CN,71.4,Male,20.0,0.0
3,4530,1.900540e+00,-0.598498,6.223273e-01,-0.639343,1.158421,-0.577022,-0.071078,-4.149629e-01,-1.403547,...,0.066153,1.320099,1.113245,2.363864,-0.744252,MCI,75.9,Male,18.0,1.0
4,4462,-8.187144e-01,-0.306341,-6.708790e-01,1.981747,-0.977259,-2.196091,0.060342,-7.219177e-01,-0.857816,...,0.008298,0.612127,-0.032445,-0.079042,0.626569,MCI,71.3,Male,18.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,886,-2.883000e-01,-0.250031,-5.087374e-01,0.207056,1.307359,-1.145944,0.311941,1.953338e+00,0.716846,...,0.932221,-0.281914,-1.263553,-1.890876,-0.752526,CN,71.3,Female,15.0,0.0
704,336,1.777239e-02,0.259152,-8.193108e-01,1.267136,-0.318669,0.202924,1.499183,1.859116e-14,1.190650,...,-0.049809,-1.880140,-1.376528,-1.139437,-1.304570,MCI,76.1,Male,17.0,1.0
705,588,2.050187e+00,-0.000450,-1.490048e-02,0.689727,-1.110639,-0.017994,0.199556,9.001130e-01,0.098166,...,-0.130122,0.277187,-0.435666,-0.839523,-0.806559,MCI,64.6,Male,14.0,2.0
706,973,-1.022713e+00,-1.019814,-1.824555e+00,0.708044,0.555556,1.959357,0.457937,1.346986e+00,-0.222290,...,0.720920,0.464774,0.362639,1.854953,0.309305,MCI,76.8,Male,20.0,0.0


In [15]:
exp_data_final.to_csv('SourceData/prot_exp_data.csv',index=False)

## Lasso Regression

In [16]:
exp_data_final[['AGE','PTGENDER','PTEDUCAT','APOE4','DX.bl']]
exp_data = exp_data_final[exp_data_final['DX.bl'].str.contains('CN|AD', case=False, na=False)]
exp_data

Unnamed: 0,RID,X10000.28,X10001.7,X10003.15,X10006.25,X10008.43,X10010.10,X10011.65,X10012.5,X10014.31,...,X9993.11,X9994.217,X9995.6,X9997.12,X9999.1,DX.bl,AGE,PTGENDER,PTEDUCAT,APOE4
1,4488,-5.392761e-02,-0.733068,-7.598195e-01,1.549274,0.377533,1.200031,8.224223e-01,-8.385415e-02,3.882343e-01,...,0.087784,-0.332293,-2.176513e-01,-5.999381e-01,0.599571,CN,72.6,Male,18.0,0.0
2,4516,-5.883357e-15,-0.166151,-2.344702e+00,0.689727,-2.226109,1.157316,-1.856949e-01,1.190070e+00,-2.001990e+00,...,2.146556,-0.554866,2.502040e-01,-6.632077e-01,0.939106,CN,71.4,Male,20.0,0.0
6,4464,8.927912e-02,-0.368645,1.013313e-14,1.461564,-1.217750,-0.712648,9.229370e-01,1.859116e-14,9.893535e-15,...,-0.569346,-2.283933,-4.113939e-15,-2.333899e-15,0.305773,CN,70.4,Male,16.0,1.0
8,4376,-5.392761e-02,0.692898,-7.301332e-01,0.188240,-2.728536,-1.678399,-1.146246e+00,5.669726e-01,-2.164290e+00,...,-0.196195,1.300902,1.850781e+00,1.704611e+00,0.893956,CN,76.5,Female,16.0,0.0
11,4900,1.249603e-01,0.381222,-1.301171e-01,0.282130,1.307359,-1.655064,5.285869e-15,-1.060616e+00,-1.621314e+00,...,-0.130122,-0.453512,-2.460426e-01,-1.480668e+00,-1.255297,CN,59.8,Female,18.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,459,2.264864e+00,0.950474,1.498339e+00,-1.097550,0.883534,0.487864,-1.436327e+00,-7.780868e-01,-2.222898e-01,...,1.246328,-1.543854,4.121701e-01,7.292964e-01,0.410947,CN,72.9,Male,16.0,0.0
699,321,2.396115e+00,0.804815,2.002638e+00,-0.541205,2.426492,1.263998,-1.072081e+00,-1.678248e-03,8.392906e-01,...,0.238227,-0.646348,-1.099783e+00,-1.249199e+00,-1.519290,AD,68.3,Male,18.0,2.0
702,866,-1.247061e+00,-0.482918,-1.149382e+00,1.601704,1.307359,-1.771907,2.860630e-01,2.182162e+00,-3.906470e-01,...,-0.093557,-0.605658,-2.176513e-01,-9.105316e-01,-0.417479,CN,80.0,Female,13.0,0.0
703,886,-2.883000e-01,-0.250031,-5.087374e-01,0.207056,1.307359,-1.145944,3.119406e-01,1.953338e+00,7.168462e-01,...,0.932221,-0.281914,-1.263553e+00,-1.890876e+00,-0.752526,CN,71.3,Female,15.0,0.0


In [17]:
exp_data['PTGENDER'] = exp_data['PTGENDER'].map({'Male': 1, 'Female': 0})
exp_data['DX.bl'] = exp_data['DX.bl'].map({'AD': 1, 'CN': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_data['PTGENDER'] = exp_data['PTGENDER'].map({'Male': 1, 'Female': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_data['DX.bl'] = exp_data['DX.bl'].map({'AD': 1, 'CN': 0})


In [18]:
X = exp_data[['AGE','PTGENDER','PTEDUCAT','APOE4','X4179.57','X10479.18','X19229.92', 'X9900.36', 'X12414.31', 'X10082.251', 'X4272.46', 'X14157.21', 'X16885.49', 'X24637.3', 'X8479.4', 'X19112.2', 'X15612.5', 'X5858.6', 'X12853.112', 'X4903.72','X8070.88', 'X6521.35', 'X8997.4', 'X9266.1', 'X19243.2', 'X20956.13', 'X22430.15', 'X16288.17', 'X25219.17', 'X24474.12','X15511.37', 'X3351.1','X15474.7', 'X12358.6', 'X18226.148', 'X12387.7', 'X22049.24', 'X12861.13', 'X21173.25', 'X22981.3','X15545.13', 'X10351.51', 'X14156.33', 'X21351.8', 'X13388.57', 'X14675.20', 'X9884.8', 'X24717.26', 'X23677.30', 'X3115.64', 'X21154.8', 'X11681.8', 'X5109.24', 'X4230.1','X16892.23', 'X6361.49', 'X19620.16', 'X21140.19', 'X5688.65', 'X8235.48', 'X20175.17', 'X3600.2', 'X14615.46', 'X17325.10', 'X5694.57', 'X3503.4', 'X9599.6', 'X21721.6', 'X8028.22', 'X10455.196', 'X17366.6', 'X6918.183', 'X15333.11', 'X4455.89']].values

X = exp_data[['AGE','PTGENDER','PTEDUCAT','APOE4']]
y = exp_data['DX.bl'].values

# Lasso
model = LogisticRegression(penalty='l1', solver='liblinear', C=10)

cv = StratifiedKFold(n_splits=5)
aucs = []

for train_idx, test_idx in cv.split(X, y):
    model.fit(X[train_idx], y[train_idx])
    y_prob = model.predict_proba(X[test_idx])[:, 1]
    auc_score = roc_auc_score(y[test_idx], y_prob)
    aucs.append(auc_score)

KeyError: "None of [Index([ 57,  62,  63,  64,  65,  66,  67,  68,  69,  70,\n       ...\n       295, 296, 297, 298, 299, 300, 301, 302, 303, 304],\n      dtype='int64', length=244)] are in the [columns]"

In [31]:
model.coef_

array([[-0.02483039,  0.        ,  0.07657738,  0.        ,  4.92419728,
         0.        ,  0.28208584, -0.46998779,  1.66829591,  2.22493609,
         0.        ,  0.34658492,  0.        ,  0.        ,  1.85169506,
         1.20989031,  0.        ,  0.        , -1.6491946 ,  0.        ,
         3.89164191, -0.26600531, -0.76252571,  0.69720352,  0.        ,
         0.        ,  0.        , -0.8641312 , -1.06161357,  0.04088091,
         0.        ,  2.24736453,  0.74811433,  1.74938545,  0.        ,
        -0.75789114,  0.        , -0.62026807,  2.17300037,  0.61375876,
         0.        , -2.42338691,  0.        ,  0.        ,  0.        ,
         0.        ,  0.48559816,  0.        , -0.69944131, -0.95102519,
        -1.36410134, -0.20708   ,  1.80839129, -0.79852989,  1.13150153,
        -3.72555681,  0.        ,  2.11644297, -0.52689764,  0.        ,
         0.        , -1.00432744, -3.58194398,  0.        ,  0.        ,
        -0.5174691 ,  0.        , -1.06320033, -0.0

In [32]:
print("AUC scores:", aucs)
print("Mean AUC:", np.mean(aucs))

AUC scores: [0.9673202614379084, 0.9673202614379084, 0.9393939393939394, 0.9588744588744589, 0.9686147186147186]
Mean AUC: 0.9603047279517867
