In [32]:
import os
import pandas as pd
import pickle 

# Project root: the current working directory at the time this cell runs.
# The data and results paths built later in the notebook are joined onto it.
ROOT_DIR = os.path.abspath(os.curdir)


### McGill Proteins

In [26]:
# Load the McGill protein list: a plain-text file read line by line.
TRAIN_PROT_LIST = os.path.join(ROOT_DIR, 'data', 'Somalogic_list_QC1.txt')
with open(TRAIN_PROT_LIST) as f:
    protein = list(f)  # raw lines, trailing newlines still attached

# Trim surrounding whitespace (including the newline) from every line.
mcgill_prot_list = list(map(str.strip, protein))
print(mcgill_prot_list[:4])  # print first 4 proteins
print(f"\nMcGill num. of proteins: {len(mcgill_prot_list)}")

['CRYBB2.10000.28', 'RAF1.10001.7', 'ZNF41.10003.15', 'ELK1.10006.25']

McGill num. of proteins: 4984


### Mt. Sinai Proteins

In [27]:
# Load the Mt. Sinai table; the protein identifiers are read from column 'c'.
TEST_PROT_LIST = os.path.join(ROOT_DIR, 'data', 'mssm_protein_list.csv')
test_sum_stats = pd.read_csv(TEST_PROT_LIST, low_memory=False)
mssm_prot_list = test_sum_stats.loc[:, 'c'].tolist()
print(mssm_prot_list[:4])  # preview of the first 4 identifiers
print(f"\nMt. Sinai num. of proteins: {len(mssm_prot_list)}")

['CRYBB2.10000.28', 'RAF1.10001.7', 'ZNF41.10003.15', 'ELK1.10006.25']

Mt. Sinai num. of proteins: 4695


In [16]:
# Identifiers present in both cohorts. The result comes from a set, so the
# order of `common_prot` is arbitrary; it is only used for counting/membership.
mcgill_prot_set = set(mcgill_prot_list)
common_prot = list(mcgill_prot_set.intersection(mssm_prot_list))
n_common_prot = len(common_prot)
print(f"Common proteins in McGill and Mt. Sinai proteins: {n_common_prot}")

Common proteins in McGill and Mt. Sinai proteins: 4663


In [29]:
# Mt. Sinai identifiers that have no counterpart in the McGill list.
# Membership tests against a list scan the whole list each time, which made the
# original filter quadratic; build a set once so every lookup is constant time.
# The comprehension still walks `mssm_prot_list`, so its order is preserved.
common_prot_lookup = set(common_prot)
prot_in_mssm_not_in_intersect = [item for item in mssm_prot_list if item not in common_prot_lookup]
print(f"\nNum. of Mt. Sinai proteins not present in the intersection with McGill proteins: {len(prot_in_mssm_not_in_intersect)}")

prot_in_mssm_not_in_intersect


Num. of Mt. Sinai proteins not present in the intersection with McGill proteins: 32


['ERVV.1.12531.5',
 'KRTAP2.4.14615.46',
 'GFP.16535.61',
 'HCE001796.2171.12',
 'HCE003167.2178.55',
 'HCE000414.2194.91',
 'HCE003183.2229.54',
 'HCE004333.2249.25',
 'HCE004359.2273.34',
 'HCE004331.2288.7',
 'HCE003300.2305.52',
 'HCE000483.2312.13',
 'HCE004152.2359.65',
 'HCE000342.2430.52',
 'HCE000104.2513.7',
 'IGHG1.IGHG2.IGHG3.IGHG4.IGK@.IGL@.2744.57',
 'Human.virus.2769.3',
 'IGHM.IGJ.IGK@.IGL@.3069.52',
 'mdh.3507.1',
 'nodH.3721.5',
 'GFP.3849.56',
 'MELT.4584.5',
 'Human.virus.4792.51',
 'IGHD.IGK@.IGL@.4916.2',
 'non.human.8443.9',
 'magainins.8444.3',
 'magainins.8444.46',
 'MELT.8445.184',
 'MELT.8445.54',
 'apcA.apcB.8471.53',
 'mdh.8481.26',
 'mdh.8481.44']

In [19]:
# Number of distinct McGill identifiers (matches the list length only when the
# list contains no duplicates).
len(set(mcgill_prot_list))

4984

### Checking model variables


In [36]:
# Directory holding the final fitted-model artifacts.
FINAL_MODEL_DIR = os.path.join(os.path.join(ROOT_DIR, 'results', 'models'), 'final')

# Settings that are interpolated verbatim into the coefficient file names,
# which is why the flags are the strings 'True' rather than booleans.
soma_data = 'normalized'
nat_log_transf = standardize = 'True'

### Infe A2 

In [78]:
# Dataset tag and outcome label used to select the coefficient files below.
data, outcome = 'infe', 'A2'

In [79]:
X_choice = 'baseline'

# The file name encodes the feature set, preprocessing settings, dataset and outcome.
model_coef_file = f'{FINAL_MODEL_DIR}/{X_choice}-soma_data={soma_data}-nat_log_transf={nat_log_transf}-standardize={standardize}_{data}_{outcome}_coef.pkl'
# Open through a context manager so the file handle is closed right after loading
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project's own pipeline.
with open(model_coef_file, 'rb') as coef_fh:
    model_coef = pickle.load(coef_fh)
model_coef

{'age_at_diagnosis': 0.008316909126226835,
 'sex_M': 0.4429685557483413,
 'ProcessTime': -0.034606220285538405,
 'SampleGroup': -0.34074553570558064}

In [80]:
X_choice = 'all_proteins'

# The file name encodes the feature set, preprocessing settings, dataset and outcome.
model_coef_file = f'{FINAL_MODEL_DIR}/{X_choice}-soma_data={soma_data}-nat_log_transf={nat_log_transf}-standardize={standardize}_{data}_{outcome}_coef.pkl'
# Open through a context manager so the file handle is closed right after loading
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project's own pipeline.
with open(model_coef_file, 'rb') as coef_fh:
    model_coef = pickle.load(coef_fh)
# NOTE(review): this reports the number of stored keys; it equals the number of
# nonzero coefficients only if the pickle already omits zero entries - confirm.
print(f"Num. of nonzero coefficients: {len(model_coef)}")
model_coef

Num. of nonzero coefficients: 69


{'age_at_diagnosis': -0.028010451631391712,
 'ProcessTime': -0.013517750413849242,
 'BRD4.10043.31': 0.18724844733266266,
 'CBS.10086.39': -0.08428213248506333,
 'ZNRF3.10390.21': -0.010210969864161862,
 'PSMB6.10530.8': 0.01857128228956861,
 'CHI3L1.11104.13': 0.013348033555985355,
 'CCDC64.11158.40': 0.04724104778792251,
 'NAGPA.11208.15': -0.03553800941896891,
 'TNR.11302.237': -0.11657627610329259,
 'KRT7.11383.41': 0.25174161262805245,
 'SETMAR.12462.20': 0.07157894672087094,
 'ZNF134.12787.47': -0.07727172111392956,
 'RELL1.13399.33': -0.0680972531579681,
 'PTH1R.13470.43': 0.09375837768184384,
 'SLC5A8.13691.10': -0.10089457365003755,
 'RAP1GAP.13735.1': -0.05767067153590051,
 'DAPK1.13955.33': 0.016414965225371704,
 'SLC26A7.13979.3': 0.027914361037750756,
 'IFNA7.14129.1': 0.15475623937602687,
 'APOC1.15364.101': -0.034580717860192706,
 'PLTP.15475.4': -0.057190334776238416,
 'CLSTN1.15521.4': -0.0017604815280184143,
 'NID2.16060.99': -0.0917440019953603,
 'RAB3A.17516.7': 0.0

In [81]:
coefficients = list(model_coef.keys())

# leave only proteins
# The baseline model stores the sex covariate under the key 'sex_M', so that
# key is excluded as well; plain 'sex' is kept in case older pickles used it.
non_protein_covariates = {'age_at_diagnosis', 'sex', 'sex_M', 'ProcessTime', 'SampleGroup'}
coefficients = [coef for coef in coefficients if coef not in non_protein_covariates]

# list of model proteins that are not in the McGill / Mt. Sinai intersection
common_prot_lookup = set(common_prot)  # set lookup instead of scanning the list per item
[coef for coef in coefficients if coef not in common_prot_lookup]

['SLC5A8.13691.10', 'RAB3A.17516.7']

### Infe A3

In [82]:
# Dataset tag and outcome label used to select the coefficient files below.
data, outcome = 'infe', 'A3'

In [83]:
X_choice = 'baseline'

# The file name encodes the feature set, preprocessing settings, dataset and outcome.
model_coef_file = f'{FINAL_MODEL_DIR}/{X_choice}-soma_data={soma_data}-nat_log_transf={nat_log_transf}-standardize={standardize}_{data}_{outcome}_coef.pkl'
# Open through a context manager so the file handle is closed right after loading
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project's own pipeline.
with open(model_coef_file, 'rb') as coef_fh:
    model_coef = pickle.load(coef_fh)
model_coef

{'age_at_diagnosis': 0.016121598272174888,
 'ProcessTime': -0.030220663981670053,
 'SampleGroup': -0.5221710430717814}

In [84]:
X_choice = 'all_proteins'

# The file name encodes the feature set, preprocessing settings, dataset and outcome.
model_coef_file = f'{FINAL_MODEL_DIR}/{X_choice}-soma_data={soma_data}-nat_log_transf={nat_log_transf}-standardize={standardize}_{data}_{outcome}_coef.pkl'
# Open through a context manager so the file handle is closed right after loading
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project's own pipeline.
with open(model_coef_file, 'rb') as coef_fh:
    model_coef = pickle.load(coef_fh)
# NOTE(review): this reports the number of stored keys; it equals the number of
# nonzero coefficients only if the pickle already omits zero entries - confirm.
print(f"Num. of nonzero coefficients: {len(model_coef)}")
model_coef

Num. of nonzero coefficients: 60


{'age_at_diagnosis': 0.006405440081076953,
 'ProcessTime': -0.02506519686881692,
 'MCL1.10396.6': 0.06202957629493103,
 'PLOD3.10612.18': 0.05858899371898752,
 'GCH1.11185.145': 0.05125036818928842,
 'XDH.11264.33': -0.009661623887352535,
 'TEAD4.12516.13': 0.0207953006884026,
 'ILF3.12759.47': 0.039737168716948275,
 'ZNF334.12763.69': -0.019873170345061114,
 'SF1.12777.11': 0.0003402063353080685,
 'BAG4.12844.10': 0.06390214484270856,
 'PVRL3.13557.3': -0.008148267053820121,
 'RBL2.13565.2': 0.04650629783339722,
 'NADK.13624.17': 0.058913312092893654,
 'CSF1R.13682.47': -0.018302450030896377,
 'FARS2.13941.82': -0.04523178842836486,
 'AGAP3.13960.15': -0.04311789091375294,
 'MAGI2.14066.49': 0.008863659577250323,
 'PSMA4.14099.20': -0.08895092813376969,
 'PNOC.15434.5': -0.03406877463136182,
 'FBLN5.15585.304': 0.03869628536598177,
 'AKR1C3.17377.1': -0.0009247570392170661,
 'MRAS.18297.8': 0.014500933622310563,
 'AS3MT.18417.3': -0.0010340910735489937,
 'BLNK.19225.11': 0.03693503601

In [85]:
coefficients = list(model_coef.keys())

# leave only proteins
# Elsewhere in this notebook the sex covariate is stored under the key 'sex_M',
# so that key is excluded as well; plain 'sex' is kept in case older pickles used it.
non_protein_covariates = {'age_at_diagnosis', 'sex', 'sex_M', 'ProcessTime', 'SampleGroup'}
coefficients = [coef for coef in coefficients if coef not in non_protein_covariates]

# list of model proteins that are not in the McGill / Mt. Sinai intersection
common_prot_lookup = set(common_prot)  # set lookup instead of scanning the list per item
[coef for coef in coefficients if coef not in common_prot_lookup]

[]