In [1]:
"""
Loads proteins from models and checks correlations of those proteins
"""

import pandas as pd
import os
import pickle
import time


ROOT_DIR = os.getcwd()  # .../somalogic
DAT_DIR = os.path.join(ROOT_DIR, 'results', 'datasets')  # .../somalogic/results/datasets
FINAL_MODEL_DIR = os.path.join(ROOT_DIR, 'results', 'models', 'final')
SEED = 0
PROT_LIST = os.path.join(ROOT_DIR, 'data', 'Somalogic_list_QC1.txt')
DAT_DIR = os.path.join(ROOT_DIR, 'results', 'datasets')  # .../somalogic/results/datasets

In [2]:
# get list of all proteins

with open(PROT_LIST) as f:
    protein = f.readlines()
prot_list = [x.strip() for x in protein]

file_path = DAT_DIR + '/' + 'infe_417-soma_data=normalized-nat_log_tranf=FALSE-standardize=FALSE-remove_outliers=FALSE.csv'
df = pd.read_csv(file_path, low_memory=False)
prot_df = df[prot_list]
print(prot_df.head())



   CRYBB2.10000.28  RAF1.10001.7  ZNF41.10003.15  ELK1.10006.25  \
0            640.3         334.2            94.3          773.6   
1            805.5         957.4           290.4         1088.5   
2            588.9         595.8            92.6          760.2   
3            470.7         522.7            91.2          923.1   
4            647.8         433.1           111.1          709.9   

   GUCA1A.10008.43  OCRL.10011.65  SPDEF.10012.5  SNAI2.10014.31  \
0            340.7        11418.6         1480.3           902.5   
1            450.7         1153.0         1179.6          1062.8   
2            322.6         1911.8         1452.7           824.5   
3            304.8         2412.0         1343.0           755.6   
4            311.6         1964.8         1694.3           919.8   

   KCNAB2.10015.119  POLH.10022.207  ...  KIAA1467.9981.18  PRSS35.9983.97  \
0             580.8           115.1  ...             389.3           295.7   
1             578.6           13

In [3]:
outcome = 'A2'

In [4]:
# get model proteins

a2_train_features_file = f'{FINAL_MODEL_DIR}/all_proteins-soma_data=normalized-nat_log_transf=True-standardize=True_infe_{outcome}_coef.pkl'
with open(a2_train_features_file, "rb") as fp:  # Unpickling
    a2_train_features = pickle.load(fp)

a2_features = [f for f in a2_train_features if
                             f not in ['age_at_diagnosis', 'sex_M', 'ProcessTime', 'SampleGroup']]

print(a2_features)
print(len(a2_features))

['BRD4.10043.31', 'CBS.10086.39', 'ZNRF3.10390.21', 'PSMB6.10530.8', 'CHI3L1.11104.13', 'CCDC64.11158.40', 'NAGPA.11208.15', 'TNR.11302.237', 'KRT7.11383.41', 'SETMAR.12462.20', 'ZNF134.12787.47', 'RELL1.13399.33', 'PTH1R.13470.43', 'SLC5A8.13691.10', 'RAP1GAP.13735.1', 'DAPK1.13955.33', 'SLC26A7.13979.3', 'IFNA7.14129.1', 'APOC1.15364.101', 'PLTP.15475.4', 'CLSTN1.15521.4', 'NID2.16060.99', 'RAB3A.17516.7', 'HSPH1.17704.74', 'ANXA13.17835.28', 'CCDC25.18264.12', 'COL2A1.18875.125', 'TK2.19114.8', 'GIMAP6.19302.7', 'SFTPD.19590.46', 'KIT.2475.1', 'MICA.2730.58', 'CDH5.2819.23', 'TIE1.2844.53', 'TBP.2875.15', 'CXCL13.3487.32', 'CKB.3800.71', 'CXCL10.4141.79', 'CCL7.4886.3', 'GP1BA.4990.87', 'PRDX5.5017.19', 'SIRT2.5030.52', 'STK17B.5249.31', 'TNNT2.5315.22', 'PRKCG.5476.66', 'PRSS37.5653.23', 'VASN.5682.13', 'TNXB.5698.60', 'IFNL3.5713.9', 'RNASE13.6424.2', 'PCYOX1.6431.68', 'CLEC6A.6911.103', 'PCDHGA12.6938.21', 'SEMA6C.7202.107', 'APLP1.7210.25', 'DNAJC4.8016.19', 'C18orf32.8236.8', '

In [5]:
# get correlations
start_time = time.time()

a2_corr = prot_df.corr(method='spearman')

print("--- %s seconds ---" % (time.time() - start_time))

--- 87.68095541000366 seconds ---


In [8]:
for i in range(len(a2_corr)):
    a2_corr.iloc[i, i] = 0  # set correlations for same protein to 0 (i.e. on diagonal)
a2_corr

Unnamed: 0,CRYBB2.10000.28,RAF1.10001.7,ZNF41.10003.15,ELK1.10006.25,GUCA1A.10008.43,OCRL.10011.65,SPDEF.10012.5,SNAI2.10014.31,KCNAB2.10015.119,POLH.10022.207,...,KIAA1467.9981.18,PRSS35.9983.97,YIPF6.9984.12,NPW.9986.14,LRRC24.9989.12,ZNF264.9993.11,ATP4B.9994.217,DUT.9995.6,UBXN4.9997.12,IRF6.9999.1
CRYBB2.10000.28,0.000000,-0.144330,0.027489,-0.235501,0.046085,-0.102753,0.449100,0.281074,0.213449,-0.227891,...,-0.172170,-0.038048,0.267035,-0.244359,0.083656,0.500784,0.420562,0.148778,-0.112835,-0.224992
RAF1.10001.7,-0.144330,0.000000,-0.383724,0.248620,-0.067600,0.174147,-0.068679,-0.427419,0.275710,-0.265282,...,0.106734,-0.394640,-0.545206,-0.018827,-0.503038,0.032426,-0.124075,0.419275,0.599457,0.703059
ZNF41.10003.15,0.027489,-0.383724,0.000000,-0.266019,0.275078,-0.257966,-0.056785,0.273533,-0.099437,0.355600,...,-0.071350,0.399248,0.486806,-0.015190,0.416579,-0.223917,-0.058918,-0.458048,-0.555093,-0.446234
ELK1.10006.25,-0.235501,0.248620,-0.266019,0.000000,-0.085530,0.165881,-0.162034,-0.311595,0.119684,-0.048118,...,-0.024845,-0.050337,-0.308177,0.162908,-0.237737,-0.221936,-0.199070,0.106401,0.234039,0.346683
GUCA1A.10008.43,0.046085,-0.067600,0.275078,-0.085530,0.000000,0.010522,-0.164781,0.053943,-0.138618,0.178047,...,0.153743,0.051060,0.175415,-0.009028,0.154967,-0.153805,-0.041773,-0.162788,-0.175423,-0.103800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF264.9993.11,0.500784,0.032426,-0.223917,-0.221936,-0.153805,0.023220,0.659544,0.280024,0.272968,-0.481509,...,-0.217277,-0.168957,0.111078,-0.316474,-0.074897,0.000000,0.558281,0.422424,0.185639,-0.077577
ATP4B.9994.217,0.420562,-0.124075,-0.058918,-0.199070,-0.041773,-0.041302,0.518268,0.293094,0.210378,-0.270000,...,-0.177042,-0.070631,0.258732,-0.341784,0.106641,0.558281,0.000000,0.235933,-0.009255,-0.177337
DUT.9995.6,0.148778,0.419275,-0.458048,0.106401,-0.162788,0.156828,0.296497,-0.160761,0.254027,-0.543527,...,-0.126490,-0.520927,-0.383055,-0.330230,-0.506727,0.422424,0.235933,0.000000,0.597618,0.476068
UBXN4.9997.12,-0.112835,0.599457,-0.555093,0.234039,-0.175423,0.253196,0.021621,-0.257940,0.188106,-0.445151,...,0.033110,-0.474462,-0.578429,0.004791,-0.563239,0.185639,-0.009255,0.597618,0.000000,0.719375


In [9]:
a2_df = a2_corr[a2_features]
a2_df

Unnamed: 0,BRD4.10043.31,CBS.10086.39,ZNRF3.10390.21,PSMB6.10530.8,CHI3L1.11104.13,CCDC64.11158.40,NAGPA.11208.15,TNR.11302.237,KRT7.11383.41,SETMAR.12462.20,...,ICAM5.8245.27,DAG1.8369.102,CDHR1.8372.29,SMIM9.8888.33,PRRG1.9008.6,ABO.9253.52,CBLN1.9313.27,ERP27.9333.59,VCAN.9561.21,GABBR2.9930.48
CRYBB2.10000.28,0.450145,-0.197322,-0.256792,0.057104,0.081933,0.148982,-0.312043,-0.228151,0.165370,0.323686,...,-0.068506,0.074627,0.371413,-0.281670,0.152472,-0.065308,-0.157831,0.009170,-0.129955,-0.098689
RAF1.10001.7,-0.093296,-0.057082,0.345815,-0.443929,-0.052914,-0.167480,0.010753,-0.247889,-0.245448,-0.224918,...,-0.029788,-0.381152,-0.451867,0.145161,0.215053,-0.126695,-0.155573,-0.375246,-0.259067,-0.105061
ZNF41.10003.15,0.101274,0.022939,-0.038067,0.422275,-0.097084,0.001981,-0.047359,0.236742,-0.050693,0.094251,...,0.040291,0.201023,0.311824,-0.144342,-0.151419,0.057531,0.110436,0.345429,0.265842,0.163235
ELK1.10006.25,-0.279575,0.068063,0.191470,-0.245021,0.016970,-0.100326,0.145648,-0.052439,-0.154225,-0.320309,...,0.017980,-0.185290,-0.313109,0.220988,-0.039668,0.081128,0.101834,-0.116707,-0.027577,-0.023370
GUCA1A.10008.43,0.005973,0.038360,0.062450,0.058964,-0.052009,-0.004887,0.123806,0.101164,-0.070769,0.079076,...,0.065988,0.098640,0.067024,-0.042159,-0.158295,-0.024043,0.123529,0.146041,0.193218,0.075348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF264.9993.11,0.511286,-0.267768,-0.270404,-0.126782,0.115463,0.154592,-0.363385,-0.395911,0.203124,0.439327,...,-0.102134,0.018817,0.280996,-0.294658,0.250871,-0.104608,-0.314377,-0.193168,-0.399383,-0.268567
ATP4B.9994.217,0.506659,-0.161506,-0.271832,-0.031255,0.053794,0.223913,-0.384123,-0.289189,0.186657,0.386692,...,-0.198938,0.055749,0.387162,-0.183482,0.217124,-0.078665,-0.286008,-0.046908,-0.186727,-0.204943
DUT.9995.6,0.221336,-0.191584,0.026943,-0.300207,0.150489,0.077478,-0.337872,-0.469700,0.282132,-0.031120,...,-0.195358,-0.304045,-0.167019,-0.122039,0.245992,-0.193583,-0.345385,-0.414855,-0.568947,-0.311732
UBXN4.9997.12,-0.064222,-0.095518,0.164711,-0.431910,0.061877,-0.050502,-0.075739,-0.296618,-0.014810,-0.189544,...,-0.001819,-0.276297,-0.424643,0.151342,0.227430,-0.120384,-0.165575,-0.375788,-0.440869,-0.149391


In [10]:
l = []

from prettytable import PrettyTable
t = PrettyTable(['Model Feature', 'Number of correlated proteins', 'Correlated proteins'])

for feature in a2_features:
    corrs = a2_df[feature]

    corr_prots = corrs[(corrs < -0.8) | (corrs > 0.8)]
    #print(feature, "\t", len(corr_prots), "\t", corr_prots)
    # l.append()
    t.add_row([feature, len(corr_prots), corr_prots])
print(t)

+------------------+-------------------------------+----------------------------------------------------+
|  Model Feature   | Number of correlated proteins |                Correlated proteins                 |
+------------------+-------------------------------+----------------------------------------------------+
|  BRD4.10043.31   |               0               |  Series([], Name: BRD4.10043.31, dtype: float64)   |
|   CBS.10086.39   |               0               |   Series([], Name: CBS.10086.39, dtype: float64)   |
|  ZNRF3.10390.21  |               0               |  Series([], Name: ZNRF3.10390.21, dtype: float64)  |
|  PSMB6.10530.8   |               0               |  Series([], Name: PSMB6.10530.8, dtype: float64)   |
| CHI3L1.11104.13  |               0               | Series([], Name: CHI3L1.11104.13, dtype: float64)  |
| CCDC64.11158.40  |               0               | Series([], Name: CCDC64.11158.40, dtype: float64)  |
|  NAGPA.11208.15  |               0          

### Infe A3

In [6]:
outcome = 'A3'

In [7]:
# get model proteins

a3_train_features_file = f'{FINAL_MODEL_DIR}/all_proteins-soma_data=normalized-nat_log_transf=True-standardize=True_infe_{outcome}_coef.pkl'
with open(a3_train_features_file, "rb") as fp:  # Unpickling
    a3_train_features = pickle.load(fp)

a3_features = [f for f in a3_train_features if
                             f not in ['age_at_diagnosis', 'sex_M', 'ProcessTime', 'SampleGroup']]

print(a3_features)
print(len(a3_features))

['MCL1.10396.6', 'PLOD3.10612.18', 'GCH1.11185.145', 'XDH.11264.33', 'TEAD4.12516.13', 'ILF3.12759.47', 'ZNF334.12763.69', 'SF1.12777.11', 'BAG4.12844.10', 'PVRL3.13557.3', 'RBL2.13565.2', 'NADK.13624.17', 'CSF1R.13682.47', 'FARS2.13941.82', 'AGAP3.13960.15', 'MAGI2.14066.49', 'PSMA4.14099.20', 'PNOC.15434.5', 'FBLN5.15585.304', 'AKR1C3.17377.1', 'MRAS.18297.8', 'AS3MT.18417.3', 'BLNK.19225.11', 'PFDN2.19243.2', 'SFTPD.19590.46', 'DCN.2666.53', 'CDH5.2819.23', 'PGF.3078.1', 'BSG.3585.54', 'CSF1.3738.54', 'TEK.3773.15', 'PFDN5.4271.75', 'GDF15.4374.45', 'IL1A.4851.25', 'CCL7.4886.3', 'CAPG.4968.50', 'MAPK12.5005.4', 'SIRT2.5030.52', 'NPPA.5443.62', 'SEMA3G.5628.21', 'MFAP4.5636.10', 'TMX3.5654.70', 'GALNT3.6593.5', 'NTN1.6649.51', 'HBZ.6919.3', 'STX3.7186.111', 'LRFN2.7200.4', 'APLP1.7210.25', 'ANKRD46.7851.30', 'MINOS1.7956.11', 'TMEM70.8074.32', 'RSPO3.8427.118', 'GPNMB.8606.39', 'IL21R.9366.54', 'GALP.9398.30', 'LST1.9531.24', 'PRR27.9607.39', 'ACTN2.9844.138']
58


In [13]:
# get correlations
start_time = time.time()

a3_corr = prot_df.corr(method='spearman')

print("--- %s seconds ---" % (time.time() - start_time))

--- 79.62524271011353 seconds ---


In [14]:
for i in range(len(a3_corr)):
    a3_corr.iloc[i, i] = 0  # set correlations for same protein to 0 (i.e. on diagonal)
a3_corr

Unnamed: 0,CRYBB2.10000.28,RAF1.10001.7,ZNF41.10003.15,ELK1.10006.25,GUCA1A.10008.43,OCRL.10011.65,SPDEF.10012.5,SNAI2.10014.31,KCNAB2.10015.119,POLH.10022.207,...,KIAA1467.9981.18,PRSS35.9983.97,YIPF6.9984.12,NPW.9986.14,LRRC24.9989.12,ZNF264.9993.11,ATP4B.9994.217,DUT.9995.6,UBXN4.9997.12,IRF6.9999.1
CRYBB2.10000.28,0.000000,-0.144330,0.027489,-0.235501,0.046085,-0.102753,0.449100,0.281074,0.213449,-0.227891,...,-0.172170,-0.038048,0.267035,-0.244359,0.083656,0.500784,0.420562,0.148778,-0.112835,-0.224992
RAF1.10001.7,-0.144330,0.000000,-0.383724,0.248620,-0.067600,0.174147,-0.068679,-0.427419,0.275710,-0.265282,...,0.106734,-0.394640,-0.545206,-0.018827,-0.503038,0.032426,-0.124075,0.419275,0.599457,0.703059
ZNF41.10003.15,0.027489,-0.383724,0.000000,-0.266019,0.275078,-0.257966,-0.056785,0.273533,-0.099437,0.355600,...,-0.071350,0.399248,0.486806,-0.015190,0.416579,-0.223917,-0.058918,-0.458048,-0.555093,-0.446234
ELK1.10006.25,-0.235501,0.248620,-0.266019,0.000000,-0.085530,0.165881,-0.162034,-0.311595,0.119684,-0.048118,...,-0.024845,-0.050337,-0.308177,0.162908,-0.237737,-0.221936,-0.199070,0.106401,0.234039,0.346683
GUCA1A.10008.43,0.046085,-0.067600,0.275078,-0.085530,0.000000,0.010522,-0.164781,0.053943,-0.138618,0.178047,...,0.153743,0.051060,0.175415,-0.009028,0.154967,-0.153805,-0.041773,-0.162788,-0.175423,-0.103800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF264.9993.11,0.500784,0.032426,-0.223917,-0.221936,-0.153805,0.023220,0.659544,0.280024,0.272968,-0.481509,...,-0.217277,-0.168957,0.111078,-0.316474,-0.074897,0.000000,0.558281,0.422424,0.185639,-0.077577
ATP4B.9994.217,0.420562,-0.124075,-0.058918,-0.199070,-0.041773,-0.041302,0.518268,0.293094,0.210378,-0.270000,...,-0.177042,-0.070631,0.258732,-0.341784,0.106641,0.558281,0.000000,0.235933,-0.009255,-0.177337
DUT.9995.6,0.148778,0.419275,-0.458048,0.106401,-0.162788,0.156828,0.296497,-0.160761,0.254027,-0.543527,...,-0.126490,-0.520927,-0.383055,-0.330230,-0.506727,0.422424,0.235933,0.000000,0.597618,0.476068
UBXN4.9997.12,-0.112835,0.599457,-0.555093,0.234039,-0.175423,0.253196,0.021621,-0.257940,0.188106,-0.445151,...,0.033110,-0.474462,-0.578429,0.004791,-0.563239,0.185639,-0.009255,0.597618,0.000000,0.719375


In [16]:
a3_df = a3_corr[a3_features]
a3_df

Unnamed: 0,MCL1.10396.6,PLOD3.10612.18,GCH1.11185.145,XDH.11264.33,TEAD4.12516.13,ILF3.12759.47,ZNF334.12763.69,SF1.12777.11,BAG4.12844.10,PVRL3.13557.3,...,ANKRD46.7851.30,MINOS1.7956.11,TMEM70.8074.32,RSPO3.8427.118,GPNMB.8606.39,IL21R.9366.54,GALP.9398.30,LST1.9531.24,PRR27.9607.39,ACTN2.9844.138
CRYBB2.10000.28,0.049073,0.203051,0.216453,-0.032989,0.167429,0.178656,-0.262306,0.374248,0.435387,0.175655,...,0.378554,-0.017460,0.027702,0.074668,-0.291183,0.254011,-0.057346,0.390110,-0.178025,0.057444
RAF1.10001.7,-0.072521,-0.059097,-0.256584,-0.265255,-0.333506,0.100415,-0.472912,0.191661,-0.137879,-0.322086,...,-0.309057,0.034259,-0.195446,0.004244,-0.110614,0.079631,-0.347667,0.110929,0.024521,0.019918
ZNF41.10003.15,-0.279690,-0.114025,-0.017168,0.256952,0.004932,-0.296128,0.409003,-0.262740,0.064436,0.337075,...,0.168092,-0.059200,0.357445,-0.167369,-0.065462,-0.298849,0.351484,-0.339177,0.187243,-0.270857
ELK1.10006.25,0.119350,-0.188220,-0.173855,-0.082743,-0.109929,0.052000,0.031872,-0.126669,-0.319488,-0.388065,...,-0.348173,-0.081074,-0.119905,-0.038940,0.115524,-0.014192,-0.130891,-0.126289,0.013177,-0.016042
GUCA1A.10008.43,-0.146477,-0.071547,-0.121150,0.025834,-0.017037,-0.058169,0.154704,0.011591,-0.022658,0.029562,...,0.071333,0.041651,0.080844,-0.020835,0.020482,-0.121269,0.109324,-0.244631,0.182972,-0.082501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF264.9993.11,0.212133,0.277791,0.267213,-0.148727,0.175433,0.373228,-0.557151,0.525371,0.549168,0.164418,...,0.457604,-0.000051,-0.032025,0.210405,-0.398643,0.501335,-0.198519,0.733859,-0.284549,0.241776
ATP4B.9994.217,0.096734,0.281349,0.284832,-0.023219,0.188925,0.169659,-0.320816,0.402691,0.550288,0.164161,...,0.471676,0.103109,-0.032657,0.087068,-0.354379,0.345150,-0.081342,0.526848,-0.323544,0.098792
DUT.9995.6,0.459127,0.335596,0.294328,-0.381205,0.120840,0.601438,-0.738899,0.510025,0.200432,-0.327281,...,0.057885,-0.039375,-0.224797,0.093768,-0.359144,0.695700,-0.468790,0.661597,-0.282444,0.374342
UBXN4.9997.12,0.297149,0.110262,-0.040454,-0.324797,-0.069694,0.307499,-0.571674,0.263960,-0.086133,-0.428367,...,-0.215978,-0.080851,-0.279710,0.121187,-0.123834,0.290065,-0.496485,0.408199,-0.139069,0.259166


In [17]:
l = []

from prettytable import PrettyTable
t = PrettyTable(['Model Feature', 'Number of correlated proteins', 'Correlated proteins'])

for feature in a3_features:
    corrs = a3_df[feature]

    corr_prots = corrs[(corrs < -0.8) | (corrs > 0.8)]
    #print(feature, "\t", len(corr_prots), "\t", corr_prots)
    # l.append()
    t.add_row([feature, len(corr_prots), corr_prots])
print(t)

+-----------------+-------------------------------+---------------------------------------------------+
|  Model Feature  | Number of correlated proteins |                Correlated proteins                |
+-----------------+-------------------------------+---------------------------------------------------+
|   MCL1.10396.6  |               0               |   Series([], Name: MCL1.10396.6, dtype: float64)  |
|  PLOD3.10612.18 |               0               |  Series([], Name: PLOD3.10612.18, dtype: float64) |
|  GCH1.11185.145 |               0               |  Series([], Name: GCH1.11185.145, dtype: float64) |
|   XDH.11264.33  |               0               |   Series([], Name: XDH.11264.33, dtype: float64)  |
|  TEAD4.12516.13 |               0               |  Series([], Name: TEAD4.12516.13, dtype: float64) |
|  ILF3.12759.47  |               0               |  Series([], Name: ILF3.12759.47, dtype: float64)  |
| ZNF334.12763.69 |               1               |             

In [69]:
l = [("Model Feature Proteins", "Correlated Proteins", "Spearman r > 0.8, r < -0.8")]

for feature in a3_features:
    corrs = a3_df[feature]
    corr_prots = corrs[(corrs < -0.8) | (corrs > 0.8)]
   
    if len(corr_prots) > 0:
        for i in range(len(corr_prots)):
            item = (feature, corr_prots.index[i], corr_prots.values[i])
            l.append(item)
print(l)

[('Model Feature Proteins', 'Correlated Proteins', 'Spearman r > 0.8, r < -0.8'), ('ZNF334.12763.69', 'ISL1.11549.6', 0.826927887316184), ('BAG4.12844.10', 'PLA2G5.2449.1', 0.853453169284582), ('PFDN2.19243.2', 'CILP.5717.2', 0.8010751980829495), ('DCN.2666.53', 'EFNB3.2514.65', 0.8212387215960145), ('DCN.2666.53', 'PRKCZ.2645.54', 0.8214077298370384), ('DCN.2666.53', 'SOD1.2794.60', 0.8500562960888802), ('DCN.2666.53', 'MMP17.2838.53', 0.8090586758525531), ('DCN.2666.53', 'MFRP.3685.53', 0.8185378318400062), ('DCN.2666.53', 'CELF2.7245.2', 0.8404147151080439), ('CDH5.2819.23', 'EPHA4.16288.17', 0.8118570165339632), ('BSG.3585.54', 'CTSG.2431.17', 0.8232002247706051), ('BSG.3585.54', 'GFRA3.2505.49', 0.8078057553153299), ('BSG.3585.54', 'EFNB3.2514.65', 0.8510849977404206), ('BSG.3585.54', 'TNFRSF8.2605.49', 0.8099778237209915), ('BSG.3585.54', 'PRKCZ.2645.54', 0.8232230479744691), ('BSG.3585.54', 'PECAM1.2695.25', 0.8233805497164888), ('BSG.3585.54', 'SOD1.2794.60', 0.8217394088923079

### Write results to csv file 

In [70]:
file_name = os.path.join(ROOT_DIR, 'results', 'models', 'infe_a3_lasso_protein_correlations.csv')

with open(file_name, 'w') as f:
    writer = csv.writer(f , lineterminator='\n')
    for tup in l:
        writer.writerow(tup)


### Check all protein correlations

In [72]:
# Here are some plot styles, which primarily make this plot larger for display
# purposes.
import matplotlib.pyplot as plt
font = 30 
parameters = {'axes.labelsize': font*2,
              'legend.fontsize': font*2,
              'xtick.labelsize':font,
              'ytick.labelsize':font,
          'axes.titlesize': font*2}
plt.rcParams.update(parameters)

In [None]:
# All samples
import seaborn as sns
import time

figure_size = (50, 42)

# start_time = time.time()

# plt.subplots(figsize=figure_size)

# df = data[proteins]
# corr = df.corr(method='spearman').abs()
# sns.heatmap(corr, cmap='Blues')

# plt.title(f"{outcome} - All Samples (All proteins)")
# plt.ylabel(r'$\leftarrow$ Increasing p value')
# plt.xlabel(r'Increasing p value $\rightarrow $')
# plt.savefig(f'{outcome}all_samples_spearman_correlation.png')  # where i is also the order of p-values from smallest p to largest
# plt.close()

# print("--- %s seconds ---" % (time.time() - start_time))


# # Cases
# start_time = time.time()


# plt.subplots(figsize=figure_size)

# df = cases[proteins]
# corr = df.corr(method='spearman').abs()
# sns.heatmap(corr, cmap='Blues')  

# plt.title(f"{outcome} - Cases (All proteins)")
# plt.ylabel(r'$\leftarrow$ Increasing p value')
# plt.xlabel(r'Increasing p value $\rightarrow $')
# plt.savefig(f'{outcome}cases_spearman_correlation.png')  # where i is also the order of p-values from smallest p to largest
# plt.close()
# print("--- %s seconds ---" % (time.time() - start_time))


# Controls
start_time = time.time()

plt.subplots(figsize=figure_size)

df = controls[proteins]
corr = df.corr(method='spearman').abs()
sns.heatmap(corr, cmap='Blues')

plt.title(f"{outcome} - Controls (All proteins)")
plt.ylabel(r'$\leftarrow$ Increasing p value')
plt.xlabel(r'Increasing p value $\rightarrow $')
plt.savefig(f'{outcome}controls_spearman_correlation.png')  # where i is also the order of p-values from smallest p to largest
plt.close()

print("--- %s seconds ---" % (time.time() - start_time))

### Save Spearman correlation matrix for Infe A2 lasso model and Infe A3 lasso mdoel proteins

In [19]:
# A2 
a2_prot_df = df[a2_features]
file = os.path.join(ROOT_DIR, 'results', 'datasets', 'infe_A2_lasso_proteins_raw.csv')
a2_prot_df.to_csv(file)
a2_prot_df

Unnamed: 0,BRD4.10043.31,CBS.10086.39,ZNRF3.10390.21,PSMB6.10530.8,CHI3L1.11104.13,CCDC64.11158.40,NAGPA.11208.15,TNR.11302.237,KRT7.11383.41,SETMAR.12462.20,...,ICAM5.8245.27,DAG1.8369.102,CDHR1.8372.29,SMIM9.8888.33,PRRG1.9008.6,ABO.9253.52,CBLN1.9313.27,ERP27.9333.59,VCAN.9561.21,GABBR2.9930.48
0,714.2,1266.5,268.5,293.2,15144.1,1355.6,4290.0,923.9,2809.8,644.8,...,2221.9,450.5,751.6,502.2,1141.4,358.6,1056.0,246.2,184.4,217.2
1,597.6,1229.1,1021.0,270.8,2644.8,819.6,2420.3,997.8,1001.6,558.5,...,2137.6,307.4,560.4,538.3,1442.3,2550.1,949.5,251.6,216.6,660.7
2,652.7,1723.4,528.7,355.0,22017.6,1243.0,4280.7,895.2,1042.1,636.9,...,1743.5,324.0,681.7,572.3,1313.0,3961.8,701.5,278.9,184.5,259.0
3,521.5,1640.1,406.4,298.7,7157.7,890.4,7670.9,1083.0,718.8,492.7,...,2081.0,363.7,620.1,681.6,1242.9,413.3,1447.9,222.8,215.2,285.9
4,694.0,1242.1,467.1,260.3,13614.7,1846.6,2908.3,984.8,1803.6,608.1,...,1308.4,373.7,702.7,524.3,1388.8,5062.2,528.4,556.3,191.7,241.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,492.2,1599.5,413.6,392.9,5535.8,797.5,5159.9,1400.9,1252.8,543.9,...,1651.9,462.0,714.9,508.9,1145.2,582.7,1127.1,303.8,253.3,357.6
413,508.8,2044.7,363.1,338.0,50123.6,1700.0,6238.1,1337.8,775.3,562.4,...,1928.3,521.8,724.9,686.2,1200.5,504.9,1865.5,262.2,252.5,312.1
414,557.8,1453.1,307.3,268.5,19621.5,806.6,4994.6,1186.6,1062.0,553.9,...,1549.3,476.6,711.2,588.1,1141.5,24129.6,1245.4,303.1,237.0,450.7
415,466.5,1535.7,418.0,289.9,52993.1,962.5,6298.9,1566.6,931.1,518.5,...,2420.2,535.1,644.6,757.9,1058.4,21633.1,1935.0,297.0,220.2,88106.6


In [14]:
a2_prot_corr = a2_prot_df.corr(method='spearman')
a2_prot_corr

Unnamed: 0,BRD4.10043.31,CBS.10086.39,ZNRF3.10390.21,PSMB6.10530.8,CHI3L1.11104.13,CCDC64.11158.40,NAGPA.11208.15,TNR.11302.237,KRT7.11383.41,SETMAR.12462.20,...,ICAM5.8245.27,DAG1.8369.102,CDHR1.8372.29,SMIM9.8888.33,PRRG1.9008.6,ABO.9253.52,CBLN1.9313.27,ERP27.9333.59,VCAN.9561.21,GABBR2.9930.48
BRD4.10043.31,1.000000,-0.302109,-0.215776,0.157335,0.152538,0.245350,-0.552769,-0.367055,0.178869,0.495273,...,-0.075383,0.166285,0.554591,-0.354917,0.162097,-0.154109,-0.278665,0.031950,-0.216599,-0.218221
CBS.10086.39,-0.302109,1.000000,0.053841,-0.050949,-0.291743,-0.109091,0.361959,0.272999,-0.157978,-0.230708,...,-0.132131,-0.165302,-0.155022,0.240750,-0.063069,0.116469,0.075377,0.034428,0.212279,0.209584
ZNRF3.10390.21,-0.215776,0.053841,1.000000,-0.086370,0.006614,-0.296971,0.032943,0.012617,-0.194461,-0.281828,...,-0.040147,-0.269832,-0.291872,0.247702,0.012338,-0.017670,0.004765,-0.169838,0.017860,0.077774
PSMB6.10530.8,0.157335,-0.050949,-0.086370,1.000000,0.038406,0.048889,-0.138733,0.091868,0.055728,0.089474,...,0.015835,0.234922,0.325310,-0.186882,-0.062565,0.003555,0.034840,0.341061,0.131619,0.153034
CHI3L1.11104.13,0.152538,-0.291743,0.006614,0.038406,1.000000,0.170692,-0.306009,-0.198248,0.260636,0.017147,...,0.080882,0.188101,0.105711,-0.158138,-0.035912,-0.062920,-0.091128,0.009954,-0.095209,-0.122266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABO.9253.52,-0.154109,0.116469,-0.017670,0.003555,-0.062920,-0.127999,0.130270,0.193759,-0.084631,-0.145346,...,0.008950,0.011622,-0.037289,-0.015803,-0.014243,1.000000,0.145791,0.118201,0.116463,0.168734
CBLN1.9313.27,-0.278665,0.075377,0.004765,0.034840,-0.091128,-0.104992,0.326277,0.364149,-0.152482,-0.116248,...,0.302979,0.147550,-0.055548,0.099342,-0.179223,0.145791,1.000000,0.129346,0.294477,0.050586
ERP27.9333.59,0.031950,0.034428,-0.169838,0.341061,0.009954,0.079131,0.010984,0.183828,0.004933,0.162209,...,0.082017,0.295850,0.215704,-0.064747,-0.155289,0.118201,0.129346,1.000000,0.235412,0.119252
VCAN.9561.21,-0.216599,0.212279,0.017860,0.131619,-0.095209,-0.055203,0.243364,0.404178,-0.127617,-0.021721,...,0.106701,0.151983,0.076061,0.179958,-0.242044,0.116463,0.294477,0.235412,1.000000,0.218894


In [16]:
file = os.path.join(ROOT_DIR, 'results', 'datasets', 'infe_A2_lasso_proteins_spearman.csv')
a2_prot_corr.to_csv(file)

In [20]:
# A3
a3_prot_df = df[a3_features]
file = os.path.join(ROOT_DIR, 'results', 'datasets', 'infe_A3_lasso_proteins_raw.csv')
a3_prot_df.to_csv(file)
a3_prot_df

Unnamed: 0,MCL1.10396.6,PLOD3.10612.18,GCH1.11185.145,XDH.11264.33,TEAD4.12516.13,ILF3.12759.47,ZNF334.12763.69,SF1.12777.11,BAG4.12844.10,PVRL3.13557.3,...,ANKRD46.7851.30,MINOS1.7956.11,TMEM70.8074.32,RSPO3.8427.118,GPNMB.8606.39,IL21R.9366.54,GALP.9398.30,LST1.9531.24,PRR27.9607.39,ACTN2.9844.138
0,702.2,2649.3,1027.5,133.7,616.4,1074.9,1287.1,747.3,625.6,546.3,...,994.7,333.4,202.6,1351.5,426.0,2332.9,359.0,7394.9,680.9,1122.0
1,398.7,2892.6,765.5,159.5,499.3,616.1,938.5,1079.3,622.8,530.0,...,860.6,846.7,307.6,347.0,330.3,2016.4,377.9,4240.7,730.8,2907.6
2,332.4,6023.8,1602.4,127.9,511.1,683.0,1020.8,985.8,549.8,592.7,...,1002.1,2269.0,194.6,452.1,300.4,2445.6,404.1,8799.8,656.9,912.9
3,212.5,2118.6,588.9,154.5,441.2,488.0,1919.7,626.2,482.3,568.0,...,928.5,2868.9,165.6,523.5,416.2,1732.7,433.5,3554.1,840.7,7941.0
4,275.3,4222.9,808.3,133.2,483.2,541.3,1128.2,686.1,515.5,609.9,...,1083.4,3013.1,585.3,522.7,279.1,1932.2,397.5,9051.0,670.4,3115.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,339.1,2014.6,841.0,159.6,553.7,485.8,2613.9,509.6,419.8,1599.2,...,1065.6,360.4,231.3,492.4,552.2,1440.4,492.4,1195.6,848.9,1417.6
413,209.1,2007.1,751.5,179.8,560.6,473.7,2910.9,534.9,459.3,607.2,...,863.5,6724.7,273.5,750.1,692.2,1043.2,529.0,2276.6,757.0,3286.7
414,263.8,3025.1,722.9,165.7,489.5,440.7,1773.4,563.5,509.7,651.9,...,943.3,446.8,187.6,785.2,595.3,1296.8,445.2,3444.4,700.4,768.1
415,375.4,901.2,672.9,225.2,514.7,566.2,2216.7,609.2,370.9,770.0,...,814.4,2572.2,227.4,605.7,820.0,1376.9,476.3,1357.2,858.9,2489.4


In [15]:
a3_prot_corr = a3_prot_df.corr(method='spearman')
a3_prot_corr

Unnamed: 0,MCL1.10396.6,PLOD3.10612.18,GCH1.11185.145,XDH.11264.33,TEAD4.12516.13,ILF3.12759.47,ZNF334.12763.69,SF1.12777.11,BAG4.12844.10,PVRL3.13557.3,...,ANKRD46.7851.30,MINOS1.7956.11,TMEM70.8074.32,RSPO3.8427.118,GPNMB.8606.39,IL21R.9366.54,GALP.9398.30,LST1.9531.24,PRR27.9607.39,ACTN2.9844.138
MCL1.10396.6,1.0,0.209211,0.343886,-0.134721,0.398498,0.568397,-0.295168,0.303896,0.104871,-0.214052,...,0.091287,-0.097029,-0.1053,0.176569,0.022998,0.492259,-0.257171,0.393851,-0.237571,0.373601
PLOD3.10612.18,0.209211,1.0,0.466961,-0.143288,0.130531,0.194602,-0.302044,0.306382,0.2483,-0.020816,...,0.322338,-0.015676,-0.079426,0.09399,-0.225242,0.385944,-0.111156,0.395815,-0.197909,0.215929
GCH1.11185.145,0.343886,0.466961,1.0,0.0245,0.277688,0.230378,-0.237345,0.209162,0.283208,0.111864,...,0.271151,0.113509,-0.122849,-0.061302,-0.213994,0.435253,0.000131,0.339583,-0.1997,0.169826
XDH.11264.33,-0.134721,-0.143288,0.0245,1.0,-0.048933,-0.232464,0.381528,-0.193946,-0.047792,0.265038,...,0.016985,0.138579,0.109683,-0.149465,0.118857,-0.227337,0.37762,-0.285494,0.002555,-0.146146
TEAD4.12516.13,0.398498,0.130531,0.277688,-0.048933,1.0,0.360387,-0.134233,0.289055,0.216089,0.079264,...,0.331751,-0.103976,0.077411,0.227308,0.048481,0.220553,-0.117399,0.208421,-0.130723,0.401752
ILF3.12759.47,0.568397,0.194602,0.230378,-0.232464,0.360387,1.0,-0.539656,0.534237,0.264708,-0.239763,...,0.113248,-0.123706,-0.056728,0.284492,-0.123351,0.643857,-0.408541,0.475941,-0.217927,0.374421
ZNF334.12763.69,-0.295168,-0.302044,-0.237345,0.381528,-0.134233,-0.539656,1.0,-0.56629,-0.298281,0.164534,...,-0.189854,0.103482,0.122298,-0.208627,0.382158,-0.589611,0.508803,-0.695378,0.261974,-0.370625
SF1.12777.11,0.303896,0.306382,0.209162,-0.193946,0.289055,0.534237,-0.56629,1.0,0.42654,-0.055608,...,0.341754,-0.030201,-0.046471,0.226995,-0.194813,0.526893,-0.307555,0.560648,-0.209112,0.316399
BAG4.12844.10,0.104871,0.2483,0.283208,-0.047792,0.216089,0.264708,-0.298281,0.42654,1.0,0.130983,...,0.531105,-0.04066,0.019697,0.113934,-0.391151,0.308398,-0.088688,0.550314,-0.429127,0.115809
PVRL3.13557.3,-0.214052,-0.020816,0.111864,0.265038,0.079264,-0.239763,0.164534,-0.055608,0.130983,1.0,...,0.325398,0.12865,0.176825,-0.1362,-0.082254,-0.159323,0.423617,-0.066237,0.136368,-0.212377


In [17]:
file = os.path.join(ROOT_DIR, 'results', 'datasets', 'infe_A3_lasso_proteins_spearman.csv')
a3_prot_corr.to_csv(file)