In [22]:
import random
from pathlib import Path
from pprint import pprint

import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Notebook-wide pandas display limits: truncate long/wide frames so printed
# previews stay readable in the cell output.
for _option, _value in (
    ("display.max_rows", 20),
    ("display.max_columns", 10),
    ("display.max_colwidth", 50),
    ("display.width", 110),
):
    pd.set_option(_option, _value)

In [23]:
import warnings

warnings.simplefilter("ignore", RuntimeWarning)

In [24]:
# Local, repository-relative folders: `data_dir_src` is read from, `data_dir_dst` is written to.
# (presumably the source folder is produced by an earlier EDA notebook — confirm)
data_dir_src, data_dir_dst = Path('data/eda'), Path('data/wgcna')

# Create the output folder (and any missing parents) up front; a no-op when it already exists.
data_dir_dst.mkdir(exist_ok=True, parents=True)

In [25]:
# Load the expression table (rows are sample barcodes, columns are genes — see the preview below)
# and print a five-row preview followed by its (n_samples, n_genes) shape.
expression = pd.read_parquet(data_dir_src.joinpath('expression.parquet'))
print(expression.head(), expression.shape, sep='\n')

                 ARHGEF10L    HIF3A    RNF10    RNF11    RNF13  ...     PTRF   BCL6B    GSTK1    SELP  \
TCGA-69-7978-01     9.9898   4.2598  10.3657  11.1718  10.5897  ...  12.7565  8.2668  11.2400  6.1209   
TCGA-62-8399-01    10.4257  11.6239  11.5489  11.0200   9.2843  ...  12.2100  8.5437  10.3491  8.6398   
TCGA-78-7539-01     9.6264   9.1362  11.6692  10.4679  10.4649  ...  10.6498  6.1814  11.1659  6.0970   
TCGA-73-4658-01     9.2078   5.0288  11.6209  11.3414  10.9376  ...  13.0036  8.9786  10.6777  8.4187   
TCGA-44-6775-01    10.0039   4.0573  11.1721  11.0969  10.9337  ...  12.7727  7.5911  10.3340  7.3311   

                    SELS  
TCGA-69-7978-01   9.8977  
TCGA-62-8399-01   9.7315  
TCGA-78-7539-01  10.3540  
TCGA-73-4658-01  10.3142  
TCGA-44-6775-01  10.0039  

[5 rows x 16104 columns]
(506, 16104)


# correlation matrix

In [26]:
# Gene-by-gene Pearson correlation matrix of the expression table.
# It is expensive to compute and large on disk, so it is cached as parquet and reused.
cor_file_path = data_dir_dst / 'correlation.parquet'
cor = pd.read_parquet(cor_file_path) if cor_file_path.exists() else None

# Only trust the cache when its row and column labels are exactly the current gene
# columns, in the same order. A cache written for a different expression table would
# otherwise be reused silently and every gene label downstream would be wrong.
# (This checks labels only; a table with the same genes but changed values is not detected.)
cache_is_valid = (
    cor is not None
    and cor.index.equals(expression.columns)
    and cor.columns.equals(expression.columns)
)

if not cache_is_valid:
    cor = pd.DataFrame(
        # rowvar=False: each column (gene) is a variable, each row (sample) an observation
        np.corrcoef(expression.to_numpy(), rowvar=False),
        index=expression.columns,
        columns=expression.columns,
    )
    cor.to_parquet(cor_file_path, compression="snappy")

print(f'cor memory usage: {cor.memory_usage(deep=True).sum() / (1024**3):.2f} GB')
print(cor.shape)
print(cor.head())

cor memory usage: 1.93 GB
(16104, 16104)
           ARHGEF10L     HIF3A     RNF10     RNF11     RNF13  ...      PTRF     BCL6B     GSTK1      SELP  \
ARHGEF10L   1.000000  0.002286  0.019524 -0.101981 -0.105511  ...  0.116011  0.021985  0.286158  0.037598   
HIF3A       0.002286  1.000000 -0.003858 -0.053238 -0.042682  ... -0.031373  0.112904 -0.144835  0.193560   
RNF10       0.019524 -0.003858  1.000000 -0.180003 -0.232665  ... -0.073712 -0.057512 -0.081597 -0.073611   
RNF11      -0.101981 -0.053238 -0.180003  1.000000  0.367074  ...  0.154221  0.035936  0.050335  0.048495   
RNF13      -0.105511 -0.042682 -0.232665  0.367074  1.000000  ... -0.079411 -0.063368  0.192296  0.180943   

               SELS  
ARHGEF10L -0.311250  
HIF3A     -0.065258  
RNF10      0.049616  
RNF11     -0.008296  
RNF13      0.321258  

[5 rows x 16104 columns]


# correlation threshold

In [27]:
# (row, col) positions strictly above the diagonal: every unordered gene pair exactly once,
# without the self-correlations on the diagonal.
upper_tri_index = np.triu_indices(cor.shape[0], k=1, m=cor.shape[1])

# Absolute correlation of each of those pairs, as a flat 1-D array.
upper_tri_abs_values = np.absolute(cor.to_numpy()[upper_tri_index])

# Weakest and strongest pairwise |cor|.
print(upper_tri_abs_values.min())
print(upper_tri_abs_values.max())

3.3654175222728684e-09
0.9832068452445704


In [28]:
# Number of distinct gene pairs, i.e. entries strictly above the diagonal.
N = upper_tri_abs_values.size
# Sanity check in integer arithmetic: twice the pair count equals the number of
# off-diagonal entries of the square correlation matrix.
assert 2 * N == cor.size - cor.shape[0]

# For each candidate cut-off, report how many pairs (and what share of all pairs)
# have |cor| strictly above it.
thresholds = np.arange(0, 1, 0.1)
for threshold in thresholds:
    n = np.count_nonzero(upper_tri_abs_values > threshold)
    print(f'threshold: {threshold:.1f}, n: {int(n):,} ({n / N:.2%})')


threshold: 0.0, n: 129,661,356 (100.00%)
threshold: 0.1, n: 67,467,835 (52.03%)
threshold: 0.2, n: 27,413,830 (21.14%)
threshold: 0.3, n: 9,206,760 (7.10%)
threshold: 0.4, n: 2,717,343 (2.10%)
threshold: 0.5, n: 744,569 (0.57%)
threshold: 0.6, n: 208,863 (0.16%)
threshold: 0.7, n: 60,205 (0.05%)
threshold: 0.8, n: 12,731 (0.01%)
threshold: 0.9, n: 659 (0.00%)


# graph

In [38]:
# Build an undirected gene graph: one vertex per gene, one edge per gene pair whose
# absolute correlation exceeds the cut-off, weighted by that absolute correlation.
threshold = 0.3
mask = upper_tri_abs_values > threshold
# upper_tri_index is a (row positions, column positions) pair; keep the surviving pairs
sources, targets = (positions[mask] for positions in upper_tri_index)
weights = upper_tri_abs_values[mask]  # |cor| values

g = ig.Graph(
    n=len(cor),
    edges=list(zip(sources.tolist(), targets.tolist())),
    directed=False,
)

# Vertex i corresponds to row/column i of `cor`, so label vertices with the matrix index.
g.vs["name"] = list(cor.index)
g.es["weight"] = weights

# check graph size
print(f'''
n nodes: {g.vcount():,}
n edges: {g.ecount():,}
''')


n nodes: 16,104
n edges: 9,206,760



# community detection

In [39]:
# Louvain (multilevel) community detection on the |cor|-weighted gene graph.
#
# The algorithm is stochastic, so without a fixed seed the partition can change from
# run to run. python-igraph draws its random numbers from Python's `random` module by
# default, so seeding it right before the call makes the membership (and the
# eigengenes derived from it) reproducible.
random.seed(42)
louvain = g.community_multilevel(weights="weight", resolution=1.0)

# Notes from earlier, unseeded runs at different resolutions:
#   resolution | communities | coefs | score
#   1.0        |  4          |  8    | 76.1
#   1.1        |  7          |  8    | 76.0
#   1.3        | 14          | 10    | 76.1
#   1.4        | 17          | 13    | 76.3
#   1.5        | 19          | 12    | 76.3
#   2.0        | 92          |  9    | 76.0
# NOTE(review): "coefs" and the last column are not computed in this notebook —
# presumably they come from a downstream model; confirm and document.
# NOTE(review): the community counts above do not match the number of communities
# printed below for resolution=1.0 — possibly they exclude tiny communities; re-check
# these notes now that the run is seeded.

# Label the membership with the graph's own vertex names, so community ids are tied
# to the vertices they were computed for rather than to another table's column order.
louvain_communities = pd.Series(louvain.membership, index=g.vs["name"])

In [40]:
# Total number of distinct community ids, whatever their size.
print(louvain_communities.nunique(dropna=True))

29


In [41]:
# Number of genes per community, ordered by community id.
louvain_communities_size = louvain_communities.value_counts(sort=False).sort_index()
print(louvain_communities_size)

0     3690
1     4471
2     4002
3     3874
4        1
      ... 
24       1
25       1
26       1
27       1
28       1
Name: count, Length: 29, dtype: int64


In [42]:
# Keep only communities with more than 10 member genes.
# (To keep every community instead, take louvain_communities_size.index.to_list().)
selected_communities = louvain_communities_size.loc[louvain_communities_size.gt(10)].index.to_list()
selected_communities

[0, 1, 2, 3, 5, 8]

# eigengenes and loadings

In [43]:
# For every selected community, summarise its genes by their first principal component:
#   eigengenes[community] -> one PC1 score per sample (indexed like `expression`)
#   loadings[community]   -> one PC1 weight per member gene
eigengenes = {}
loadings = {}

for community in selected_communities:
    # expression restricted to the genes assigned to this community
    df_community = expression.loc[:, louvain_communities == community]

    # With a single component and matrices of this size, scikit-learn's default
    # svd_solver='auto' can select the randomized solver, whose result depends on the
    # random state. Pin it so eigengenes and loadings are identical across runs.
    # Note: PCA centres each gene but does not scale it to unit variance.
    pca = PCA(n_components=1, random_state=0)
    pc1 = pca.fit_transform(df_community.to_numpy()).ravel()

    eigengenes[community] = pd.Series(pc1, index=expression.index)
    loadings[community] = pd.Series(pca.components_[0], index=df_community.columns)

In [44]:
# Collect the per-community PC1 scores into one samples x communities frame and name
# each column after its community id shifted to start at 1.
eigengenes = pd.DataFrame(eigengenes, index=expression.index).rename(
    columns=lambda community: f'eigengene_{community + 1}'
)
eigengenes.head()

Unnamed: 0,eigengene_1,eigengene_2,eigengene_3,eigengene_4,eigengene_6,eigengene_9
TCGA-69-7978-01,-1.679827,-28.597754,24.458553,49.222194,0.015382,2.912907
TCGA-62-8399-01,8.116884,-8.755893,22.903491,-13.438406,6.946744,-0.216088
TCGA-78-7539-01,-3.357579,12.129143,-5.964686,-10.713861,-1.599267,-8.709475
TCGA-73-4658-01,-38.307635,0.874504,-7.055459,41.814082,0.88182,1.63586
TCGA-44-6775-01,-4.370142,5.071119,21.828292,42.764854,-0.64339,-5.425188


In [45]:
# Collect the per-community PC1 weights into one genes x communities frame, with the
# same column naming as the eigengenes. A gene only has a weight in the column of the
# community it belongs to; the other cells are NaN.
loadings = pd.DataFrame(loadings).rename(
    columns=lambda community: f"eigengene_{community + 1}"
)
loadings.head()

Unnamed: 0,eigengene_1,eigengene_2,eigengene_3,eigengene_4,eigengene_6,eigengene_9
?|100133144,0.022477,,,,,
?|100134869,0.019745,,,,,
?|10357,,-0.002395,,,,
?|10431,,,-0.015098,,,
?|155060,0.041203,,,,,


# save

In [46]:
# Persist both result tables to the output folder as snappy-compressed parquet.
eigengenes.to_parquet(data_dir_dst.joinpath('eigengenes.parquet'), compression='snappy')
loadings.to_parquet(data_dir_dst.joinpath('loadings.parquet'), compression='snappy')