# Network community detection

This notebook performs community detection approaches to identify network modules.

In [1]:
import os
import random

import numpy as np
import pandas as pd
import igraph as ig
from core_acc_modules import paths

In [2]:
# User params

# Choices = ["fastgreedy", "walktrap", "louvain", "infomap"]
method = "fastgreedy"

# TO DO: params for different methods to adjust
# steps for walktrap
# trails for infomap

In [3]:
# Load correlation matrix --> which correlation matrix to use?
pao1_pearson_mat_filename = os.path.join(paths.LOCAL_DATA_DIR, "pao1_pearson_mat.tsv")
pa14_pearson_mat_filename = os.path.join(paths.LOCAL_DATA_DIR, "pa14_pearson_mat.tsv")

# Take abs of correlation scores
# In this case we care about the strength and not the direction
pao1_corr = pd.read_csv(
    pao1_pearson_mat_filename, sep="\t", index_col=0, header=0
).abs()
pa14_corr = pd.read_csv(
    pa14_pearson_mat_filename, sep="\t", index_col=0, header=0
).abs()

In [4]:
pao1_corr.head()

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA1905,PA0195,PA4812,PA0195.1,PA0457.1,PA1552.1,PA1555.1,PA3701,PA4724.1,PA5471.1
PA0001,1.0,0.463956,0.569484,0.351438,0.245083,0.082922,0.237419,0.327681,0.611781,0.053683,...,0.143054,0.067547,0.046884,0.115943,0.095939,0.137961,0.123254,0.413492,0.017787,0.044556
PA0002,0.463956,1.0,0.252376,0.74815,0.165336,0.064989,0.223772,0.474954,0.404363,0.091208,...,0.049508,0.029025,0.068886,0.066085,0.081798,0.364207,0.230212,0.295833,0.003284,0.026462
PA0003,0.569484,0.252376,1.0,0.370005,0.231063,0.30592,0.227813,0.226757,0.41443,0.044911,...,0.161805,0.105521,0.058363,0.032366,0.118532,0.204444,0.136097,0.330368,0.00047,0.053666
PA0004,0.351438,0.74815,0.370005,1.0,0.178458,0.013829,0.289639,0.519269,0.40631,0.03908,...,0.141347,0.146768,0.188109,0.079627,0.004548,0.385017,0.265849,0.248906,0.068307,0.074792
PA0005,0.245083,0.165336,0.231063,0.178458,1.0,0.494013,0.054159,0.548067,0.458342,0.600835,...,0.028263,0.156724,0.14744,0.28888,0.274439,0.158833,0.076657,0.376051,0.323167,0.055236


In [5]:
# Format correlation matrix into graph (i.e. dataframe with edge weight per pair of genes)
# The dataframe should have columns: from, to, weight
pao1_corr_graph = pao1_corr.stack().reset_index()
pao1_corr_graph.columns = ["from", "to", "weight"]

pa14_corr_graph = pa14_corr.stack().reset_index()
pa14_corr_graph.columns = ["from", "to", "weight"]

In [6]:
# Drop duplicate rows since correlation matrix is symmetric
pao1_corr_graph = pao1_corr_graph.drop_duplicates()
pa14_corr_graph = pa14_corr_graph.drop_duplicates()

In [7]:
# Drop gene loops
# Note 'query' not working for some reason
pao1_corr_graph = pao1_corr_graph[pao1_corr_graph["from"] != pao1_corr_graph["to"]]
pa14_corr_graph = pa14_corr_graph[pa14_corr_graph["from"] != pa14_corr_graph["to"]]

In [8]:
pao1_corr_graph.head()

Unnamed: 0,from,to,weight
1,PA0001,PA0002,0.463956
2,PA0001,PA0003,0.569484
3,PA0001,PA0004,0.351438
4,PA0001,PA0005,0.245083
5,PA0001,PA0006,0.082922


In [9]:
# Make into a graph object
pao1_G = ig.Graph.TupleList(pao1_corr_graph.values, weights=True, directed=False)
pa14_G = ig.Graph.TupleList(pa14_corr_graph.values, weights=True, directed=False)

In [10]:
# make sure vertex/edge properties exist
print(pao1_G.es["weight"][:5])

[0.4639558317118624, 0.569483925217973, 0.35143796014186435, 0.2450826281625292, 0.0829218006019242]


In [11]:
# TO DO: Add label for core, accessory gene

## Community detection

### Fast-greedy
This algorithm starts from a completely unclustered set of nodes and iteratively adds communities such that the modularity (score maximizing within edges and minimizing between edges) is maximized until no additional improvement can be made.

**What is this simplification step doing?**
This is removing multiple edges and loops -- how???

In [12]:
if method == "fastgreedy":
    pao1_partition = pao1_G.simplify().community_fastgreedy(weights=pao1_G.es["weight"])
    pa14_partition = pao1_G.simplify().community_fastgreedy(weights=pa14_G.es["weight"])

KeyError: 'Attribute does not exist'

In [None]:
# Error at fast_community.c:553: fast-greedy community finding works only on graphs without multiple edges, Invalid value

### Walktrap
This algorithm performs random walks using a specified step size. Where densely connected areas occur, the random walk becomes “trapped” in local regions that then define communities


In [None]:
if method == "walktrap":
    pao1_partition = pao1_G.community_walktrap(weights=pao1_G.es["weight"])
    pa14_partition = pa14_G.community_walktrap(weights=pao1_G.es["weight"])

### Multilevel
This algorithm is similar to fastgreedy, but it merges communities to optimize modularity based upon only the neighboring communities as opposed to all communities. The algorithm terminates when only a single node is left, or when the improvement in modularity cannot result from the simple merge of two neighboring communities. (Louvain clustering)

In [None]:
if method == "louvain":
    pao1_partition = pao1_G.community_multilevel(
        weights=pao1_G.es["weight"], return_levels=False
    )
    pa14_partition = pa14_G.community_multilevel(
        weights=pao1_G.es["weight"], return_levels=False
    )

### Infomap
This algorithm uses the probability flow of information in random walks, which occurs more readily in groups of heavily connected nodes. Thus, information about network structure can be compressed in maps of modules (nodes where information travels quickly)


In [None]:
if method == "infomap":
    pao1_partition = pao1_G.community_infomap(edge_weights=pao1_G.es["weight"])
    pa14_partition = pa14_G.community_infomap(edge_weights=pao1_G.es["weight"])

## Get membership

In [None]:
# get dataframe mapping Pa genes to communities
def graph_partition_to_df(G, partition):
    clusters = []
    for label, vl in enumerate(partition):
        clusters += [(G.vs["name"][v], label, G.degree(v)) for v in vl]
    return pd.DataFrame(clusters, columns=["gene", "module id", "degree"])

In [None]:
pao1_membership_df = graph_partition_to_df(pao1_G, pao1_partition)
print(len(pao1_membership_df["module id"].unique()))
pao1_membership_df.sort_values(by="degree", ascending=False).head()

In [None]:
pa14_membership_df = graph_partition_to_df(pa14_G, pa14_partition)
print(len(pa14_membership_df["module id"].unique()))
pa14_membership_df.sort_values(by="degree", ascending=False).head()

In [None]:
# Save
# Save membership dataframe
pao1_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pao1_modules_{method}.tsv"
)
pa14_membership_filename = os.path.join(
    paths.LOCAL_DATA_DIR, f"pa14_modules_{method}.tsv"
)
pao1_membership_df.to_csv(pao1_membership_filename, sep="\t")
pa14_membership_df.to_csv(pa14_membership_filename, sep="\t")