## Run community detection experiments

Do this for the Louvain algorithm.

And, hopefully, for the eADAGE & coexpression networks as well, if time permits.

In [1]:
import os
import random

import numpy as np
import pandas as pd
import igraph as ig
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from sample_nodes import (
    sample_degree_matched,
    sort_by_degree,
)

# Seed every RNG this analysis draws from. Seeding NumPy alone is not enough:
# python-igraph uses Python's `random` module as its default generator, so
# randomized routines such as Louvain community detection would otherwise
# give a different partition on each fresh run.
np.random.seed(42)
random.seed(42)

In [2]:
# relevant file paths (relative to the directory the notebook is run from)
data_dir = './data'
edge_list = os.path.join(data_dir, 'edge_list_processed_unsigned.csv')

# map of Pa gene names to generic/not generic status, generated by Alex
generic_gene_map = os.path.join('..', 'pseudomonas_analysis', 'annot_df.tsv')

# script parameters
NUM_NODE_SAMPLES = 100 # number of degree-matched node samples for permutation test

In [3]:
# read the processed edge list and preview its first rows
graph_df = pd.read_csv(edge_list)
graph_df.head()

Unnamed: 0,from,to,weight
0,PA0001,PA0002,0.578872
1,PA0001,PA0548,0.510323
2,PA0001,PA0963,0.561494
3,PA0001,PA3637,0.512365
4,PA0001,PA5554,0.554768


In [4]:
# build an undirected, weighted igraph graph from the rows of the edge list
edge_tuples = graph_df.values
G = ig.Graph.TupleList(edge_tuples, directed=False, weights=True)

In [5]:
# sanity check: the 'weight' edge attribute should exist after graph
# construction; show the first five values
print(G.es['weight'][:5])

[0.578871792667748, 0.510322953238081, 0.561494130101994, 0.512364523924246, 0.554767556277657]


In [6]:
# read the tab-separated gene annotation table, using its first column as
# the index, and preview its first rows
annot_df = pd.read_csv(generic_gene_map, sep='\t', index_col=0)
annot_df.head()

Unnamed: 0,label
PA0001,0
PA0002,0
PA0003,0
PA0004,0
PA0005,0


In [7]:
# look up each vertex's annotation label by gene name and store it on the
# graph as the integer vertex attribute 'is_generic'
generic_labels = annot_df['label']
is_generic = [int(generic_labels.loc[gene_name]) for gene_name in G.vs['name']]
G.vs['is_generic'] = is_generic

In [8]:
# Louvain (multilevel modularity optimization) community detection, using
# the 'weight' edge attribute as edge weights
partition = G.community_multilevel(weights='weight', return_levels=False)
# TODO: plot the detected communities?

In [9]:
# get dataframe mapping Pa genes to communities
def graph_partition_to_df(G, partition):
    """Flatten a vertex partition of ``G`` into a one-row-per-gene dataframe.

    Parameters
    ----------
    G
        Graph whose vertex sequence ``G.vs`` exposes the 'name' and
        'is_generic' attributes and which supports ``G.degree(v)`` for a
        vertex index ``v``.
    partition
        Iterable of communities, each an iterable of vertex indices of ``G``.

    Returns
    -------
    pandas.DataFrame
        Columns ['gene', 'label', 'degree', 'is_generic'], where 'label' is
        the position of the vertex's community within ``partition``. Rows
        follow the iteration order of ``partition``.
    """
    # These lookups do not depend on the vertex, so do them once up front
    # instead of once per vertex inside the loop.
    gene_names = G.vs['name']
    generic_flags = G.vs['is_generic']

    rows = [
        (gene_names[v], label, G.degree(v), generic_flags[v])
        for label, members in enumerate(partition)
        for v in members
    ]
    return pd.DataFrame(rows,
                        columns=['gene', 'label', 'degree', 'is_generic'])

labels_df = graph_partition_to_df(G, partition)

# number of distinct communities in the partition
community_labels = labels_df['label'].unique()
print(len(community_labels))

# preview the highest-degree genes
labels_df.sort_values(by='degree', ascending=False).head()

174


Unnamed: 0,gene,label,degree,is_generic
24,PA4238,0,209,0
27,PA4245,0,201,0
28,PA4247,0,192,0
65,PA4252,0,181,0
25,PA4239,0,180,0
