# Get the first principal component (PC) of each connected component (CC)
The networks were generated using the ClusterONE MR network notebook.

In [47]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import warnings
# Install a blanket "ignore" filter: no category or module restriction is
# given, so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings coming from
# pandas / scikit-learn — confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

### 1. Get the raw data used to generate the MR network

In [17]:
# Build `df` from the tab-separated raw table in a single chain:
#   1. read the file, using its first column as the row index;
#   2. drop every row whose values are all zero;
#   3. (optional normalization) apply log2(x + 1);
#   4. (optional normalization) divide each column by its column sum.
df = (
    pd.read_csv("data.tsv", sep="\t", index_col=0)
    .loc[lambda table: ~table.eq(0).all(axis=1)]
    .pipe(lambda table: np.log2(table + 1))
    .pipe(lambda table: table / table.sum())
)

### 2. Generate a list of all CCs

In [41]:
# Load the edge table as a graph. The file is tab-separated; besides the two
# node columns, a third column is parsed as a float and stored as the "weight"
# edge attribute.
G = nx.read_edgelist(
    "clusterONE.edges.tsv",
    delimiter="\t",
    data=[("weight", float)],
)
# One independent subgraph copy per connected component (CC) of G.
cc_list = [
    G.subgraph(component_nodes).copy()
    for component_nodes in nx.connected_components(G)
]

### 3. Generate the Eigen ClusterONE

In [48]:
# Summarise every connected component by its first principal component.
#
# For component number k (1-based position in cc_list), the rows of `df` named
# by the component's nodes are selected and transposed, so that each column of
# `df` is one PCA observation and each node is one PCA variable. The resulting
# single-component scores (one value per column of `df`) become column "cc<k>".
eigen_scores = {}
for cc_count, cc in enumerate(cc_list, start=1):
    # random_state pins the output in case scikit-learn selects a randomized
    # solver, so re-running the notebook reproduces the same scores.
    pca = PCA(n_components=1, random_state=0)
    X = pca.fit_transform(df.loc[list(cc.nodes)].T)
    eigen_scores["cc" + str(cc_count)] = X[:, 0]

# Build the frame once, already indexed by the columns of `df`. This avoids
# growing a DataFrame column by column and also yields a valid (column-less)
# frame when cc_list is empty instead of failing on the index assignment.
edf = pd.DataFrame(eigen_scores, index=df.columns)
edf.to_csv("eigenClusterONE.csv")

### 4. Write the gene list of each connected component

In [56]:
# Write the component membership in long format: a header line, then one
# tab-separated "<node>\tcc<k>" line for every node of every connected
# component, where k is the 1-based position of the component in cc_list.
# NOTE(review): the cell after this one opens the same path in 'w' mode, so
# running both cells leaves only that cell's output on disk — confirm which
# file layout is wanted under this name.
with open("eigenClusterONE.tsv", 'w') as out:
    out.write("feature"+"\t"+"cc_name"+"\n")
    for cc_count, cc in enumerate(cc_list, start=1):
        # Use the nodes of the current component, not those of cc_list[0];
        # otherwise every component is written with the first one's nodes.
        for node in cc.nodes:
            out.write(node+"\t"+"cc"+str(cc_count)+"\n")

In [57]:
# Write the component membership in wide format: a header line, then one line
# per connected component holding "cc<k>", a tab, and the comma-separated names
# of that component's nodes (k is the 1-based position in cc_list).
# NOTE(review): the preceding cell writes to the same path; opening it here in
# 'w' mode replaces whatever that cell produced.
with open("eigenClusterONE.tsv", 'w') as out:
    out.write("name"+"\t"+"name_list"+"\n")
    for cc_count, cc in enumerate(cc_list, start=1):
        # Join the nodes of the current component, not those of cc_list[0];
        # otherwise every row repeats the first component's members.
        out.write("cc"+str(cc_count)+"\t"+",".join(cc.nodes)+"\n")