In [1]:
import argparse
import pickle
from datetime import datetime
from pathlib import Path

import networkx as nx
import pandas as pd
import py4cytoscape as p4c

In [None]:
def coexdir_to_edgeslist(coex_dir, k=None, min_z=None):
    '''
    Read an ATTED-II style gene coexpression directory into a weighted edge list.

    Every regular file directly inside `coex_dir` is treated as one query gene:
    the file stem is gene `u`, and the file is read as a header-less,
    tab-separated table with two columns, partner gene `v` and score `z`.
    From each file only the `k` highest-scoring rows are kept, and of those
    only rows with `z >= min_z` (this keeps the edge list small).

    Parameters
    ----------
    coex_dir : str or pathlib.Path
        Directory holding one coexpression table per gene.
    k : int, optional
        Maximum number of partners kept per file. Defaults to the
        module-level `K` when omitted.
    min_z : float, optional
        Minimum score for an edge to be kept. Defaults to the module-level
        `minZ` when omitted.

    Returns
    -------
    list of (str, str, float)
        One `(u, v, z)` tuple per kept row: gene u and gene v are
        co-expressed with a value of z (z-score).
    '''
    # Fall back to the notebook-level settings so existing one-argument calls
    # keep working unchanged.
    top_k = K if k is None else k
    z_floor = minZ if min_z is None else min_z

    edges = []

    # sorted(): glob order depends on the filesystem; fixing it makes the
    # resulting edge list reproducible between runs and machines.
    for file in sorted(Path(coex_dir).glob('*')):
        if not file.is_file():
            # sub-directories cannot be parsed as tables
            continue

        u = str(file.stem)
        # temporary df, rewritten every iteration of for loop
        df = pd.read_csv(file, sep='\t', header=None, names=['v', 'z'])
        df['v'] = df['v'].astype(str)

        # from each file, keep the top-k partners, then drop weak scores
        df = df.nlargest(top_k, 'z')
        df = df[df['z'] >= z_floor]

        edges.extend((u, v, float(z)) for v, z in zip(df['v'], df['z']))

    return edges

def build_graph(edges):
    '''
    Build an undirected weighted graph from (u, v, z) tuples.

    The same gene pair can show up more than once in `edges` (for example
    once from each gene's own file). The graph holds a single edge per pair,
    and its 'weight' attribute ends up as the largest z seen for that pair.
    '''
    graph = nx.Graph()

    for src, dst, score in edges:
        if not graph.has_edge(src, dst):
            graph.add_edge(src, dst, weight=score)
            continue

        # pair already present: only ever raise the stored weight
        attrs = graph[src][dst]
        if score > attrs['weight']:
            attrs['weight'] = score

    return graph

def save_object(pyobj, pklobj):
    '''
    Pickle `pyobj` into the file at `pklobj`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    '''
    with open(pklobj, mode='wb') as sink:
        pickle.Pickler(sink).dump(pyobj)

def _main(coex_dir, output_dirname):
    '''
    Run the pipeline: turn the coexpression directory into an edge list,
    then build the graph from that list.

    `output_dirname` is accepted but not used yet: the pickle export below is
    commented out, so nothing is written to disk and the function returns None.
    '''
    edge_list = coexdir_to_edgeslist(coex_dir)
    graph = build_graph(edge_list)

    # Export is currently disabled. To persist the results, re-enable:
    #   out_dir = Path(output_dirname)
    #   out_dir.mkdir(parents=True, exist_ok=True)
    #   save_object(graph, out_dir / 'sbi_G_prpF.pkl')
    #   save_object(edge_list, out_dir / 'sbi_edges.pkl')

# Default coexpression directory (one table per gene).
sbi_coex_dir = Path('data/reference/sbi_coex')

# Per-file filters read by coexdir_to_edgeslist:
# keep at most K top-scoring partners per gene ...
K = 10
# ... and only those whose z score is at least minZ.
minZ = 4.0

In [None]:
if __name__ == '__main__':
    # Timestamped default folder so repeated runs do not share an output path.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    default_output = Path(f'TAA_cluster-1st_attempt/{timestamp}')

    parser = argparse.ArgumentParser(description='Create coexpression network')
    parser.add_argument('--coex-dir', '-c', type=Path, default=sbi_coex_dir,
                        help='Path to ATTED-II style coexpression directory')
    parser.add_argument('--output-dir', '-o', type=Path, default=default_output,
                        help=f'Output directory for graph and edges pickle (default folder name is {default_output})')
    parser.add_argument('--gene-no', '-k', type=int, default=K,
                        help='From each file in the coex dir, filter up to K number of genes to be included in the network')
    parser.add_argument('--z-score', '-z', type=float, default=minZ,
                        help='From each file in the coex dir, filter only genes coexpressed with min z score')

    # parse_known_args instead of parse_args: when this cell runs inside a
    # notebook kernel, sys.argv typically carries the kernel launcher's own
    # arguments (e.g. `-f <connection file>`), which parse_args() would reject
    # with SystemExit. Unrecognised arguments are ignored here.
    args, _ignored_argv = parser.parse_known_args()

    # coexdir_to_edgeslist reads the module-level K / minZ, so publish the
    # parsed thresholds there; otherwise -k / -z would have no effect.
    K = args.gene_no
    minZ = args.z_score

    _main(args.coex_dir, args.output_dir)

## Preparing ready-to-search gene IDs from the step-01 `homologous_geneID` tables

In [2]:
# Root directory that is globbed for the per-query "*_kegg" result folders.
RESULTS = Path("results/gene_network")

In [6]:
# One homologous-geneID table per query folder, e.g.
# "map00020-K00234_kegg/map00020-K00234_kegg-01-homologous_geneID.txt".
# sorted() keeps the row order of the summary stable between runs.
files = sorted(RESULTS.glob("map*-K*_kegg/map*-K*_kegg-01-homologous_geneID.txt"))

top_hits = []
for f in files:
    df = pd.read_csv(f, sep='\t')
    if df.empty:
        # header-only table: this query produced no hit
        continue
    # Keep only the first row as the top hit (assumes each table is already
    # ranked best-first -- TODO confirm). The double brackets keep it a
    # one-row DataFrame rather than a Series.
    top = df.iloc[[0]].copy()
    # Tag the row with pathway + KO taken from the parent folder name,
    # e.g. "map00020-K00234_kegg" -> "map00020-K00234".
    top.insert(0, "query", f.parent.name.replace("_kegg", ""))
    top_hits.append(top)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but a message that says what to check.
if not top_hits:
    raise ValueError(
        f"No non-empty '*-01-homologous_geneID.txt' tables found under {RESULTS} "
        f"({len(files)} file(s) matched); check the RESULTS path."
    )

top_hits_df = pd.concat(top_hits, ignore_index=True)
top_hits_df


Unnamed: 0,query,protein_accession,gene_id,evalue,description
0,map00020-K00025,XP_002467079.1,8070482,1.4e-213,"malate dehydrogenase, cytoplasmic [Sorghum bic..."
1,map00020-K00026,XP_002443645.2,8065238,1.1e-190,"malate dehydrogenase, glyoxysomal [Sorghum bic..."
2,map00020-K00030,XP_002454117.1,8056475,1.6000000000000002e-213,isocitrate dehydrogenase [NAD] regulatory subu...
3,map00020-K00031,XP_021303439.1,110430300,1.5000000000000001e-268,isocitrate dehydrogenase [NADP] [Sorghum bicolor]
4,map00020-K00161,XP_002452579.1,8075734,1.1e-211,pyruvate dehydrogenase E1 component subunit al...
5,map00020-K00162,XP_021320770.1,110437034,1.5999999999999999e-201,pyruvate dehydrogenase E1 component subunit be...
6,map00020-K00164,XP_002446307.1,8070970,0.0,"2-oxoglutarate dehydrogenase, mitochondrial [S..."
7,map00020-K00234,XP_021309515.1,110432855,0.0,succinate dehydrogenase [ubiquinone] flavoprot...
8,map00020-K00235,XP_021320812.1,110437047,1.1000000000000001e-153,succinate dehydrogenase [ubiquinone] iron-sulf...
9,map00020-K00236,XP_002447454.2,8080288,1.1e-30,"succinate dehydrogenase subunit 3-1, mitochond..."
