In [1]:
#This script takes an individual gene as standard input (gene name and Entrez ID), and outputs the shortest distance to every other gene in the network, 
#as well as the genes located in the shortest path between the two target genes. 
#For example, to find the connectivity of POLR3E (Entrez ID 55718) to all other genes in the genome, one would run:
#python nearest_neighbor_weighted_allgenes.py POLR3E_55718

#Import modules
import networkx
import pandas as pd

In [2]:
networ_file_path = '/data/3q29_network/RNASeq_network/brain.degnorm-ge2.prob-gept02.dat'
gene_file_path   = '/data/3q29_network/RNASeq_network/brain.genes.protein-coding.txt'
#Open network file (fromat: Entrez_ID_1, Entrez_ID_2, Weighted_connectivity) and list of genes in network (Entrez IDs in column 1, gene names in column 3)
df_network = pd.read_csv(networ_file_path, sep='\t', header=None, names=['gene1','gene2', 'weight'])
df_gene = pd.read_csv(gene_file_path, sep='\t', header=None, 
                      names=['entrez', '1', 'symbol', 'chrom', 'start', 'stop'])

In [3]:
df_gene.entrez = df_gene.entrez.astype(str)
df_gene = df_gene.set_index('entrez', drop=False)

In [4]:
df_network.gene1 = df_network.gene1.astype(str)
df_network.gene2 = df_network.gene2.astype(str)

In [5]:
#Populate list of protein-coding genes in the network, and dictionary for Entrez-gene symbol conversion

coding_genes = df_gene.entrez.tolist()
entrez_dict = df_gene['symbol'].to_dict()

In [8]:
#Populate network data for each gene with NetworkX
network_graph=networkx.Graph()
for i in df_network.index:
    node1=df_network.at[i, 'gene1']
    node2=df_network.at[i, 'gene2']
    weight=1/float(df_network.at[i, 'weight']) #Take inverse of weight for edge lengths

    #Add edge to NetworkX network if both genes are in protein-coding list
    if node1 in coding_genes:
        if node2 in coding_genes:
            network_graph.add_edge(node1,node2,weight=weight)

In [14]:
nodes = list(network_graph.nodes)

In [15]:
total = df_gene.shape[0]

for gene_start_target in df_gene.index:
    if i % 100 == 0:
        print('{}/{}'.format(i, total))
    i = i + 1
    
    gene_start_symbol = df_gene.at[gene_start_target, 'symbol']
    if gene_start_target not in nodes:
        continue

    results = networkx.single_source_dijkstra(network_graph, gene_start_target)
    lengths = results[0]
    connections = results[1]


    output = []
    output.append('\t'.join(["Start gene","End gene","Shortest distance","Number of connector genes","Connector genes"]) + '\n')

    for key in list(lengths.keys()):
        line = [gene_start_symbol]

        gene_end_symbol = entrez_dict[key]
        line.append(gene_end_symbol)
        line.append('{0:4f}'.format(lengths[key]))
        line.append('{}'.format(len(connections[key])-2))

        connector_symbols = []
        for connector_entrez in connections[key]:
            connector_symbols.append(entrez_dict[connector_entrez])

        if len(connector_symbols[1:-1]) > 0:
            connector_symbols = ','.join(connector_symbols[1:-1])
        else:
            connector_symbols = ''
        line.append(connector_symbols)
        line = '\t'.join(line) + '\n'

        output.append(line)

    with open('test/{}_nearest_neighbor_weighted.txt'.format(gene_start_symbol), 'w') as f:
        f.writelines(output)

6400/18827
6500/18827


KeyboardInterrupt: 