# Create a gene family network and Entrez Gene mapping

In [1]:
import pandas
import networkx

In [2]:
# Load the HGNC gene table, keeping only genes that have both an Entrez Gene ID
# and a gene family annotation.
hgnc_columns = ['entrez_id', 'symbol', 'name', 'locus_group', 'gene_family_id']
hgnc_raw_df = pandas.read_table('download/hgnc_complete_set.txt', low_memory=False)
hgnc_annotated_df = hgnc_raw_df.dropna(subset=['entrez_id', 'gene_family_id'])
# entrez_id is cast to int only after rows with a missing entrez_id are dropped
hgnc_df = hgnc_annotated_df.astype({'entrez_id': int})[hgnc_columns]

# Lookup from Entrez Gene ID to gene symbol
entrez_id_to_symbol = dict(zip(hgnc_df['entrez_id'], hgnc_df['symbol']))
hgnc_df.head()

Unnamed: 0,entrez_id,symbol,name,locus_group,gene_family_id
0,1,A1BG,alpha-1-B glycoprotein,protein-coding gene,594
2,29974,A1CF,APOBEC1 complementation factor,protein-coding gene,725
3,2,A2M,alpha-2-macroglobulin,protein-coding gene,1234
5,144568,A2ML1,alpha-2-macroglobulin like 1,protein-coding gene,1234
9,127550,A3GALT2,"alpha 1,3-galactosyltransferase 2",protein-coding gene,429


In [3]:
# Display the column labels of hgnc_df
hgnc_df.columns

Index(['entrez_id', 'symbol', 'name', 'locus_group', 'gene_family_id'], dtype='object')

In [4]:
# Load the gene family table, keeping the family ID, abbreviation, and name
family_columns = ['id', 'abbreviation', 'name']
family_df = pandas.read_csv('download/genefamily_db_tables/family.csv')[family_columns]
family_df.head()

Unnamed: 0,id,abbreviation,name
0,905,ANGPTL,Angiopoietin like
1,222,,Bombesin receptors
2,767,TCTN,Tectonic proteins
3,550,FABP,Fatty acid binding protein family
4,1317,,Carcinoembryonic antigen family


In [5]:
# Directed graph of gene families
families = networkx.DiGraph()

# Nodes: one per row of family_df, each starting with an empty set of Entrez Gene IDs
families.add_nodes_from(
    (family.id, {'name': family.name, 'entrez_gene_ids': set()})
    for family in family_df.itertuples()
)

# Edges: each hierarchy record points from a superfamily to its subfamily
hierarchy_df = pandas.read_csv('download/genefamily_db_tables/hierarchy.csv')
families.add_edges_from(
    (link.parent_fam_id, link.child_fam_id)
    for link in hierarchy_df.itertuples()
)

# Cell output: whether the family hierarchy is a directed acyclic graph
networkx.is_directed_acyclic_graph(families)

True

In [6]:
# Flag each family as a root (no incoming edges) and/or a leaf (no outgoing edges)
for family_id, attributes in families.nodes(data=True):
    n_parents = families.in_degree(family_id)
    n_children = families.out_degree(family_id)
    attributes['root'] = n_parents == 0
    attributes['leaf'] = n_children == 0

In [7]:
# Root families are the nodes with no incoming edge (no superfamily).
# The degree is queried per node rather than via `families.in_degree().items()`:
# in networkx >= 2.0 `in_degree()` returns a degree view without `.items()`,
# whereas `in_degree(node)` returns an integer in both networkx 1.x and 2.x+.
root_nodes = [node for node in families if families.in_degree(node) == 0]

# Rows of family_df describing the root families
root_df = family_df.query("id in @root_nodes")

In [8]:
# Add propagated Entrez Gene IDs: each gene is added to every family listed in
# its gene_family_id field and to all ancestors (superfamilies) of those families.

# Node -> attribute dict lookup. Built from `families.nodes(data=True)` because
# `Graph.node` was removed in networkx 2.4; the values are the graph's own
# attribute dicts, so updating them updates the graph.
family_to_data = dict(families.nodes(data=True))

for row in hgnc_df.itertuples():
    entrez_id = int(row.entrez_id)
    # A gene can list several families, separated by '|'
    for annotated_family_id in map(int, row.gene_family_id.split('|')):
        propagated_family_ids = {annotated_family_id} | networkx.ancestors(families, annotated_family_id)
        for family_id in propagated_family_ids:
            family_to_data[family_id]['entrez_gene_ids'].add(entrez_id)

In [9]:
# Build one mapping row per (family, gene) pair
columns = ['family_id', 'family_name', 'root', 'leaf', 'entrez_gene_id', 'gene_symbol']
rows = []
for family_id, data in families.nodes(data=True):
    entrez_gene_ids = sorted(data['entrez_gene_ids'])
    rows.extend(
        [family_id, data['name'], int(data['root']), int(data['leaf']), gene_id, entrez_id_to_symbol[gene_id]]
        for gene_id in entrez_gene_ids
    )
    # Replace the set of IDs with a pipe-delimited string for GraphML export
    data['entrez_gene_ids'] = '|'.join(str(gene_id) for gene_id in entrez_gene_ids)

mapping_df = pandas.DataFrame(rows, columns=columns)
mapping_df = mapping_df.sort_values(['family_id', 'entrez_gene_id'])
mapping_df.head(3)

Unnamed: 0,family_id,family_name,root,leaf,entrez_gene_id,gene_symbol
0,3,Fascins,1,1,6624,FSCN1
1,3,Fascins,1,1,25794,FSCN2
2,3,Fascins,1,1,29999,FSCN3


In [10]:
# Save the family graph, including its node attributes, as GraphML (an XML format)
graphml_path = 'data/families.graphml'
networkx.write_graphml(families, graphml_path)

In [11]:
# Save the family-to-gene mapping as a tab-separated file without the index column
mapping_path = 'data/gene-families.tsv'
mapping_df.to_csv(mapping_path, sep='\t', index=False)