# Downloading/Pre-processing KEGG gene lists

In [119]:
from urllib.request import urlopen
import numpy as np
import pandas as pd
import pickle

## I. Get all human pathway IDs

In [74]:
list_human_KEGG_pathways_url = "http://rest.kegg.jp/list/pathway/hsa"
human_KEGG_pathways = urlopen(list_human_KEGG_pathways_url, timeout=20).read().decode()
lines = human_KEGG_pathways.split(" - Homo sapiens (human)\n")
lines = [line.split('\t')[0] for line in lines]
lines = lines[:-1]  # last entry is empty string
pathway_ids = [line.split(':')[1] for line in lines]

print(str(len(pathway_ids))+" pathways found\n")
print(pathway_ids)

319 pathways found

['hsa00010', 'hsa00020', 'hsa00030', 'hsa00040', 'hsa00051', 'hsa00052', 'hsa00053', 'hsa00061', 'hsa00062', 'hsa00071', 'hsa00072', 'hsa00100', 'hsa00120', 'hsa00130', 'hsa00140', 'hsa00190', 'hsa00220', 'hsa00230', 'hsa00232', 'hsa00240', 'hsa00250', 'hsa00260', 'hsa00270', 'hsa00280', 'hsa00290', 'hsa00310', 'hsa00330', 'hsa00340', 'hsa00350', 'hsa00360', 'hsa00380', 'hsa00400', 'hsa00410', 'hsa00430', 'hsa00440', 'hsa00450', 'hsa00471', 'hsa00472', 'hsa00480', 'hsa00500', 'hsa00510', 'hsa00511', 'hsa00512', 'hsa00514', 'hsa00515', 'hsa00520', 'hsa00524', 'hsa00531', 'hsa00532', 'hsa00533', 'hsa00534', 'hsa00561', 'hsa00562', 'hsa00563', 'hsa00564', 'hsa00565', 'hsa00590', 'hsa00591', 'hsa00592', 'hsa00600', 'hsa00601', 'hsa00603', 'hsa00604', 'hsa00620', 'hsa00630', 'hsa00640', 'hsa00650', 'hsa00670', 'hsa00730', 'hsa00740', 'hsa00750', 'hsa00760', 'hsa00770', 'hsa00780', 'hsa00785', 'hsa00790', 'hsa00830', 'hsa00860', 'hsa00900', 'hsa00910', 'hsa00920', 'hsa009

## II. Get Annotations for each KEGG pathway

In [82]:
def get_KEGG_pathway_data(pathway_id):
    pathway_annotations_url = "http://rest.kegg.jp/get/"+pathway_id
    response = urlopen(pathway_annotations_url, timeout=20).read().decode()
    lines = response.split('\n') 
    metadata = [line[12:].split(' - Homo')[0] for line in lines if line.startswith('NAME') or line.startswith('CLASS')]
    
    print(metadata)
    
    if len(metadata) < 2: 
        print("\nNO CATEGORY PROVIDED FOR PATHWAY "+pathway_id+"\n")
        return
    
    start = response.find("GENE")
    end = response.find("COMPOUND")
    if end == -1: end = response.find("KO_PATHWAY")
    if end == -1: end = response.find("REFERENCE")
    if end == -1: print(url_path_genes)
    # adjust
    end -= 1
    
    gene_text = response[start:end]
    lines = gene_text.split('\n')
    genes = []
    for line in lines:
        if len(line.split(';')) > 1:
            tmp = line.split(';')[0]
            tmp = tmp.lstrip(' ')
            gene = tmp.split(' ')[-1]
            genes.append(gene) 
    
    
    return {'id': pathway_id, 'name': metadata[0], 'class': metadata[1], 'genes': genes}

pathways = [get_KEGG_pathway_data(pathway_id) for pathway_id in pathway_ids]

['Glycolysis / Gluconeogenesis', 'Metabolism; Carbohydrate metabolism']
['Citrate cycle (TCA cycle)', 'Metabolism; Carbohydrate metabolism']
['Pentose phosphate pathway', 'Metabolism; Carbohydrate metabolism']
['Pentose and glucuronate interconversions', 'Metabolism; Carbohydrate metabolism']
['Fructose and mannose metabolism', 'Metabolism; Carbohydrate metabolism']
['Galactose metabolism', 'Metabolism; Carbohydrate metabolism']
['Ascorbate and aldarate metabolism', 'Metabolism; Carbohydrate metabolism']
['Fatty acid biosynthesis', 'Metabolism; Lipid metabolism']
['Fatty acid elongation', 'Metabolism; Lipid metabolism']
['Fatty acid degradation', 'Metabolism; Lipid metabolism']
['Synthesis and degradation of ketone bodies', 'Metabolism; Lipid metabolism']
['Steroid biosynthesis', 'Metabolism; Lipid metabolism']
['Primary bile acid biosynthesis', 'Metabolism; Lipid metabolism']
['Ubiquinone and other terpenoid-quinone biosynthesis', 'Metabolism; Metabolism of cofactors and vitamins']
['

['PPAR signaling pathway', 'Organismal Systems; Endocrine system']
['Base excision repair', 'Genetic Information Processing; Replication and repair']
['Nucleotide excision repair', 'Genetic Information Processing; Replication and repair']
['Mismatch repair', 'Genetic Information Processing; Replication and repair']
['Homologous recombination', 'Genetic Information Processing; Replication and repair']
['Non-homologous end-joining', 'Genetic Information Processing; Replication and repair']
['Fanconi anemia pathway', 'Genetic Information Processing; Replication and repair']
['MAPK signaling pathway', 'Environmental Information Processing; Signal transduction']
['ErbB signaling pathway', 'Environmental Information Processing; Signal transduction']
['Ras signaling pathway', 'Environmental Information Processing; Signal transduction']
['Rap1 signaling pathway', 'Environmental Information Processing; Signal transduction']
['Calcium signaling pathway', 'Environmental Information Processing; Si

['Inflammatory mediator regulation of TRP channels', 'Organismal Systems; Sensory system']
['Regulation of actin cytoskeleton', 'Cellular Processes; Cell motility']
['Insulin signaling pathway', 'Organismal Systems; Endocrine system']
['Insulin secretion', 'Organismal Systems; Endocrine system']
['GnRH signaling pathway', 'Organismal Systems; Endocrine system']
['Ovarian steroidogenesis', 'Organismal Systems; Endocrine system']
['Progesterone-mediated oocyte maturation', 'Organismal Systems; Endocrine system']
['Estrogen signaling pathway', 'Organismal Systems; Endocrine system']
['Melanogenesis', 'Organismal Systems; Endocrine system']
['Prolactin signaling pathway', 'Organismal Systems; Endocrine system']
['Thyroid hormone synthesis', 'Organismal Systems; Endocrine system']
['Thyroid hormone signaling pathway', 'Organismal Systems; Endocrine system']
['Adipocytokine signaling pathway', 'Organismal Systems; Endocrine system']
['Oxytocin signaling pathway', 'Organismal Systems; Endocri

## III. Build a dataframe

In [93]:
pathways_df = pd.DataFrame([pathway for pathway in pathways if pathway != None])
pathways_df.head()

Unnamed: 0,class,genes,id,name
0,Metabolism; Carbohydrate metabolism,"[HK3, HK1, HK2, HKDC1, GCK, GPI, PFKM, PFKP, P...",hsa00010,Glycolysis / Gluconeogenesis
1,Metabolism; Carbohydrate metabolism,"[CS, ACLY, ACO2, ACO1, IDH1, IDH2, IDH3B, IDH3...",hsa00020,Citrate cycle (TCA cycle)
2,Metabolism; Carbohydrate metabolism,"[GPI, G6PD, PGLS, H6PD, PGD, RPE, RPEL1, TKT, ...",hsa00030,Pentose phosphate pathway
3,Metabolism; Carbohydrate metabolism,"[GUSB, KL, UGT2A1, UGT2A3, UGT2B17, UGT2B11, U...",hsa00040,Pentose and glucuronate interconversions
4,Metabolism; Carbohydrate metabolism,"[MPI, PMM2, PMM1, GMPPB, GMPPA, GMDS, TSTA3, F...",hsa00051,Fructose and mannose metabolism


In [108]:
pathways_df['class'], pathways_df['subclass'] = pathways_df['class'].str.split(';', 1).str
pathways_df

Unnamed: 0,class,genes,id,name,subclass
0,Metabolism,"[HK3, HK1, HK2, HKDC1, GCK, GPI, PFKM, PFKP, P...",hsa00010,Glycolysis / Gluconeogenesis,Carbohydrate metabolism
1,Metabolism,"[CS, ACLY, ACO2, ACO1, IDH1, IDH2, IDH3B, IDH3...",hsa00020,Citrate cycle (TCA cycle),Carbohydrate metabolism
2,Metabolism,"[GPI, G6PD, PGLS, H6PD, PGD, RPE, RPEL1, TKT, ...",hsa00030,Pentose phosphate pathway,Carbohydrate metabolism
3,Metabolism,"[GUSB, KL, UGT2A1, UGT2A3, UGT2B17, UGT2B11, U...",hsa00040,Pentose and glucuronate interconversions,Carbohydrate metabolism
4,Metabolism,"[MPI, PMM2, PMM1, GMPPB, GMPPA, GMDS, TSTA3, F...",hsa00051,Fructose and mannose metabolism,Carbohydrate metabolism
5,Metabolism,"[GALM, GALK1, GALT, GALE, UGP2, PGM1, PGM2, HK...",hsa00052,Galactose metabolism,Carbohydrate metabolism
6,Metabolism,"[UGDH, UGT2A1, UGT2A3, UGT2B17, UGT2B11, UGT2B...",hsa00053,Ascorbate and aldarate metabolism,Carbohydrate metabolism
7,Metabolism,"[ACACA, ACACB, MCAT, FASN, OXSM, OLAH, ACSL6, ...",hsa00061,Fatty acid biosynthesis,Lipid metabolism
8,Metabolism,"[ACAA2, HADHB, HADH, HADHA, ECHS1, MECR, PPT1,...",hsa00062,Fatty acid elongation,Lipid metabolism
9,Metabolism,"[ACAT2, ACAT1, ACAA1, ACAA2, HADHB, HADH, HADH...",hsa00071,Fatty acid degradation,Lipid metabolism


In [111]:
pathways_df = pathways_df.set_index(['class', 'subclass', 'id'])
pathways_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,genes,name
class,subclass,id,Unnamed: 3_level_1,Unnamed: 4_level_1
Metabolism,Carbohydrate metabolism,hsa00010,"[HK3, HK1, HK2, HKDC1, GCK, GPI, PFKM, PFKP, P...",Glycolysis / Gluconeogenesis
Metabolism,Carbohydrate metabolism,hsa00020,"[CS, ACLY, ACO2, ACO1, IDH1, IDH2, IDH3B, IDH3...",Citrate cycle (TCA cycle)
Metabolism,Carbohydrate metabolism,hsa00030,"[GPI, G6PD, PGLS, H6PD, PGD, RPE, RPEL1, TKT, ...",Pentose phosphate pathway
Metabolism,Carbohydrate metabolism,hsa00040,"[GUSB, KL, UGT2A1, UGT2A3, UGT2B17, UGT2B11, U...",Pentose and glucuronate interconversions
Metabolism,Carbohydrate metabolism,hsa00051,"[MPI, PMM2, PMM1, GMPPB, GMPPA, GMDS, TSTA3, F...",Fructose and mannose metabolism
Metabolism,Carbohydrate metabolism,hsa00052,"[GALM, GALK1, GALT, GALE, UGP2, PGM1, PGM2, HK...",Galactose metabolism
Metabolism,Carbohydrate metabolism,hsa00053,"[UGDH, UGT2A1, UGT2A3, UGT2B17, UGT2B11, UGT2B...",Ascorbate and aldarate metabolism
Metabolism,Lipid metabolism,hsa00061,"[ACACA, ACACB, MCAT, FASN, OXSM, OLAH, ACSL6, ...",Fatty acid biosynthesis
Metabolism,Lipid metabolism,hsa00062,"[ACAA2, HADHB, HADH, HADHA, ECHS1, MECR, PPT1,...",Fatty acid elongation
Metabolism,Lipid metabolism,hsa00071,"[ACAT2, ACAT1, ACAA1, ACAA2, HADHB, HADH, HADH...",Fatty acid degradation


## IV. Normalize Gene Lists

In [121]:
def flatten(list_of_lists): return [item for sublist in list_of_lists for item in sublist]

In [123]:
unique_gene_symbols = np.unique(flatten(pathways_df.genes.values))
unique_gene_symbols

array(['A2M', 'A3GALT2', 'A4GALT', ..., 't(8', 't(9', 't(x'],
      dtype='<U60')

In [130]:
import mygene
mg = mygene.MyGeneInfo()
df = mg.querymany(unique_gene_symbols.tolist(), scopes=['symbol', 'name', 'alias'], fields=["HGNC", "symbol"], species="hg19", as_dataframe=True, returnall=True)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-7235...done.
Finished.
7171 input query terms found dup hits:
	[('A2M', 10), ('A3GALT2', 10), ('A4GALT', 10), ('AAAS', 10), ('AACS', 10), ('AADAT', 10), ('AANAT', 
2 input query terms found no hit:
	['DOI:10.1002/1097-0029(20010215)52:4<450::AID-JEMT1030>3.0.CO', 'DOI:10.1002/1521-1878(200008)22:8<


In [131]:
dup = df['dup']
missing = df['missing']
df = df['out']
df = df[["HGNC", "symbol"]].dropna()
df.head()

Unnamed: 0_level_0,HGNC,symbol
query,Unnamed: 1_level_1,Unnamed: 2_level_1
A2M,7,A2M
A3GALT2,30005,A3GALT2
A4GALT,18149,A4GALT
AAAS,13666,AAAS
AACS,21298,AACS


In [132]:
df = df.reset_index().drop_duplicates(subset='query', keep='first').set_index('query').rename_axis(None)
df.head()

Unnamed: 0,HGNC,symbol
A2M,7,A2M
A3GALT2,30005,A3GALT2
A4GALT,18149,A4GALT
AAAS,13666,AAAS
AACS,21298,AACS


In [128]:
missing

['DOI:10.1002/1097-0029(20010215)52:4<450::AID-JEMT1030>3.0.CO',
 'DOI:10.1002/1521-1878(200008)22:8<761::AID-BIES10>3.0.CO']

In [133]:
len(df[df.index != df.symbol])

6

In [134]:
df[df.index != df.symbol]

Unnamed: 0,HGNC,symbol
POP3,17649,POPDC3
t(12,12040,TRAJ12
t(4,11603,TBX4
t(8,12096,TRAJ8
t(9,12097,TRAJ9
t(x,48335,TCP11X2


In [216]:
df = df.rename(columns={'symbol':'new_name'})

In [158]:
pathways_df.index.get_level_values(2)

Index(['hsa00010', 'hsa00020', 'hsa00030', 'hsa00040', 'hsa00051', 'hsa00052',
       'hsa00053', 'hsa00061', 'hsa00062', 'hsa00071',
       ...
       'hsa05322', 'hsa05323', 'hsa05330', 'hsa05332', 'hsa05340', 'hsa05410',
       'hsa05412', 'hsa05414', 'hsa05416', 'hsa05418'],
      dtype='object', name='id', length=318)

In [175]:
pathways_df = pathways_df.sort_index()

### Let's quickly get rid of these "Metabolism: Overview" Pathways...

In [191]:
pathways_df.loc[pd.IndexSlice[['Metabolism'],[' Overview'], :], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,genes,name
class,subclass,id,Unnamed: 3_level_1,Unnamed: 4_level_1
Metabolism,Overview,hsa01200,[],Carbon metabolism
Metabolism,Overview,hsa01210,[],2-Oxocarboxylic acid metabolism
Metabolism,Overview,hsa01212,[],Fatty acid metabolism
Metabolism,Overview,hsa01230,[],Biosynthesis of amino acids


In [193]:
pathways_df = pathways_df.drop(' Overview', level=1)

### Let's remove the bogus genes 
There are only two of them.

In [227]:
for pathway_id in pathways_df.index.get_level_values(2):
    without_bogus = [gene for gene in pathways_df.loc[pd.IndexSlice[:,:,[pathway_id]], ['genes']].values[0][0] if gene not in missing]
    pathways_df.loc[pd.IndexSlice[:,:,[pathway_id]], ['genes']].values[0][0] = without_bogus

### Let's convert the old names to new names...

In [236]:
for pathway_id in pathways_df.index.get_level_values(2):
    new_names = [df.loc[old_name]['new_name'] for old_name in pathways_df.loc[pd.IndexSlice[:,:,[pathway_id]], ['genes']].values[0][0] if df.index.contains(old_name)]
    pathways_df.loc[pd.IndexSlice[:,:,[pathway_id]], ['genes']].values[0][0] = new_names

### Hooray! 

In [238]:
pathways_df.to_pickle('KEGG_df.pickle')