# KEGG processing

Process some KEGG files so that we can do pathway annotations in the main notebook.

# Libraries

In [1]:
# Libraries


# Pandas.
import pandas as pd

# Requests.
import requests

# Delay
import time

In [2]:
# Set the data folder.
data_folder = '/media/apollo/Samsung_T5/transfer/mayur/'
# data_folder = '/home/mad1188/rvallsamples/'

# Generate all NCBI gene IDs from the official gene names given at HGNC

Simply load the HGNC file and strip out just what we need.

In [3]:
# Load the file.
loaded = pd.read_csv(
    data_folder + 'results.txt',
    sep = '\t'
)

# Keep only what we need.
loaded = loaded[['Approved symbol', 'NCBI gene ID']]

# Convert NCBI gene IDs to ints.
converted = []

for i in loaded['NCBI gene ID']:
    try:
        converted.append(int(i))
    except ValueError:
        converted.append('NA')

# Assign.
loaded['NCBI gene ID'] = converted

# Clean up the column names for writing.
loaded.rename(columns={'Approved symbol': 'symbol', 'NCBI gene ID': 'ncbi_gene_id'}, inplace=True)

In [4]:
loaded

Unnamed: 0,symbol,ncbi_gene_id
0,A1BG,1
1,A1BG-AS1,503538
2,A1CF,29974
3,A2M,2
4,A2M-AS1,144571
...,...,...
45632,ZYG11B,79699
45633,ZYX,7791
45634,ZYXP1,106480342
45635,ZZEF1,23140


Write it out.

In [5]:
# To file.
loaded.to_csv(
    data_folder + 'symbol_to_ncbi_gene_id.tsv',
    index = None,
    sep = '\t'
)

# Get all KGML files from KEGG

Get the unique pathway identifiers from KEGG, then ask for the KGML files (there aren't that many).

In [6]:
# Load the file.
loaded_two = pd.read_csv(
    data_folder + 'kegg_gene_ids_to_pathway_ids.tsv',
    header = None,
    sep = '\t'
)

# Clean up the column names
loaded_two.rename(columns={0: 'kegg_gene_id', 1: 'kegg_pathway_id'}, inplace=True)

In [7]:
loaded_two

Unnamed: 0,kegg_gene_id,kegg_pathway_id
0,hsa:10327,path:hsa00010
1,hsa:124,path:hsa00010
2,hsa:125,path:hsa00010
3,hsa:126,path:hsa00010
4,hsa:127,path:hsa00010
...,...,...
36702,hsa:91860,path:hsa05418
36703,hsa:92,path:hsa05418
36704,hsa:93,path:hsa05418
36705,hsa:9446,path:hsa05418


Before looking for pathways, write out all unique KEGG gene IDs.

In [8]:
# All unique gene IDs.
unique_kegg_gene_ids = list(loaded.kegg_gene_id.unique())

# Write them out.
with open(data_folder + 'unique_kegg_gene_ids.tsv', 'w') as f:
    for i in unique_kegg_gene_ids:
        f.write(i + '\n')

AttributeError: 'DataFrame' object has no attribute 'kegg_gene_id'

It appears that KEGG uses the NCBI IDs in the hsa:id format, so let's join to get gene symbols tied to pathways.

In [9]:
# Copy and re-name.
loaded_copy = loaded.copy()
loaded_copy.rename(
    columns = {
        "ncbi_gene_id": "kegg_gene_id"
    },
    inplace = True
)

# Create the right ID format.
loaded_copy.kegg_gene_id = ['hsa:' + str(i) for i in list(loaded_copy.kegg_gene_id)]

In [10]:
loaded_copy

Unnamed: 0,symbol,kegg_gene_id
0,A1BG,hsa:1
1,A1BG-AS1,hsa:503538
2,A1CF,hsa:29974
3,A2M,hsa:2
4,A2M-AS1,hsa:144571
...,...,...
45632,ZYG11B,hsa:79699
45633,ZYX,hsa:7791
45634,ZYXP1,hsa:106480342
45635,ZZEF1,hsa:23140


Write them out.

In [11]:
# To file.
loaded_copy.to_csv(
    data_folder + 'symbols_to_kegg_gene_ids.tsv',
    index = None,
    sep = '\t'
)

In [12]:
# Join to get symbols and pathways in one place.
joined = loaded_two.merge(loaded_copy, on = 'kegg_gene_id')

In [13]:
joined

Unnamed: 0,kegg_gene_id,kegg_pathway_id,symbol
0,hsa:10327,path:hsa00010,AKR1A1
1,hsa:124,path:hsa00010,ADH1A
2,hsa:125,path:hsa00010,ADH1B
3,hsa:126,path:hsa00010,ADH1C
4,hsa:127,path:hsa00010,ADH4
...,...,...,...
36540,hsa:91860,path:hsa05418,CALML4
36541,hsa:92,path:hsa05418,ACVR2A
36542,hsa:93,path:hsa05418,ACVR2B
36543,hsa:9446,path:hsa05418,GSTO1


Write it out.

In [14]:
joined.to_csv(
    data_folder + 'kegg_gene_pathway_symbol.tsv',
    index = None,
    sep = '\t'
)

Some KEGG IDs are missing, what are they?

In [59]:
set(loaded_two.kegg_gene_id).difference(set(joined.kegg_gene_id))

{'hsa:100653049',
 'hsa:101929601',
 'hsa:101929627',
 'hsa:102723407',
 'hsa:102723475',
 'hsa:102723532',
 'hsa:102723996',
 'hsa:102724334',
 'hsa:102724428',
 'hsa:102724560',
 'hsa:102724594',
 'hsa:102724652',
 'hsa:102725035',
 'hsa:107080638',
 'hsa:107987478',
 'hsa:107987479',
 'hsa:107987545',
 'hsa:110116772',
 'hsa:111089941',
 'hsa:112268384',
 'hsa:122539214',
 'hsa:124900488',
 'hsa:124900516',
 'hsa:124900632',
 'hsa:124900883',
 'hsa:124900887',
 'hsa:124900915',
 'hsa:124900922',
 'hsa:124901219',
 'hsa:124901220',
 'hsa:124901221',
 'hsa:124901224',
 'hsa:124901505',
 'hsa:124901506',
 'hsa:124901507',
 'hsa:124901508',
 'hsa:124901509',
 'hsa:124901516',
 'hsa:124901858',
 'hsa:124901859',
 'hsa:124901860',
 'hsa:124901863',
 'hsa:124902100',
 'hsa:124902101',
 'hsa:124902331',
 'hsa:124902335',
 'hsa:124902595',
 'hsa:124902596',
 'hsa:124902847',
 'hsa:124903120',
 'hsa:124903414',
 'hsa:124903435',
 'hsa:124903436',
 'hsa:124903594',
 'hsa:124903595',
 'hsa:1249

What are the unique pathways?

In [61]:
# Unique pathways.
unique_pathways = set(joined.kegg_pathway_id)

In [63]:
len(unique_pathways)

358

Now get every KGML file.

In [23]:
# Request every KGML file.
for p_id in unique_pathways:

    # Ask for the KGML file.
    r = requests.get('https://rest.kegg.jp/get/' + p_id.split(':')[1] + '/kgml')

    # Save the KGML file.
    with open(data_folder + 'kgml_files/' + p_id.split(':')[1] + '.xml', 'w') as f:
        f.write(r.text)

    # Wait 1 second to send the next request.
    time.sleep(1)

Manual file count of the data folder shows that we got every KGML file.