# Protein data collection from the KEGG database

In [1]:
# Standard library packages
import io

# Import Biopython modules to interact with KEGG
from Bio import SeqIO
from Bio.KEGG import REST

# Import Pandas, so we can use dataframes
import pandas as pd

# Some code to return a Pandas dataframe, given tabular text
def to_df(result):
    """Parse tab-separated text into a pandas DataFrame.

    Parameters
    ----------
    result : str
        Tabular text with tab-separated fields, one record per line.

    Returns
    -------
    pandas.DataFrame
        The parsed table. No line is treated as a header, so the columns
        carry the default integer labels 0, 1, 2, ...
    """
    # read_table expects a file-like object, so wrap the string in a buffer.
    text_buffer = io.StringIO(result)
    return pd.read_table(text_buffer, header=None)

## 1. Retrieving Oryza sativa (Japonica group) data via the KEGG API

In [2]:
def _fetch_dosa_links(target_db):
    """Download the KEGG cross-references from "dosa" genes to ``target_db``.

    Parameters
    ----------
    target_db : str
        KEGG database to link to ("pathway", "ko" or "ec"). The same string
        is used as the name of the resulting value column.

    Returns
    -------
    pandas.DataFrame
        Indexed by "protein_id", with one column named ``target_db``. A
        protein linked to several entries appears on several rows.
    """
    handle = REST.kegg_link(target_db, "dosa")
    try:
        raw_links = handle.read()
    finally:
        # Release the HTTP connection as soon as the text has been read.
        handle.close()

    links = to_df(raw_links)
    links.columns = ["protein_id", target_db]
    return links.set_index("protein_id")


# Link every "dosa" gene to its KEGG pathways, KO groups and EC numbers.
gene_pathway = _fetch_dosa_links("pathway")
gene_ko = _fetch_dosa_links("ko")
gene_ec = _fetch_dosa_links("ec")

## 2. Merging the EC, KO, and pathway tables on protein ID

In [3]:
# Outer-join the three link tables on their shared protein_id index, so a
# protein is kept even when it is absent from one or two of the tables.
index_outer_join = dict(left_index=True, right_index=True, how='outer')
tbl_merge = (
    gene_ec
    .merge(gene_pathway, **index_outer_join)
    .merge(gene_ko, **index_outer_join)
)
tbl_merge.head()

Unnamed: 0_level_0,ec,pathway,ko
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dosa:Os01t0100400-01,,,ko:K19791
dosa:Os01t0100700-01,,path:dosa03010,ko:K02989
dosa:Os01t0100900-01,ec:4.1.2.27,path:dosa00600,ko:K01634
dosa:Os01t0100900-01,ec:4.1.2.27,path:dosa01100,ko:K01634
dosa:Os01t0101200-01,ec:3.1.3.77,path:dosa00270,ko:K09880


In [4]:
# Download the KEGG "ec" listing: one row per EC number with its names.
ec_handle = REST.kegg_list("ec")
try:
    ec_listing = ec_handle.read()
finally:
    # Release the HTTP connection as soon as the text has been read.
    ec_handle.close()

# Build a lookup table of descriptions indexed by EC identifier.
ec_desc = to_df(ec_listing)
ec_desc.columns = ["ec", "ec_desc"]
ec_desc = ec_desc.set_index("ec")
ec_desc.head()

Unnamed: 0_level_0,ec_desc
ec,Unnamed: 1_level_1
ec:1.1.1.1,alcohol dehydrogenase; aldehyde reductase; ADH...
ec:1.1.1.2,alcohol dehydrogenase (NADP+); aldehyde reduct...
ec:1.1.1.3,homoserine dehydrogenase; HSDH; HSD
ec:1.1.1.4,"(R,R)-butanediol dehydrogenase; butyleneglycol..."
ec:1.1.1.5,Transferred to 1.1.1.303 and 1.1.1.304


## 3. Adding EC descriptions to the merged table

In [5]:
# Attach the EC description to each row by looking up its "ec" value in the
# index of ec_desc. The inner join keeps only rows whose EC number is found
# there, so proteins without an EC annotation are dropped at this step.
# NOTE(review): confirm that dropping non-enzyme proteins is intended.
#
# An "ec_desc" column left over from an earlier run of this cell is removed
# first; without that, re-running the cell fails with a column-overlap error.
tbl_merge = (
    tbl_merge
    .drop(columns=["ec_desc"], errors="ignore")
    .join(ec_desc, on=['ec'], how='inner')
)
tbl_merge.head()

Unnamed: 0_level_0,ec,pathway,ko,ec_desc
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dosa:Os01t0100900-01,ec:4.1.2.27,path:dosa00600,ko:K01634,sphinganine-1-phosphate aldolase; dihydrosphin...
dosa:Os01t0100900-01,ec:4.1.2.27,path:dosa01100,ko:K01634,sphinganine-1-phosphate aldolase; dihydrosphin...
dosa:Os01t0101200-01,ec:3.1.3.77,path:dosa00270,ko:K09880,acireductone synthase; E1; E-1 enolase-phospha...
dosa:Os01t0101200-01,ec:3.1.3.77,path:dosa01100,ko:K09880,acireductone synthase; E1; E-1 enolase-phospha...
dosa:Os11t0484000-01,ec:3.1.3.77,path:dosa00270,ko:K16054,acireductone synthase; E1; E-1 enolase-phospha...


In [6]:
## 4. Exporting merged table into tab-delimited file
# The protein_id index is written as the first column of the file.
output_path = "Proteins_Osa.txt"
tbl_merge.to_csv(output_path, sep="\t")

# Extract protein sequence from FASTA file