# Creating genes mapping table

# I / Install missing modules & load modules 

In [1]:
import pandas as pd
import numpy as np
import cobra
import requests
import sys

In [2]:
from urllib.error import HTTPError

In [3]:
#If this doesn't run install pandarallel in your conda enviroment using conda or pip
#https://github.com/nalepae/pandarallel 

# => Problem to install pandarallel => seems to work with Python version 3.{5, 6, 7, 8}, not 3.9
# => Create a new environment containing the same packages, but changing the version of Python : Python 3.8
# Then, try to install pandarallel

# => For this gene mapping, use the environment named env_aracore_update_gene_mapping => same problem
# same problem for all the versions of python...

from pandarallel import pandarallel 

pandarallel.initialize(progress_bar=True)

from pandarallel.utils import progress_bars
progress_bars.is_notebook_lab = lambda : True

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# II / Load AraCore model 

In [4]:
#Get file from github
fileName = 'https://raw.githubusercontent.com/ma-blaetke/CBM_C3_C4_Metabolism/master/data/2018-23-05-mb-genC3.sbml'
r = requests.get(fileName)

In [5]:
#Create model
model = cobra.io.read_sbml_model(r.text)

In [6]:
model

0,1
Name,c3_model
Memory address,0x07f427cd6d550
Number of metabolites,413
Number of reactions,572
Number of groups,0
Objective expression,1.0*Ex_Suc - 1.0*Ex_Suc_reverse_fb96e
Compartments,"Chloroplast, Lumen, Cytosol, Mitochondrion, IntermembraneSpace, Peroxisome"


# III / Create Gene Table for AraCore Model  

In [7]:
#Create mapping table
df_genes_aracore = pd.DataFrame(
    {
        "aracore_ids" : [gene_id.id for gene_id in model.genes],
        "aracore_name" : [gene_id.name for gene_id in model.genes],
        "aracore_annotations" : [gene_id.annotation for gene_id in model.genes]
    })

df_genes_aracore.head(25) 

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations
0,AT4G05180,AT4G05180,{}
1,AT2G30570,AT2G30570,{}
2,ATCG00560,ATCG00560,{}
3,AT4G21280,AT4G21280,{}
4,ATCG00300,ATCG00300,{}
5,AT1G06680,AT1G06680,{}
6,ATCG00080,ATCG00080,{}
7,AT2G06520,AT2G06520,{}
8,AT1G79040,AT1G79040,{}
9,AT5G66570,AT5G66570,{}


In [8]:
df_genes_aracore['aracore_ids'].shape[0] #804 genes

804

# IV / Retrieve additional information on genes using the ENSEMBL API 

**ENSEMBL** is a database that provides information about **genomic features**. It contains the IDs and the symbol of the genes, as well as the species.  

In [9]:
#EXAMPLE REQUEST:

#Retrieve Information on gene symbol, location (chromosome, start and end), uniprot id/descriptive name using the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

gene_id = 'AT4G05180'

#ENSEMBL Server Url
server = "https://rest.ensembl.org"

#URL specification to get meta-information of a gene by its id
ext = f"/lookup/id/{gene_id}?expand=1"
  
#Request information from URL in json format
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})

#Check if request wad successful
if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json()

else:

    json_anno = None

json_anno

{'strand': -1,
 'start': 2671523,
 'species': 'arabidopsis_thaliana',
 'description': 'Oxygen-evolving enhancer protein 3-2, chloroplastic [Source:UniProtKB/Swiss-Prot;Acc:Q41932]',
 'object_type': 'Gene',
 'display_name': 'PSBQ2',
 'Transcript': [{'assembly_name': 'TAIR10',
   'biotype': 'protein_coding',
   'end': 2673243,
   'source': 'araport11',
   'is_canonical': 1,
   'db_type': 'core',
   'Parent': 'AT4G05180',
   'Translation': {'end': 2673170,
    'id': 'AT4G05180.1',
    'length': 230,
    'db_type': 'core',
    'Parent': 'AT4G05180.1',
    'object_type': 'Translation',
    'start': 2672093,
    'species': 'arabidopsis_thaliana'},
   'species': 'arabidopsis_thaliana',
   'start': 2671523,
   'strand': -1,
   'seq_region_name': '4',
   'Exon': [{'end': 2673243,
     'id': 'AT4G05180.1.exon1',
     'assembly_name': 'TAIR10',
     'seq_region_name': '4',
     'db_type': 'core',
     'strand': -1,
     'start': 2672873,
     'species': 'arabidopsis_thaliana',
     'object_type':

In [10]:
#Retrieve Information on gene symbol, location (chromosome, start and end), uniprot id/descriptive name using the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

def lookup_ensembl_id(gene_id):

    #pandas series to hold information
    ds_anno = pd.Series(index=['symbol','chr','start', 'end', 'description'])
  
    #ENSEMBL Server Url
    server = "https://rest.ensembl.org"
  
    #URL specification to get meta-information of a gene by its id
    ext = f"/lookup/id/{gene_id}?expand=1"
  
  #Request information from URL in json format
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
  
  #Check if request wad successful
    if r.ok:
    
        #Get json (dictionary-like structure) from request
        json_anno = r.json()

        #Save relevant information in pandas series
        ds_anno['symbol'] = json_anno.get('display_name', None)
        ds_anno['chr'] = json_anno.get('seq_region_name', None)
        ds_anno['start'] = json_anno.get('start', None)
        ds_anno['end'] = json_anno.get('end', None)
        ds_anno['description'] = json_anno.get('description', None)
    else:
        #Save set pandas series to None if request was not successfull
        ds_anno['symbol'] = None
        ds_anno['chr'] = None
        ds_anno['start'] = None
        ds_anno['end'] = None
        ds_anno['description'] = None

    #return pandas series
    return ds_anno

In [11]:
#EXAMPLE REQUEST

#Retrieve Information on cross references for each gene symbol (NCBI gene id)
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

gene_id = 'AT4G05180'

#ENSEMBL Server Url
server = "https://rest.ensembl.org"

#URL specification to get meta-information of a gene by its id
ext = f"/xrefs/id/{gene_id}?"
  
#Request information from URL in json format
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})

#Check if request was successful
if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json() # json format file for annotations containing r variable content

    #Convert json to pandas dataframe format to easily retrieve information
    df_anno = pd.DataFrame(json_anno) # dataframe containing annotations
    df_anno.set_index('dbname', inplace=True) # delete the index on the left of the dataframe
else:
     df_anno = None

df_anno

Unnamed: 0_level_0,db_display_name,synonyms,version,info_text,primary_id,display_id,info_type,description,xref_end,xref_identity,ensembl_end,ensembl_identity,score,xref_start,evalue,cigar_line,ensembl_start
dbname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NASC_GENE_ID,NASC Gene ID,[],0,,AT4G05180,AT4G05180-TAIR-G,DIRECT,photosystem II subunit Q-2,,,,,,,,,
ArrayExpress,Expression Atlas,[],0,,AT4G05180,AT4G05180,DIRECT,,,,,,,,,,
TAIR_LOCUS,TAIR,[],0,,AT4G05180,AT4G05180,DIRECT,photosystem II subunit Q-2,,,,,,,,,
TAIR_SYMBOL,TAIR Gene Name,"[PSBQ-1, PSBQ-2, PSBQA, PSII-Q]",0,,PSBQ,PSBQ,DIRECT,,,,,,,,,,
UniGene,UniGene,[],0,,At.21853,At.21853,SEQUENCE_MATCH,Oxygen-evolving enhancer protein 3-2,1037.0,100.0,1037.0,96.0,5185.0,1.0,,1037M,1.0
EntrezGene,NCBI gene (formerly Entrezgene),[],0,,825866,825866,DEPENDENT,,,,,,,,,,
KNETMINER_ARA,KNETMINER_ARA,[],0,,AT4G05180,AT4G05180,DEPENDENT,,,,,,,,,,


In [12]:
#Retrieve Information on cross references for each gene symbol (NCBI gene id)
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

def xrefs_ensembl_id(gene_id):
  
  #pandas series to hold information
    ds_anno = pd.Series(index=['ncbigene'])
  
  #ENSEMBL Server Url
    server = "https://rest.ensembl.org"
  #URL specification to get meta-information of a gene by its id
    ext = f"/xrefs/id/{gene_id}?"
  
  #Request information from URL in json format
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
  
  #Check if request wad successful
    if r.ok:
    
    #Get json (dictionary-like structure) from request
        json_anno = r.json()

        #Convert json to pandas dataframe format
        df_anno = pd.DataFrame(json_anno)
        df_anno.set_index('dbname', inplace=True)

        #Save relevant information in pandas series
        if 'EntrezGene' in df_anno.index:
              if 'primary_id' in df_anno.columns:
                    ds_anno['ncbigene'] = str(df_anno.loc['EntrezGene','primary_id'])

    else:
        ds_anno['ncbigene'] = None
 
    return ds_anno

In [13]:
#Request information for each gene using xrefs_ensembl_id() function
df_genes_xref = df_genes_aracore['aracore_ids'].parallel_apply(lambda x: xrefs_ensembl_id(x)) 

SystemError: Python version should be 3.{5, 6, 7}

**Concerning the cell below**:

We write a REQUEST, which contains the URL of the ENSEMBL gene ID, and the Output format, under the following form :
REQUEST(URL, Output format)

This URL is constituted into 2 parts:
- the base of the URL, which refers to the database ENSEMBL
- the keywords, which will be translated into MySQL

These 2 parts of the REQUEST are together converted into the Output format. 

This Output format is JSON, similar to a dictionary in Python.

In [None]:
#Request information for each gene using lookup_ensembl_id() function
df_genes_lookup = df_genes_aracore['aracore_ids'].parallel_apply(lambda x: lookup_ensembl_id(x)) #This may take a few minutes

# V / Merge information into a single mapping table 

In [15]:
#Concat dataframe df_genes_aracore, df_genes_lookup, df_genes_xref
df_genes_aracore = pd.concat([df_genes_aracore, df_genes_lookup, df_genes_xref], axis=1) 
df_genes_aracore

NameError: name 'df_genes_lookup' is not defined

In [16]:
#Seperate descriptive name from uniprot symbol by first appearance of "[", expand in to two columns
#e.g. "Oxygen-evolving enhancer protein 3-2, chloroplastic [Source:UniProtKB/Swiss-Prot;Acc:Q41932]" -> "Oxygen-evolving enhancer protein 3-2, chloroplastic" & Source:UniProtKB/Swiss-Prot;Acc:Q41932]

df_name_uniprot = df_genes_aracore['description'].str.split('[', 1, expand=True,) 
df_name_uniprot

KeyError: 'description'

In [17]:
#Merge with df_genes_aracore
df_genes_aracore = pd.concat([df_genes_aracore, df_name_uniprot], axis=1) 

#rename columns
df_genes_aracore.rename(columns={0: 'gene_name', 1: 'uniprot_id'}, inplace=True)
df_genes_aracore

NameError: name 'df_name_uniprot' is not defined

In [18]:
#Extract Uniprot id by splitting on last appearance of ':', expand columns, but only keep the last column for the uniprot id, the rest is trash
#e.g. Source:UniProtKB/Swiss-Prot;Acc:Q41932] -> "Source:UniProtKB/Swiss-Prot;Acc" & "Q41932]"
df_genes_aracore['uniprot_id'] = df_genes_aracore['uniprot_id'].str.rsplit(':',1,expand=True).iloc[:,-1]

#Remove closing "]" from uniprot id 
#e.g. "Q41932]" -> "Q41932"
df_genes_aracore['uniprot_id'] = df_genes_aracore['uniprot_id'].apply(lambda x: x[:-1] if isinstance(x,str) else None)
df_genes_aracore

KeyError: 'uniprot_id'

In [19]:
df_genes_aracore[df_genes_aracore.isna().sum(axis=1) == 8] #2 Genes have no annotations at all -> Look-up those genes, maybe the Gene ids are wrong!?!

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations


In [20]:
df_genes_aracore[df_genes_aracore['ncbigene'].isna()] # ~28 genes without NCBI Symbol

KeyError: 'ncbigene'

In [21]:
df_genes_aracore[df_genes_aracore['symbol'].isna()].head(100) # 97 genes have no official gene #symbol

KeyError: 'symbol'

In [22]:
df_genes_aracore[df_genes_aracore['uniprot_id'].isna()] #2 genes have no uniprot id (same as above)

KeyError: 'uniprot_id'

In [None]:
#Export final mapping table for manual mapping
df_genes_aracore.to_csv('../data/processed/2021-05-27-genes-mapping-table.csv')