# Creating genes mapping table 

## I / Import the resources : AraCore model and ModelSEED and BiGG tables 

### 1) Install missing modules and load modules 

In [2]:
import pandas as pd
import numpy as np
import cobra
import requests

### 2) Load AraCore Model 

In [3]:
#Get file from github
fileName = 'https://raw.githubusercontent.com/ma-blaetke/CBM_C3_C4_Metabolism/master/data/2018-23-05-mb-genC3.sbml'
r = requests.get(fileName)

In [4]:
#Create model
model = cobra.io.read_sbml_model(r.text)
model

0,1
Name,c3_model
Memory address,0x07fbe597fb4c0
Number of metabolites,413
Number of reactions,572
Number of groups,0
Objective expression,1.0*Ex_Suc - 1.0*Ex_Suc_reverse_fb96e
Compartments,"Chloroplast, Lumen, Cytosol, Mitochondrion, IntermembraneSpace, Peroxisome"


### 3) Correct gene naming in AraCore Model according to BiGG naming conventions 

###  4) Correct gene naming in AraCore Model according to ModelSEED naming conventions

### 5) Create Gene Table for AraCore Model 

In [7]:
df_genes_aracore = pd.DataFrame(
    {
        "aracore_ids":[x.id for x in model.genes],
        "aracore_name":[x.name for x in model.genes],
        "aracore_annotations":[x.annotation for x in model.genes]
    })
df_genes_aracore

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations
0,AT4G05180,AT4G05180,{}
1,AT2G30570,AT2G30570,{}
2,ATCG00560,ATCG00560,{}
3,AT4G21280,AT4G21280,{}
4,ATCG00300,ATCG00300,{}
...,...,...,...
799,AT3G27620,AT3G27620,{}
800,AT5G64210,AT5G64210,{}
801,AT1G32350,AT1G32350,{}
802,AT3G22360,AT3G22360,{}


In [12]:
df_genes_aracore["aracore_ids"].shape[0] # indicates the number of genes

804

In [13]:
df_genes_aracore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   aracore_ids          804 non-null    object
 1   aracore_name         804 non-null    object
 2   aracore_annotations  804 non-null    object
dtypes: object(3)
memory usage: 19.0+ KB


When we look at the metabolites or the reactions in the BiGG database, for example, we can notice that there are the correspondong genes, as well as their ID in the NCBI database. 
=> Check in the NCBI database : 
We have : 
- the gene ID
- summary
- the genomic context
- the genomic regions, transcripts, and products
- bibliography
- the general protein information
- NCBI preferenced sequence
- related sequence
- additional links
=> 

In [14]:
#EXAMPLE REQUEST:

#Retrieve Information on gene symbol, location (chromosome, start and end), uniprot id/descriptive name using the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

gene_id = 'AT4G05180'

#ENSEMBL Server Url
server = "https://rest.ensembl.org"

#URL specification to get meta-information of a gene by its id
ext = f"/lookup/id/{gene_id}?expand=1"
  
#Request information from URL in json format
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})

#Check if request was successful
if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json()

else:

    json_anno = None

json_anno

{'end': 2673243,
 'logic_name': 'araport11',
 'start': 2671523,
 'db_type': 'core',
 'assembly_name': 'TAIR10',
 'id': 'AT4G05180',
 'seq_region_name': '4',
 'biotype': 'protein_coding',
 'Transcript': [{'biotype': 'protein_coding',
   'Exon': [{'id': 'AT4G05180.1.exon1',
     'assembly_name': 'TAIR10',
     'db_type': 'core',
     'seq_region_name': '4',
     'end': 2673243,
     'start': 2672873,
     'species': 'arabidopsis_thaliana',
     'object_type': 'Exon',
     'strand': -1},
    {'strand': -1,
     'object_type': 'Exon',
     'species': 'arabidopsis_thaliana',
     'start': 2672340,
     'end': 2672635,
     'seq_region_name': '4',
     'assembly_name': 'TAIR10',
     'id': 'AT4G05180.1.exon2',
     'db_type': 'core'},
    {'object_type': 'Exon',
     'strand': -1,
     'species': 'arabidopsis_thaliana',
     'end': 2672191,
     'start': 2671900,
     'assembly_name': 'TAIR10',
     'id': 'AT4G05180.1.exon3',
     'db_type': 'core',
     'seq_region_name': '4'},
    {'strand

In [15]:
#Retrieve Information on gene symbol, location (chromosome, start and end), uniprot id/descriptive name using the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

def lookup_ensembl_id(gene_id):

  #pandas series to hold information
  ds_anno = pd.Series(index=['symbol','chr','start', 'end', 'description'])
  
  #ENSEMBL Server Url
  server = "https://rest.ensembl.org"
  
  #URL specification to get meta-information of a gene by its id
  ext = f"/lookup/id/{gene_id}?expand=1"
  
  #Request information from URL in json format
  r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
  
  #Check if request wad successful
if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json()

    #Save relevant information in pandas series
    ds_anno['symbol'] = json_anno.get('display_name', None)
    ds_anno['chr'] = json_anno.get('seq_region_name', None)
    ds_anno['start'] = json_anno.get('start', None)
    ds_anno['end'] = json_anno.get('end', None)
    ds_anno['description'] = json_anno.get('description', None)
else:
    #Save set pandas series to None if request was not successfull
    ds_anno['symbol'] = None
    ds_anno['chr'] = None
    ds_anno['start'] = None
    ds_anno['end'] = None
    ds_anno['description'] = None

  #return pandas series
  return ds_anno

In [16]:
#EXAMPLE REQUEST

#Retrieve Information on cross references for each gene symbol (NCBI gene id)
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

gene_id = 'AT4G05180'

#ENSEMBL Server Url
server = "https://rest.ensembl.org"

#URL specification to get meta-information of a gene by its id
ext = f"/xrefs/id/{gene_id}?"
  
#Request information from URL in json format
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})

#Check if request wad successful
if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json()

    #Convert json to pandas dataframe format to easily retrieve information
    df_anno = pd.DataFrame(json_anno)
    df_anno.set_index('dbname', inplace=True)
  
else:
     df_anno = None

df_anno

Unnamed: 0_level_0,db_display_name,primary_id,description,display_id,synonyms,version,info_text,info_type,ensembl_end,xref_start,ensembl_start,ensembl_identity,score,xref_identity,cigar_line,evalue,xref_end
dbname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NASC_GENE_ID,NASC Gene ID,AT4G05180,photosystem II subunit Q-2,AT4G05180-TAIR-G,[],0,,DIRECT,,,,,,,,,
ArrayExpress,Expression Atlas,AT4G05180,,AT4G05180,[],0,,DIRECT,,,,,,,,,
TAIR_LOCUS,TAIR,AT4G05180,photosystem II subunit Q-2,AT4G05180,[],0,,DIRECT,,,,,,,,,
TAIR_SYMBOL,TAIR Gene Name,PSBQ,,PSBQ,"[PSBQ-1, PSBQ-2, PSBQA, PSII-Q]",0,,DIRECT,,,,,,,,,
UniGene,UniGene,At.21853,Oxygen-evolving enhancer protein 3-2,At.21853,[],0,,SEQUENCE_MATCH,1037.0,1.0,1.0,96.0,5185.0,100.0,1037M,,1037.0
EntrezGene,NCBI gene (formerly Entrezgene),825866,,825866,[],0,,DEPENDENT,,,,,,,,,
KNETMINER_ARA,KNETMINER_ARA,AT4G05180,,AT4G05180,[],0,,DEPENDENT,,,,,,,,,


In [17]:
#Retrieve Information on cross references for each gene symbol (NCBI gene id)
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

def xrefs_ensembl_id(gene_id):
  
  #pandas series to hold information
  ds_anno = pd.Series(index=['ncbigene'])
  
  #ENSEMBL Server Url
  server = "https://rest.ensembl.org"
  #URL specification to get meta-information of a gene by its id
  ext = f"/xrefs/id/{gene_id}?"
  
  #Request information from URL in json format
  r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
  
  #Check if request wad successful
  if r.ok:
    
    #Get json (dictionary-like structure) from request
    json_anno = r.json()

    #Convert json to pandas dataframe format
    df_anno = pd.DataFrame(json_anno)
    df_anno.set_index('dbname', inplace=True)
    
    #Save relevant information in pandas series
    if 'EntrezGene' in df_anno.index:
      if 'primary_id' in df_anno.columns:
        ds_anno['ncbigene'] = str(df_anno.loc['EntrezGene','primary_id'])
  
  else:
     ds_anno['ncbigene'] = None
 
  return ds_anno

In [18]:
#Request information for each gene using xrefs_ensembl_id() function
df_genes_xref = df_genes_aracore['aracore_ids'].parallel_apply(lambda x: xrefs_ensembl_id(x)) 

AttributeError: 'Series' object has no attribute 'parallel_apply'

### 6) Import BiGG genes file 

### 7) Import ModelSEED genes file 

## II / Extract the annotations from the AraCore model 

## III / Extract the id from the annotations we extracted previously (2nd step) 

## IV / Merge AraCore table with BiGG one 