# Install missing modules & load modules

In [None]:
!pip install cobra #install cobra - only required for google colab 



In [None]:
!pip install pandarallel #install panadarallel - only required for google colab 



In [None]:
import pandas as pd
import numpy as np
import cobra
import requests
import sys

In [None]:
from urllib.error import HTTPError

In [None]:
# If this cell does not run, install pandarallel in your conda environment using conda or pip
# https://github.com/nalepae/pandarallel

from pandarallel import pandarallel 

# Initialise pandarallel (with progress bars enabled) before `parallel_apply`
# is used on pandas objects further down in this notebook.
pandarallel.initialize(progress_bar=True)

from pandarallel.utils import progress_bars
# Replace pandarallel's `is_notebook_lab` check by a function that always returns True
# (presumably so that the notebook-style progress bar is used on Google Colab -- TODO confirm).
progress_bars.is_notebook_lab = lambda : True

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Load AraCore Model

In [None]:
#Get the SBML file of the model from GitHub
fileName = 'https://raw.githubusercontent.com/ma-blaetke/CBM_C3_C4_Metabolism/master/data/2018-23-05-mb-genC3.sbml'

#The timeout prevents an unresponsive server from blocking the notebook forever
r = requests.get(fileName, timeout=60)

#Stop right here on an HTTP error (e.g. 404 after the file was moved);
#otherwise the text of the error page would be passed on as if it were the SBML document
r.raise_for_status()

In [None]:
#Create the cobra model from the text body (r.text) of the response `r`,
#i.e. from the SBML document requested from GitHub
model = cobra.io.read_sbml_model(r.text)

In [None]:
model

0,1
Name,c3_model
Memory address,0x07faf67589fd0
Number of metabolites,413
Number of reactions,572
Number of groups,0
Objective expression,1.0*Ex_Suc - 1.0*Ex_Suc_reverse_fb96e
Compartments,"Chloroplast, Lumen, Cytosol, Mitochondrion, IntermembraneSpace, Peroxisome"


# Create Gene Table for AraCore Model

In [None]:
#Create mapping table: one row per gene of the model, holding its id, name and annotation
df_genes_aracore = pd.DataFrame(
    [(gene.id, gene.name, gene.annotation) for gene in model.genes],
    columns=["aracore_ids", "aracore_name", "aracore_annotations"],
)

#Show the first 25 rows
df_genes_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations
0,AT4G05180,AT4G05180,{}
1,AT2G30570,AT2G30570,{}
2,ATCG00560,ATCG00560,{}
3,AT4G21280,AT4G21280,{}
4,ATCG00300,ATCG00300,{}
5,AT1G06680,AT1G06680,{}
6,ATCG00080,ATCG00080,{}
7,AT2G06520,AT2G06520,{}
8,AT1G79040,AT1G79040,{}
9,AT5G66570,AT5G66570,{}


In [None]:
df_genes_aracore['aracore_ids'].shape[0] #804 genes

804

# Retrieve additional information on genes using the ENSEMBL API

In [None]:
#EXAMPLE REQUEST:

#Look up the gene symbol, the location (chromosome, start and end) and the
#uniprot id/descriptive name of a single gene with the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

gene_id = 'AT4G05180'

#Base address of the ENSEMBL REST server
server = "https://rest.ensembl.org"

#Endpoint that returns the meta-information of a gene for a given id
ext = f"/lookup/id/{gene_id}?expand=1"

#Send the request
r = requests.get(f"{server}{ext}", headers={"Content-Type": "application/json"})

#Keep the decoded json (dictionary-like structure) if the request was successful, None otherwise
json_anno = r.json() if r.ok else None

json_anno

{'Transcript': [{'Exon': [{'assembly_name': 'TAIR10',
     'db_type': 'core',
     'end': 2673243,
     'id': 'AT4G05180.1.exon1',
     'object_type': 'Exon',
     'seq_region_name': '4',
     'species': 'arabidopsis_thaliana',
     'start': 2672873,
     'strand': -1},
    {'assembly_name': 'TAIR10',
     'db_type': 'core',
     'end': 2672635,
     'id': 'AT4G05180.1.exon2',
     'object_type': 'Exon',
     'seq_region_name': '4',
     'species': 'arabidopsis_thaliana',
     'start': 2672340,
     'strand': -1},
    {'assembly_name': 'TAIR10',
     'db_type': 'core',
     'end': 2672191,
     'id': 'AT4G05180.1.exon3',
     'object_type': 'Exon',
     'seq_region_name': '4',
     'species': 'arabidopsis_thaliana',
     'start': 2671900,
     'strand': -1},
    {'assembly_name': 'TAIR10',
     'db_type': 'core',
     'end': 2671718,
     'id': 'AT4G05180.1.exon4',
     'object_type': 'Exon',
     'seq_region_name': '4',
     'species': 'arabidopsis_thaliana',
     'start': 2671523,
  

In [None]:
#Retrieve Information on gene symbol, location (chromosome, start and end), uniprot id/descriptive name using the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/lookup

def lookup_ensembl_id(gene_id):
  """Look up basic gene information for a gene id with the ENSEMBL REST API.

  Requests ``https://rest.ensembl.org/lookup/id/<gene_id>?expand=1`` and copies
  selected entries of the json answer into a pandas series.

  Parameters
  ----------
  gene_id : str
      Gene identifier that is inserted into the request URL, e.g. 'AT4G05180'.

  Returns
  -------
  pandas.Series
      Series of dtype object with the index 'symbol', 'chr', 'start', 'end'
      and 'description', filled from the json keys 'display_name',
      'seq_region_name', 'start', 'end' and 'description'. An entry is None
      if its key is missing in the answer; all entries are None if the
      request was not successful or could not be completed (connection
      problem, timeout).
  """

  #Series index -> key of the json answer that holds the corresponding value
  json_keys = {
    'symbol': 'display_name',
    'chr': 'seq_region_name',
    'start': 'start',
    'end': 'end',
    'description': 'description',
  }

  #pandas series to hold information, pre-filled with None.
  #dtype=object is given explicitly: without it the dtype of the empty series depends
  #on the pandas version (float64 in older versions), which can turn values into floats.
  ds_anno = pd.Series([None] * len(json_keys), index=list(json_keys), dtype=object)

  #ENSEMBL Server Url
  server = "https://rest.ensembl.org"

  #URL specification to get meta-information of a gene by its id
  ext = f"/lookup/id/{gene_id}?expand=1"

  #Request information from URL in json format
  try:
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=30)
  except requests.RequestException:
    #A connection problem or timeout is handled like an unsuccessful request,
    #so a single failing gene does not abort a run over the whole gene table
    return ds_anno

  #Check if request was successful
  if r.ok:

    #Get json (dictionary-like structure) from request
    json_anno = r.json()

    #Save relevant information in pandas series
    for field, key in json_keys.items():
      ds_anno[field] = json_anno.get(key, None)

  #return pandas series
  return ds_anno

In [None]:
#EXAMPLE REQUEST

#Fetch the cross references (e.g. the NCBI gene id) of a single gene with the ENSEMBL REST API
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

gene_id = 'AT4G05180'

#Base address of the ENSEMBL REST server
server = "https://rest.ensembl.org"

#Endpoint that returns the cross references of a gene for a given id
ext = f"/xrefs/id/{gene_id}?"

#Send the request
r = requests.get(f"{server}{ext}", headers={"Content-Type": "application/json"})

if not r.ok:
    #Request was not successful: there is nothing to show
    df_anno = None
else:
    #Decode the json (dictionary-like structure) of the answer
    json_anno = r.json()

    #Turn it into a dataframe indexed by the database name to easily retrieve information
    df_anno = pd.DataFrame(json_anno).set_index('dbname')

df_anno

Unnamed: 0_level_0,info_type,version,info_text,display_id,synonyms,description,primary_id,db_display_name,score,xref_identity,ensembl_identity,xref_start,ensembl_start,ensembl_end,xref_end,cigar_line,evalue
dbname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NASC_GENE_ID,DIRECT,0,,AT4G05180-TAIR-G,[],photosystem II subunit Q-2,AT4G05180,NASC Gene ID,,,,,,,,,
ArrayExpress,DIRECT,0,,AT4G05180,[],,AT4G05180,Expression Atlas,,,,,,,,,
TAIR_LOCUS,DIRECT,0,,AT4G05180,[],photosystem II subunit Q-2,AT4G05180,TAIR,,,,,,,,,
TAIR_SYMBOL,DIRECT,0,,PSBQ,"[PSBQ-1, PSBQ-2, PSBQA, PSII-Q]",,PSBQ,TAIR Gene Name,,,,,,,,,
UniGene,SEQUENCE_MATCH,0,,At.21853,[],Oxygen-evolving enhancer protein 3-2,At.21853,UniGene,5185.0,100.0,96.0,1.0,1.0,1037.0,1037.0,1037M,
EntrezGene,DEPENDENT,0,,825866,[],,825866,NCBI gene (formerly Entrezgene),,,,,,,,,
KNETMINER_ARA,DEPENDENT,0,,AT4G05180,[],,AT4G05180,KNETMINER_ARA,,,,,,,,,


In [None]:
#Retrieve Information on cross references for each gene symbol (NCBI gene id)
#https://rest.ensembl.org/
#https://rest.ensembl.org/documentation/info/xref_id

def xrefs_ensembl_id(gene_id):
  """Retrieve the NCBI (Entrez) gene id that is cross-referenced to a gene id.

  Requests ``https://rest.ensembl.org/xrefs/id/<gene_id>?`` and searches the
  returned cross references for an entry whose 'dbname' is 'EntrezGene'.

  Parameters
  ----------
  gene_id : str
      Gene identifier that is inserted into the request URL, e.g. 'AT4G05180'.

  Returns
  -------
  pandas.Series
      Series of dtype object with the single index label 'ncbigene'. It holds
      the 'primary_id' of the first 'EntrezGene' cross reference as a string.
      The value is None if there is no such cross reference (including an
      empty answer), if the request was not successful, or if it could not be
      completed (connection problem, timeout).
  """

  #pandas series to hold information, pre-filled with None.
  #dtype=object is given explicitly: without it the dtype of the empty series depends
  #on the pandas version (float64 in older versions), which can turn the id into a float.
  ds_anno = pd.Series([None], index=['ncbigene'], dtype=object)

  #ENSEMBL Server Url
  server = "https://rest.ensembl.org"
  #URL specification to get the cross references of a gene by its id
  ext = f"/xrefs/id/{gene_id}?"

  #Request information from URL in json format
  try:
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=30)
  except requests.RequestException:
    #A connection problem or timeout is handled like an unsuccessful request,
    #so a single failing gene does not abort a run over the whole gene table
    return ds_anno

  #Check if request was successful
  if r.ok:

    #The json answer is expected to be a list with one dictionary per cross reference.
    #Walking through it directly also copes with an empty list (no cross references)
    #and with several 'EntrezGene' entries, of which the first one is used.
    for xref in r.json():
      if isinstance(xref, dict) and xref.get('dbname') == 'EntrezGene' and xref.get('primary_id') is not None:
        ds_anno['ncbigene'] = str(xref['primary_id'])
        break

  return ds_anno

In [None]:
#Run xrefs_ensembl_id() on every AraCore gene id (in parallel) to collect the NCBI gene ids
df_genes_xref = df_genes_aracore['aracore_ids'].parallel_apply(xrefs_ensembl_id)

  
  


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=402), Label(value='0 / 402'))), HB…

In [None]:
#Run lookup_ensembl_id() on every AraCore gene id (in parallel) to collect symbol, location and description
#This may take a few minutes
df_genes_lookup = df_genes_aracore['aracore_ids'].parallel_apply(lookup_ensembl_id)

  
  


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=402), Label(value='0 / 402'))), HB…

# Merge information into a single mapping table

In [None]:
#Concat dataframe df_genes_aracore, df_genes_lookup, df_genes_xref
#Only the three AraCore columns are taken from df_genes_aracore, so that running this cell
#again does not append the lookup/xref columns a second time
df_genes_aracore = pd.concat(
    [
        df_genes_aracore[['aracore_ids', 'aracore_name', 'aracore_annotations']],
        df_genes_lookup,
        df_genes_xref,
    ],
    axis=1,
)
df_genes_aracore

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene
0,AT4G05180,AT4G05180,{},PSBQ2,4,2671523.0,2673243.0,"Oxygen-evolving enhancer protein 3-2, chloropl...",825866
1,AT2G30570,AT2G30570,{},PSBW,2,13019028.0,13020311.0,PSBW [Source:UniProtKB/TrEMBL;Acc:A0A178VQ32],817606
2,ATCG00560,ATCG00560,{},PSBL,Pt,63804.0,63920.0,Photosystem II reaction center protein L [Sour...,844751
3,AT4G21280,AT4G21280,{},PSBQ1,4,11334352.0,11335815.0,"Oxygen-evolving enhancer protein 3-1, chloropl...",827877
4,ATCG00300,ATCG00300,{},PSBZ,Pt,35751.0,35939.0,Photosystem II reaction center protein Z [Sour...,844774
...,...,...,...,...,...,...,...,...,...
799,AT3G27620,AT3G27620,{},AOX1C,3,10229045.0,10230707.0,"Ubiquinol oxidase 1c, mitochondrial [Source:Un...",822384
800,AT5G64210,AT5G64210,{},AOX2,5,25683770.0,25685731.0,"Ubiquinol oxidase 2, mitochondrial [Source:Uni...",836542
801,AT1G32350,AT1G32350,{},AOX3,1,11666886.0,11668690.0,AOX1D [Source:UniProtKB/TrEMBL;Acc:A0A384LFI3],840127
802,AT3G22360,AT3G22360,{},AOX1B,3,7904097.0,7905576.0,"Ubiquinol oxidase 1b, mitochondrial [Source:Un...",821805


In [None]:
#Separate descriptive name from uniprot symbol by first appearance of "[", expand into two columns
#e.g. "Oxygen-evolving enhancer protein 3-2, chloroplastic [Source:UniProtKB/Swiss-Prot;Acc:Q41932]" -> "Oxygen-evolving enhancer protein 3-2, chloroplastic " & "Source:UniProtKB/Swiss-Prot;Acc:Q41932]"

#The number of splits is passed as keyword (n=1): newer pandas versions accept only `pat` as positional argument
df_name_uniprot = df_genes_aracore['description'].str.split('[', n=1, expand=True)
df_name_uniprot

Unnamed: 0,0,1
0,"Oxygen-evolving enhancer protein 3-2, chloropl...",Source:UniProtKB/Swiss-Prot;Acc:Q41932]
1,PSBW,Source:UniProtKB/TrEMBL;Acc:A0A178VQ32]
2,Photosystem II reaction center protein L,Source:UniProtKB/Swiss-Prot;Acc:P60129]
3,"Oxygen-evolving enhancer protein 3-1, chloropl...",Source:UniProtKB/Swiss-Prot;Acc:Q9XFT3]
4,Photosystem II reaction center protein Z,Source:UniProtKB/TrEMBL;Acc:A0A1B1W4U3]
...,...,...
799,"Ubiquinol oxidase 1c, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O22048]
800,"Ubiquinol oxidase 2, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O22049]
801,AOX1D,Source:UniProtKB/TrEMBL;Acc:A0A384LFI3]
802,"Ubiquinol oxidase 1b, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O23913]


In [None]:
#Merge with df_genes_aracore and rename the two new columns (0 -> gene_name, 1 -> uniprot_id)
#Columns 'gene_name'/'uniprot_id' left over from an earlier run of this cell are dropped first,
#so that running the cell again does not create duplicated columns
df_genes_aracore = pd.concat(
    [
        df_genes_aracore.drop(columns=['gene_name', 'uniprot_id'], errors='ignore'),
        df_name_uniprot,
    ],
    axis=1,
).rename(columns={0: 'gene_name', 1: 'uniprot_id'})
df_genes_aracore

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,0,1
0,AT4G05180,AT4G05180,{},PSBQ2,4,2671523.0,2673243.0,"Oxygen-evolving enhancer protein 3-2, chloropl...",825866,"Oxygen-evolving enhancer protein 3-2, chloropl...",Source:UniProtKB/Swiss-Prot;Acc:Q41932]
1,AT2G30570,AT2G30570,{},PSBW,2,13019028.0,13020311.0,PSBW [Source:UniProtKB/TrEMBL;Acc:A0A178VQ32],817606,PSBW,Source:UniProtKB/TrEMBL;Acc:A0A178VQ32]
2,ATCG00560,ATCG00560,{},PSBL,Pt,63804.0,63920.0,Photosystem II reaction center protein L [Sour...,844751,Photosystem II reaction center protein L,Source:UniProtKB/Swiss-Prot;Acc:P60129]
3,AT4G21280,AT4G21280,{},PSBQ1,4,11334352.0,11335815.0,"Oxygen-evolving enhancer protein 3-1, chloropl...",827877,"Oxygen-evolving enhancer protein 3-1, chloropl...",Source:UniProtKB/Swiss-Prot;Acc:Q9XFT3]
4,ATCG00300,ATCG00300,{},PSBZ,Pt,35751.0,35939.0,Photosystem II reaction center protein Z [Sour...,844774,Photosystem II reaction center protein Z,Source:UniProtKB/TrEMBL;Acc:A0A1B1W4U3]
...,...,...,...,...,...,...,...,...,...,...,...
799,AT3G27620,AT3G27620,{},AOX1C,3,10229045.0,10230707.0,"Ubiquinol oxidase 1c, mitochondrial [Source:Un...",822384,"Ubiquinol oxidase 1c, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O22048]
800,AT5G64210,AT5G64210,{},AOX2,5,25683770.0,25685731.0,"Ubiquinol oxidase 2, mitochondrial [Source:Uni...",836542,"Ubiquinol oxidase 2, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O22049]
801,AT1G32350,AT1G32350,{},AOX3,1,11666886.0,11668690.0,AOX1D [Source:UniProtKB/TrEMBL;Acc:A0A384LFI3],840127,AOX1D,Source:UniProtKB/TrEMBL;Acc:A0A384LFI3]
802,AT3G22360,AT3G22360,{},AOX1B,3,7904097.0,7905576.0,"Ubiquinol oxidase 1b, mitochondrial [Source:Un...",821805,"Ubiquinol oxidase 1b, mitochondrial",Source:UniProtKB/Swiss-Prot;Acc:O23913]


In [None]:
#Extract Uniprot id by splitting on last appearance of ':', expand columns, but only keep the last column for the uniprot id, the rest is trash
#e.g. Source:UniProtKB/Swiss-Prot;Acc:Q41932] -> "Source:UniProtKB/Swiss-Prot;Acc" & "Q41932]"
#The number of splits is passed as keyword (n=1): newer pandas versions accept only `pat` as positional argument
df_genes_aracore['uniprot_id'] = df_genes_aracore['uniprot_id'].str.rsplit(':', n=1, expand=True).iloc[:,-1]

#Remove closing "]" from uniprot id
#e.g. "Q41932]" -> "Q41932"
#rstrip(']') is used instead of cutting off the last character, so an id that is already clean
#stays untouched when this cell is run again; entries that are not strings become None
#NOTE(review): for descriptions with a TAIR source (e.g. "[Source:TAIR;Acc:ATCG00890]") this column
#ends up holding the TAIR locus id rather than a UniProt accession
df_genes_aracore['uniprot_id'] = df_genes_aracore['uniprot_id'].apply(lambda x: x.rstrip(']') if isinstance(x,str) else None)
df_genes_aracore

In [None]:
df_genes_aracore

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,gene_name,uniprot_id
0,AT4G05180,AT4G05180,{},PSBQ2,4,2671523.0,2673243.0,"Oxygen-evolving enhancer protein 3-2, chloropl...",825866,"Oxygen-evolving enhancer protein 3-2, chloropl...",Q41932
1,AT2G30570,AT2G30570,{},PSBW,2,13019028.0,13020311.0,PSBW [Source:UniProtKB/TrEMBL;Acc:A0A178VQ32],817606,PSBW,A0A178VQ32
2,ATCG00560,ATCG00560,{},PSBL,Pt,63804.0,63920.0,Photosystem II reaction center protein L [Sour...,844751,Photosystem II reaction center protein L,P60129
3,AT4G21280,AT4G21280,{},PSBQ1,4,11334352.0,11335815.0,"Oxygen-evolving enhancer protein 3-1, chloropl...",827877,"Oxygen-evolving enhancer protein 3-1, chloropl...",Q9XFT3
4,ATCG00300,ATCG00300,{},PSBZ,Pt,35751.0,35939.0,Photosystem II reaction center protein Z [Sour...,844774,Photosystem II reaction center protein Z,A0A1B1W4U3
...,...,...,...,...,...,...,...,...,...,...,...
799,AT3G27620,AT3G27620,{},AOX1C,3,10229045.0,10230707.0,"Ubiquinol oxidase 1c, mitochondrial [Source:Un...",822384,"Ubiquinol oxidase 1c, mitochondrial",O22048
800,AT5G64210,AT5G64210,{},AOX2,5,25683770.0,25685731.0,"Ubiquinol oxidase 2, mitochondrial [Source:Uni...",836542,"Ubiquinol oxidase 2, mitochondrial",O22049
801,AT1G32350,AT1G32350,{},AOX3,1,11666886.0,11668690.0,AOX1D [Source:UniProtKB/TrEMBL;Acc:A0A384LFI3],840127,AOX1D,A0A384LFI3
802,AT3G22360,AT3G22360,{},AOX1B,3,7904097.0,7905576.0,"Ubiquinol oxidase 1b, mitochondrial [Source:Un...",821805,"Ubiquinol oxidase 1b, mitochondrial",O23913


In [None]:
df_genes_aracore[df_genes_aracore.isna().sum(axis=1) == 8] #2 Genes have no annotations at all -> Look-up those genes, maybe the Gene ids are wrong!?!

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,gene_name,uniprot_id
683,AT5G34920,AT5G34920,{},,,,,,,,
721,AT3G600180,AT3G600180,{},,,,,,,,


In [None]:
df_genes_aracore[df_genes_aracore['ncbigene'].isna()] # ~28 genes without NCBI Symbol

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,gene_name,uniprot_id
43,ATCG00890,ATCG00890,{},NDHB.1,Pt,94941.0,96795.0,NADH-Ubiquinone/plastoquinone (complex I) prot...,,NADH-Ubiquinone/plastoquinone (complex I) prot...,ATCG00890
58,ATCG01050,ATCG01050,{},NDHD,Pt,115665.0,117185.0,NADH-Ubiquinone/plastoquinone (complex I) prot...,,NADH-Ubiquinone/plastoquinone (complex I) prot...,ATCG01050
181,AT1G17000,AT1G17000,{},ATTPS3,1,5812728.0,5816662.0,trehalose-phosphatase/synthase 3 [Source:TAIR;...,,trehalose-phosphatase/synthase 3,AT1G17000
306,ATMG00510,ATMG00510,{},NAD7,Mt,132071.0,138153.0,NADH dehydrogenase subunit 7 [Source:UniProtKB...,,NADH dehydrogenase subunit 7,G1C2X4
307,ATMG00580,ATMG00580,{},NAD4,Mt,161693.0,169674.0,NADH-ubiquinone oxidoreductase chain 4 [Source...,,NADH-ubiquinone oxidoreductase chain 4,G1C2U9
308,ATMG00060,ATMG00060,{},NAD5C,Mt,20571.0,22086.0,NADH dehydrogenase subunit 5C [Source:TAIR;Acc...,,NADH dehydrogenase subunit 5C,ATMG00060
309,ATMG00070,ATMG00070,{},NAD9,Mt,23663.0,24235.0,NADH dehydrogenase subunit 9 [Source:TAIR;Acc:...,,NADH dehydrogenase subunit 9,ATMG00070
310,ATMG01275,ATMG01275,{},NAD1A,Mt,318004.0,318390.0,NADH dehydrogenase 1A [Source:TAIR;Acc:ATMG01275],,NADH dehydrogenase 1A,ATMG01275
311,ATMG00285,ATMG00285,{},NAD2A,Mt,79740.0,81297.0,NADH dehydrogenase 2A [Source:TAIR;Acc:ATMG00285],,NADH dehydrogenase 2A,ATMG00285
314,ATMG00270,ATMG00270,{},NAD6,Mt,76642.0,77259.0,NADH-ubiquinone oxidoreductase chain 6 [Source...,,NADH-ubiquinone oxidoreductase chain 6,G1C2Y0


In [None]:
df_genes_aracore[df_genes_aracore['symbol'].isna()].head(100) # 97 genes have no official gene #symbol

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,gene_name,uniprot_id
30,AT2G26500,AT2G26500,{},,2,11270082.0,11271074.0,At2g26500/T9J22.17 [Source:UniProtKB/TrEMBL;Ac...,817191,At2g26500/T9J22.17,O48717
94,AT5G38410,AT5G38410,{},,5,15376988.0,15378642.0,Ribulose bisphosphate carboxylase small chain ...,833828,Ribulose bisphosphate carboxylase small chain,B3H5S2
98,AT1G56190,AT1G56190,{},,1,21028137.0,21030684.0,Phosphoglycerate kinase [Source:UniProtKB/TrEM...,842072,Phosphoglycerate kinase,A0A178W4Q1
116,AT5G44520,AT5G44520,{},,5,17934287.0,17936554.0,NagB/RpiA/CoA transferase-like superfamily pro...,834479,NagB/RpiA/CoA transferase-like superfamily pro...,AT5G44520
143,AT4G26520,AT4G26520,{},,4,13388290.0,13390862.0,Aldolase superfamily protein [Source:TAIR;Acc:...,828758,Aldolase superfamily protein,AT4G26520
...,...,...,...,...,...,...,...,...,...,...,...
721,AT3G600180,AT3G600180,{},,,,,,,,
732,AT4G13720,AT4G13720,{},,4,7966805.0,7968965.0,Inosine triphosphate pyrophosphatase family pr...,827006,Inosine triphosphate pyrophosphatase family pr...,AT4G13720
735,AT4G23895,AT4G23895,{},,4,12422036.0,12426475.0,Nucleoside diphosphate kinase [Source:UniProtK...,2.74572e+06,Nucleoside diphosphate kinase,F4JPD8
774,AT4G05590,AT4G05590,{},,4,2907012.0,2908631.0,CONTAINS InterPro DOMAIN/s: Uncharacterised pr...,825927,CONTAINS InterPro DOMAIN/s: Uncharacterised pr...,AT4G05590


In [None]:
df_genes_aracore[df_genes_aracore['uniprot_id'].isna()] #2 genes have no uniprot id (same as above)

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,symbol,chr,start,end,description,ncbigene,gene_name,uniprot_id
683,AT5G34920,AT5G34920,{},,,,,,,,
721,AT3G600180,AT3G600180,{},,,,,,,,


In [None]:
#Export final mapping table for manual mapping
df_genes_aracore.to_csv('drive/MyDrive/2021-05-27-genes-mapping-table.csv')