In [28]:
SAMPLE = True
SAMPLE_SUFFIX = '_sample' # for input and output files

#  Lexical Resource (LR) for output

#### Imports

In [29]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

#### Paths

In [30]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.csv'),
    eppo_sci = get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.csv'),
    efsa = get_datafile_path('efsa_keyword_match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('original/taxa+id_full.txt'), # or taxa+id_microbes+insects.txt
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
output_path = os.path.join(data_path, 'output')
output_gloss_path = os.path.join(data_path, 'output_glossary.csv')
LR_filepath = 'LR.csv'

#### Definitions

Defining functions for preprocessing of original data

In [57]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

def find_value_cross_db(main_df,
                        lookup_df,
                        matching_columns = ('',''),
                        lookup_col = ''
                        ):
    for row_i in len(main_df):
        row = lookup_df[lookup_df[matching_columns[1]] == main_df.iloc[row_i][matching_columns[0]]]
        cell_value = row[0][lookup_col]
        main_df.iloc[row_i][matching_columns[0]] = cell_value
    return main_df
    

#### Parameters

In [32]:
## parameters to read ncbi file
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','_', '__']

## which ncbi file to use (specifying extension)
ncbi_ext = '.csv'
db_paths['ncbi'] = db_paths['ncbi'][:-(len(ncbi_ext))]+ncbi_ext

## use a shorter sample file for testing 
if SAMPLE:
    for k in db_paths.keys():
        db_paths[k] = db_paths[k][:-4]+SAMPLE_SUFFIX+db_paths[k][-4:]
    LR_filepath = LR_filepath[:-4]+SAMPLE_SUFFIX+LR_filepath[-4:]

## name of the desired coluimns in the output LR
output_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [33]:
## column in each of the original datasets corresponding to the output LR column 
db_names_to_relevant_col_names = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                None, 'Unnamed: 3'],
    ncbi = [None, None, 'taxid',
            'sci_name', 'synonym', None,
            None, None],
    )

#### Load preprocessed NCBI instead

path =get_datafile_path('taxa+id_microbes+insects.csv')

with open(path) as f:
    line = f.readline()
    line = f.readline()

print(line)

describe_csv(path)

get_csv_head(path)

pd.read_csv(path, on_bad_lines='skip')

#### Check Data

In [34]:
db_paths['eppo_com']

'/home/elubrini/GitHub/bio-corpus-translation/data/original/2022-09-02_COMMONnames_EPPO_OQ_sample.csv'

In [35]:
for db_name, path in db_paths.items():
    dbg(len(describe_csv(path).columns))
    if (len(describe_csv(path).columns))<=2:
        dbg(path)
        display(describe_csv(path))
        print('starting csv format standardisation:')
        normalised_df = pd.read_csv(path, names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep, keep_default_na=False).reset_index(level=0)
        print('standardised:')
        display(normalised_df.head())
        new_path = path[:-4]+'.csv'
        
        normalised_df.to_csv(new_path, index=False,)
        db_paths[db_name] = new_path
    display(pd.read_csv(path, keep_default_na=False).head())

[35m[32mmess: [0m5


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
0,A,,Acode,Asci,Asci
1,B,Bengalese,Bcode,Bsci,Bsci
2,C,Congolese,Ccode,Csci,Csci


[35m[32mmess: [0m5


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
0,Z,authZ,Zcode,Zsci,Zsci


[35m[32m_12: [0m4


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
0,Apht,A,,Albanian
1,Bpht,B,,
2,Dpht,D,,


[35m[32mprep_eppo_db_i: [0m3


Unnamed: 0,taxid,sci_name,synonym
0,Ataxid,Asci,A
1,Dtaxid,Dsci,D


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

#### Display examples of input datasets
display(pd.read_csv(db_paths['eppo_sci']).describe())
pd.read_csv(db_paths['eppo_com']).describe()

#### create dictionary of databases
Each database in the dictionary corresponds to one of the original databases, but containing only the relevant columns, renamed to match the output datatabase

In [36]:
db = dict() 

for db_name, columns in db_names_to_relevant_col_names.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database (in dict of columns format) in db dictionary
    colum_renames_zip = zip(columns, output_gloss_column_names)  # match source-db column names to final-db column names

    ## iterate through columns in source dbs, preprocess them and use them to populate corresponding preprocessed database
    for orig_col, targ_col in colum_renames_zip: # loop over column renaming pairs
        if isinstance(orig_col, str): # if only column name specified (as opposed to col name + function), take column as is
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType): # else, if column must not be selected, continue
            continue
        else:    # else, if a preprocessing function has been specified, together with column name, use it when extracting column
            fun = orig_col[1]
            orig_col = orig_col[0]
            
        # adjust reading parameters to ncbi format, if needed 
        try:
            ds_col = pd.read_csv(db_paths[db_name], on_bad_lines='skip', keep_default_na=False)[orig_col]
        except:
            ds_col = pd.read_csv(db_paths[db_name], names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep, keep_default_na=False).reset_index(level=0)[orig_col]
        
        ## add column to the corresponding preprocessed db
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = preprocessed old_db col
    
    ## save each db
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(output_path, db_name+'_table.csv'), index=False)


### (2) Join Preprocessed Datasets

In [37]:
## Load previously saved datasets 
prep_dbs = dict()

for db_name in db_names_to_relevant_col_names.keys():
    prep_dbs[db_name] = pd.read_csv(os.path.join(output_path, db_name + '_table.csv'), on_bad_lines='skip', keep_default_na=False)

In [38]:
## check they loaded correctly (e.g. ncbi)
prep_dbs['ncbi'].head()

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
0,Ataxid,Asci,A
1,Dtaxid,Dsci,D


#### A concatenated version (legacy code; to be omitted in future version)

In [39]:
len(list(prep_dbs.items()))

4

In [40]:
list(prep_dbs.items())[0][1]

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,Acode,Asci,A,
1,Bcode,Bsci,B,bn
2,Ccode,Csci,C,kg


In [41]:
conc_list = list(prep_dbs.values())
conc_list[-1] = conc_list[-1].head()

In [42]:
concat_df = pd.concat(
    conc_list,
    ignore_index=True)
concat_df.to_csv('concatenated_glossary.csv')
len(concat_df)

9

In [43]:
concat_df.head()

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language,EFSA_PHT,re,match_language,NCBI_TAXID
0,Acode,Asci,A,,,,,
1,Bcode,Bsci,B,bn,,,,
2,Ccode,Csci,C,kg,,,,
3,Zcode,Zsci,Z,,,,,
4,,,,,Apht,A,Albanian,


#### Create empty DF with col names

In [44]:
empty_db = pd.DataFrame(columns=output_gloss_column_names)
display(empty_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language


In [45]:
## prep ncbi db entities will be used to complete other database rows 
prep_ncbi_db = prep_dbs['ncbi']

df1 = pd.DataFrame([[3,4],[5,6]], columns=['a','b'])
df2 = pd.DataFrame([[5,6],[5,6]], columns=['a','c'])
display(df1)
pd.concat([df1,df2], ignore_index=True)

#### Add EPPO info to final DB

In [46]:
db_names = list(str(x) for x in prep_dbs.keys()) # names of EPPO databases

In [47]:
# copy into a new db to which eppo information will be added 
db_w_eppo_data = empty_db.copy()

## [EPPO-ONLY STEP] open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True).fillna('')
display(prep_eppo_db.head())
len(prep_eppo_db)

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,Acode,Asci,A,
1,Bcode,Bsci,B,bn
2,Ccode,Csci,C,kg
3,Zcode,Zsci,Z,


4

In [48]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    syn = str(prep_eppo_db['synonym'][prep_eppo_db_i]) 
    lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
    key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    ## rows (entities) in NCBI that have a matching synonym or scientific name
    prep_ncbi_syn_rows = prep_ncbi_db[prep_ncbi_db['synonym']==syn]
    prep_ncbi_sci_rows = prep_ncbi_db[(prep_ncbi_db['preferred_name']==sci_name) | (prep_ncbi_db['preferred_name']=="'"+sci_name+"'")]
    prep_ncbi_matching_rows = pd.concat([prep_ncbi_syn_rows, prep_ncbi_sci_rows], ignore_index=True)
    
    ## list taxids and sci names of matching entities
    taxid_col = (prep_ncbi_matching_rows['NCBI_TAXID'])
    sci_name_col = prep_ncbi_matching_rows['preferred_name']
    
    ## if there's at least one match in NCBI, take the first one 
    if len(prep_ncbi_matching_rows) != 0:
        taxid = taxid_col[0]
        sci_name = sci_name_col[0]
    else:
        taxid = ''
        sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    ## add collected info from NCBI and add it to row
    new_eppo_row = pd.DataFrame([[taxid, sci_name, syn, lang, key]], columns=['NCBI_TAXID', 'preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
    
    ## add row to 
    db_w_eppo_data = pd.concat([db_w_eppo_data, new_eppo_row], ignore_index=True).fillna('')

100%|██████████| 4/4 [00:00<00:00, 182.58it/s]


In [49]:
db_w_eppo_data

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,Acode,,Ataxid,Asci,A,,,
1,Bcode,,,Bsci,B,,bn,
2,Ccode,,,Csci,C,,kg,
3,Zcode,,,Zsci,Z,,,


In [50]:
db_w_eppo_data.to_csv('output.csv')

for i in range(10):
    print('i is:'+str(i))
    for j in range(10):
        print(j)
        if j%5 == 0 and j>0:
            print("break")
            break

#### Add EFSA info to final DB

EFSA is made of reg expression, while EPPO is made of synonyms.
In orfer to add EFSA information, to the database that is already populated with EPPO data, the regexes will be applied to the synonyms. If a match is found, the EFSA data will complete the previously EPPO-only row, else a new row will be created.

In [51]:
# copy into a new db to which eppo information will be added 
db_w_eppo_and_efsa_data = db_w_eppo_data.copy()
display(db_w_eppo_and_efsa_data)
## concatenate all dbs with matching name (normally only one in the case of EFSA)
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True).fillna('')
display(prep_efsa_db.head())
len(prep_efsa_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,Acode,,Ataxid,Asci,A,,,
1,Bcode,,,Bsci,B,,bn,
2,Ccode,,,Csci,C,,kg,
3,Zcode,,,Zsci,Z,,,


Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,Apht,,A,Albanian
1,Bpht,,B,
2,Dpht,,D,


3

In [56]:
## loop over regexes in EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    match_found = False
    
    ## GET EFSA INFO
    
    pattern = prep_efsa_db['re'][prep_efsa_db_i]
    sci_name = prep_efsa_db['preferred_name'][prep_efsa_db_i]
    efsa_php = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
    
    ## GET NCBI INFO
    efsa_taxid = prep_efsa_db['NCBI_TAXID'][prep_efsa_db_i]
    dbg(efsa_taxid, pattern)
    if len(efsa_taxid) == 0: 
        ## rows (entities) in NCBI that have a matching synonym or scientific name with efsa regex
        prep_ncbi_syn_row = pd.DataFrame(columns=prep_ncbi_db.columns)
        print('ITERATION BEGINS')
        for i, syn in enumerate(prep_ncbi_db['synonym']):
            dbg(pattern, syn)
            if re.fullmatch(str(pattern), str(syn)):
                prep_ncbi_syn_row = prep_ncbi_db.iloc[i]
                print('MATCH')
                break # only takes first match to minimise computation timeprint(type(sci_name))
        prep_ncbi_sci_rows = prep_ncbi_db.loc[(prep_ncbi_db['preferred_name']==str(sci_name)) | (prep_ncbi_db['preferred_name']=="'"+str(sci_name)+"'")]
        prep_ncbi_matching_rows = pd.concat([prep_ncbi_syn_row, prep_ncbi_sci_rows], ignore_index=True).fillna('')
        
        ## list taxids and sci names of matching entities
        taxid_col = (prep_ncbi_matching_rows['NCBI_TAXID'])
        sci_name_col = prep_ncbi_matching_rows['preferred_name']
        
        ## if there's at least one match in NCBI, take the first one 
        if len(prep_ncbi_matching_rows) != 0:
            taxid = taxid_col[0]
            sci_name = sci_name_col[0]
        else:
            taxid = ''
            sci_name = prep_efsa_db['preferred_name'][prep_efsa_db_i]
    
    ## INSERT DATA INTO FINAL DB
    
    ## iterate over final db rows
    for db_w_eppo_and_efsa_data_i in range(len(db_w_eppo_and_efsa_data)):
        syn = db_w_eppo_and_efsa_data['synonym'][db_w_eppo_and_efsa_data_i]
        eppo_taxid = db_w_eppo_and_efsa_data['NCBI_TAXID'][db_w_eppo_and_efsa_data_i]
        
        ## matching conditions
        match_1 = re.fullmatch(str(pattern), str(syn))
        match_2 = eppo_taxid and taxid == eppo_taxid
    
        ## if re matches a syn, add: re, efsa key, language, taxid
        if match_1 or match_2 :
            db_w_eppo_and_efsa_data['re'][db_w_eppo_and_efsa_data_i] = pattern
            db_w_eppo_and_efsa_data['EFSA_PHT'][db_w_eppo_and_efsa_data_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            db_w_eppo_and_efsa_data['match_language'][db_w_eppo_and_efsa_data_i] = prep_efsa_db['match_language'][prep_efsa_db_i]
            dbg(syn)
            dbg(taxid)
            db_w_eppo_and_efsa_data['NCBI_TAXID'] = taxid
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language, taxid
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        syn = ''
        key = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
        lang = prep_efsa_db['match_language'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, syn, lang, key, taxid]], columns=['re', 'synonym', 'match_language', 'EFSA_PHT', 'NCBI_TAXID'])
        display(new_efsa_row)
        db_w_eppo_and_efsa_data = pd.concat([db_w_eppo_and_efsa_data, new_efsa_row], ignore_index=True).fillna('')

db_w_eppo_and_efsa_data.head(10)

  0%|          | 0/3 [00:00<?, ?it/s]


KeyError: 'NCBI_TAXID'

#### add NCBI info to incomplete EFSA/EPPO lines

In [54]:
final_db = db_w_eppo_and_efsa_data.copy()

In [55]:
## Iterate over final db lines 
for final_db_i in tqdm(range(len(final_db))):
    pattern = final_db.at[final_db_i,'re']
    # check if row has only efsa info
    if pattern and pd.isna(final_db.at[final_db_i,'NCBI_TAXID']):
        # iterate through NCBI to find an item that corresponds to pattern
        for prep_ncbi_db_i in range(len(prep_ncbi_db)):
            syn = prep_ncbi_db.at[prep_ncbi_db_i,'synonym']
            sci_name = prep_ncbi_db.at[prep_ncbi_db_i,'preferred_name']
            taxid = prep_ncbi_db.at[prep_ncbi_db_i,'NCBI_TAXID']
            if re.match(str(pattern), str(syn)):
                syn_already_there = final_db.at[final_db_i,'synonym'] == syn
                dbg(syn,'MATCH:'+pattern+'\nSYN WAS ALREADY THERE'+str(syn_already_there))
                final_db.at[final_db_i,'synonym'] = syn
                if not syn_already_there:
                    dbg(final_db.at[final_db_i,'synonym'], syn)
            
            if re.match(str(pattern), str(syn)) or re.match(str(pattern), str(sci_name)):
                taxid_already_there = final_db.at[final_db_i,'NCBI_TAXID'] == taxid
                pref_name_already_there = final_db.at[final_db_i,'preferred_name'] == sci_name
                dbg(sci_name,'MATCH:'+pattern+'\nTAXID WAS ALREADY THERE'+str(taxid_already_there)+'\npreferred_name WAS ALREADY THERE'+str(pref_name_already_there))
                
                if not pref_name_already_there:
                    dbg(final_db.at[final_db_i,'preferred_name'], sci_name)
                
                if not taxid_already_there:
                    dbg(final_db.at[final_db_i,'NCBI_TAXID'], taxid)
                
                final_db.at[final_db_i,'NCBI_TAXID'] = taxid
                final_db.at[final_db_i,'preferred_name'] = sci_name
                break
        #print('checked all NCBI for '+pattern)
display(final_db.head())
len(final_db)

100%|██████████| 5/5 [00:00<00:00, 7251.56it/s]


Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,Acode,Apht,,Asci,A,A,,Albanian
1,Bcode,Bpht,,Bsci,B,B,bn,
2,Ccode,,,Csci,C,,kg,
3,Zcode,,,Zsci,Z,,,
4,,Dpht,,,,D,,


5

column_names = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID','name']
column_names = ['a','b','c','name']
df = pd.DataFrame([['x','y',None,'tizio'],[None,'y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
df_out = pd.DataFrame([['x','y','z','tizio'],['x','y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
print('INPUT:')
display(df)
print('The output I want:')
display(df_out)
print('Output of my attempt:')
df.groupby(['a','b','c'], as_index=False).agg({'a' : 'max', 'b' : 'max', 'c' : 'max'})

#### Copy complementary info to all terms representing the same entity

In [None]:
column_names = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID','name']
#df_sample = pd.DataFrame([['x','y',None,'tizio'],[None,'y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
#display(df_sample)

## Copy NCBI info to all EFSA terms representing the same entity (same EFSA PHT), etc
cc = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID','preferred_name']
indices = list(range(len(cc)))

for _ in indices:
    #for _ in indices:
    for col_i in indices:
        empty_col_indices = indices.copy()
        empty_col_indices.remove(col_i)
        
        ref_col_name = cc[col_i]
        blank_col_names = [cc[x] for x in empty_col_indices] 
    #dbg(ref_col_name, '\nNEW ITERATION: ')
        ## Iterate over final db lines 
        for final_db_i in tqdm(range(len(final_db))):
    #print('extracting info from row:', final_db_i)
            ## check if row has a value in the reference column 
            ref_col_value = final_db.at[final_db_i,ref_col_name]
    #dbg(ref_col_value)
            if ref_col_value:
                ## store id values from other columns
                id_values = {col_name:final_db.at[final_db_i,col_name] for col_name in blank_col_names}
    #dbg(id_values, ref_col_value)
    #print('all rows with',ref_col_value,'will be completed with',list(id_values.values()))
                for edit_db_i in (range(len(final_db))):
                    for col_name in blank_col_names:
    #dbg(col_name, 'column')
    #dbg(edit_db_i, 'row')
                        if not final_db.at[edit_db_i,col_name] and final_db.at[edit_db_i,ref_col_name]==ref_col_value:
    #print('EDITING')
    #print(col_name,'none, replace with',id_values[col_name])
                            final_db.at[edit_db_i,col_name] = id_values[col_name]
final_db

100%|██████████| 39/39 [00:00<00:00, 272.73it/s]
100%|██████████| 39/39 [00:00<00:00, 1053.88it/s]
100%|██████████| 39/39 [00:00<00:00, 6583.40it/s]
100%|██████████| 39/39 [00:00<00:00, 247.75it/s]
100%|██████████| 39/39 [00:00<00:00, 267.63it/s]
100%|██████████| 39/39 [00:00<00:00, 1013.82it/s]
100%|██████████| 39/39 [00:00<00:00, 5888.54it/s]
100%|██████████| 39/39 [00:00<00:00, 269.35it/s]
100%|██████████| 39/39 [00:00<00:00, 266.26it/s]
100%|██████████| 39/39 [00:00<00:00, 1034.65it/s]
100%|██████████| 39/39 [00:00<00:00, 5276.02it/s]
100%|██████████| 39/39 [00:00<00:00, 267.54it/s]
100%|██████████| 39/39 [00:00<00:00, 260.90it/s]
100%|██████████| 39/39 [00:00<00:00, 1059.19it/s]
100%|██████████| 39/39 [00:00<00:00, 6554.13it/s]
100%|██████████| 39/39 [00:00<00:00, 267.45it/s]


Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,,TEEEEEEEST TAXID,'Candidatus Liberibacter africanus',greening of citrus,,en,
1,LIBEAF,,TEEEEEEEST TAXID,'Candidatus Liberibacter africanus',greening des agrumes,,fr,
2,LIBEAF,,TEEEEEEEST TAXID,'Candidatus Liberibacter africanus',virescence des agrumes,,fr,
3,LIBEAF,,TEEEEEEEST TAXID,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
5,LIBEAS,,,'Candidatus Liberibacter asiaticus',blotchy mottle disease of citrus,,en,
6,LIBEAS,,,'Candidatus Liberibacter asiaticus',citrus dieback,,en,
7,LIBEAS,,,'Candidatus Liberibacter asiaticus',decline of citrus,,en,
8,LIBEAS,,,'Candidatus Liberibacter asiaticus',greening of citrus,,en,
9,LIBEAS,,,'Candidatus Liberibacter asiaticus',huanglongbing,,en,


In [None]:
final_db.to_csv(LR_filepath)

In [None]:
db = pd.read_csv(LR_filepath, keep_default_na=False)

In [None]:
sci_name = 'Ralstonia pseudosolanacearum'

prep_ncbi_db[(prep_ncbi_db['synonym']=='bacteria') | (prep_ncbi_db['synonym']=="'"+sci_name+"'")]

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
1,ncbi:2,Bacteria,bacteria
