In [71]:
SAMPLE = False

#  Lexical Resource (LR) for Annotation

#### Imports

In [72]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

#### Paths

In [73]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.csv'),
    eppo_sci = get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.csv'),
    efsa = get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('taxa+id_full.txt'), # or taxa+id_microbes+insects.txt
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
annotation_path = os.path.join(data_path, 'LRs_for_annotation')
anno_gloss_path = os.path.join(data_path, 'annotation_glossary.csv')
LR_filepath = 'LR.csv'

#### Definitions

Defining functions for preprocessing of original data

In [74]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

#### Parameters

In [75]:
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','_', '__']
ncbi_ext = '.csv'

db_paths['ncbi'] = db_paths['ncbi'][:-4]+ncbi_ext
if SAMPLE:
    for k in db_paths.keys():
        db_paths[k] = db_paths[k][:-4]+' copy'+db_paths[k][-4:]
    LR_filepath = LR_filepath[:-4]+'_sample'+LR_filepath[-4:]

## name of the desired coluimns in the LR
anno_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [76]:
## column in each of the original datasets corresponding to the output LR column 
cc = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                None, 'Unnamed: 3'],
    ncbi = [None, None, 'taxid',
            'sci_name', 'synonym', None,
            None, None],
    )

In [77]:
path =get_datafile_path('taxa+id_microbes+insects.csv')
with open(path) as f:
    line = f.readline()
    line = f.readline()

print(line)

"Euscelis Brulle, 1832",ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:88770/ncbi:6656/ncbi:197563/ncbi:197562/ncbi:6960/ncbi:50557/ncbi:85512/ncbi:7496/ncbi:33340/ncbi:33342/ncbi:7524/ncbi:1955247/ncbi:33365/ncbi:33368/ncbi:30102/ncbi:33372/ncbi:565967/ncbi:6826,NP,genus,,



In [78]:
describe_csv(path)

Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,_,__
count,859017,859017,859017,859017,859017,859017,0.0,0.0
unique,810220,238405,238305,238405,3,28,,
top,P. affinis,ncbi:7175,Culex pipiens,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,species,,
freq,22,174,174,174,855514,766080,,
mean,,,,,,,,
std,,,,,,,,
min,,,,,,,,
25%,,,,,,,,
50%,,,,,,,,
75%,,,,,,,,


In [79]:
get_csv_head(path)

Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,_,__
0,"Euscelis Brulle, 1832",ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
1,Euscelis Brulle,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
2,Euscelis (Brulle),ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
3,Euscelis,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
4,Eusceli,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,


In [80]:
pd.read_csv(path, on_bad_lines='skip')

Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,_,__
0,"Euscelis Brulle, 1832",ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
1,Euscelis Brulle,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
2,Euscelis (Brulle),ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
3,Euscelis,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
4,Eusceli,ncbi:6826,Euscelis,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,genus,,
...,...,...,...,...,...,...,...,...
859012,R.errabunda,ncbi:1519426,Ravinia errabunda,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,species,,
859013,R . errabunda,ncbi:1519426,Ravinia errabunda,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,species,,
859014,"Ravinia errabunda (Wulp, 1896)",ncbi:1519426,Ravinia errabunda,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,species,,
859015,Ravinia errabunda Wulp,ncbi:1519426,Ravinia errabunda,/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:...,NP,species,,


#### Check Data

In [81]:
for db_name, path in db_paths.items():
    dbg(len(describe_csv(path).columns))
    if (len(describe_csv(path).columns))<=2:
        dbg(path)
        display(describe_csv(path))
        print('starting csv format standardisation:')
        normalised_df = pd.read_csv(path, names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep).reset_index(level=0)
        print('standardised:')
        display(normalised_df.head())
        new_path = path[:-4]+'.csv'
        
        normalised_df.to_csv(new_path, index=False,)
        db_paths[db_name] = new_path
    display(describe_csv(path))

[35m[32mmess: [0m5


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


[35m[32mmess: [0m5


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


[35m[32m_27: [0m4


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
count,7558,7558,21,3
unique,1168,7393,5,3
top,Cronartium-PHT,margarodes,Au/Cabi,Fr
freq,85,3,16,1


[35m[32mmess: [0m8


Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,undefined,also_undefined
count,1607725,1607726,1607726,1607726,1607726,1607726,1515574,1515573
unique,1474816,242820,242457,242820,3,40,180141,180140
top,mitosporic Ascomycota,ncbi:1888,Streptomyces albus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:1783272/ncbi:2...,NP,species,ncbi:11320,Influenza A virus
freq,48,532,532,532,1583049,1446117,4350,4350


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

## Display examples of input datasets
display(pd.read_csv(db_paths['eppo_sci']).describe())
pd.read_csv(db_paths['eppo_com']).describe()

In [82]:

db = dict() # create dictionary of databases

for db_name, columns in cc.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database in db dictionary
    cc_zip = zip(columns, anno_gloss_column_names)  # match source-db column names to final-db column names

    for orig_col, targ_col in cc_zip: # loop over matches
        dbg(orig_col)
        if isinstance(orig_col, str): # if only column name specified, take column as is
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType): # else, if column must not to be selected, continue
            continue
        else:    # else, if a preprocessing function has been specified, use it when extracting column
            fun = orig_col[1]
            orig_col = orig_col[0]
            
        # sep = tab (for ncbi)
        try:
            ds_col = pd.read_csv(db_paths[db_name], on_bad_lines='skip')[orig_col]
        except:
            ds_col = pd.read_csv(db_paths[db_name], names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep).reset_index(level=0)[orig_col]
            
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = old_db col
    
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(annotation_path, db_name+'_table.csv'), index=False)


[35m[32morig_col: [0mCodeEOPP
[35m[32m__package__: [0mNone
[35m[32m__package__: [0mNone
[35m[32morig_col: [0mPreferredName
[35m[32morig_col: [0mCommonName
[35m[32m__package__: [0mNone
[35m[32morig_col: [0m['Language', <function get_lang_code at 0x7f732e710d30>]
[35m[32m__package__: [0mNone
[35m[32morig_col: [0mCodeEOPP
[35m[32m__package__: [0mNone
[35m[32m__package__: [0mNone
[35m[32morig_col: [0mPreferredName
[35m[32morig_col: [0mOtherScientificNames
[35m[32m__package__: [0mNone
[35m[32m__package__: [0mNone
[35m[32m__package__: [0mNone
[35m[32m__package__: [0mNone
[35m[32morig_col: [0mCategory (pest name)
[35m[32m__package__: [0mNone
[35m[32morig_col: [0m['Category (pest name)', <function PHT_to_lower_taxon at 0x7f731bd4ae60>]
[35m[32m__package__: [0mNone
[35m[32morig_col: [0m['Keywords', <function to_pattern at 0x7f731bd484c0>]
[35m[32m__package__: [0mNone
[35m[32morig_col: [0mUnnamed: 3
[35m[32m__package__: [

### (2) Join Datasets

In [83]:
prep_dbs = dict()
for db_name in cc.keys():
    prep_dbs[db_name] = pd.read_csv(os.path.join(annotation_path, db_name + '_table.csv'), on_bad_lines='skip')

In [84]:
prep_dbs['ncbi'].head()

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
0,ncbi:2,Bacteria,Bacteria
1,ncbi:2,Bacteria,bacteria
2,ncbi:2,Bacteria,eubacteria
3,ncbi:2,Bacteria,Monera
4,ncbi:2,Bacteria,Procaryotae


#### A concatenated version

In [85]:
len(list(prep_dbs.items()))

4

In [86]:
list(prep_dbs.items())[0][1]

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en
...,...,...,...,...
1319,PHYP64,Grapevine flavescence dorée phytoplasma,rougeau,fr
1320,PHYP64,Grapevine flavescence dorée phytoplasma,flavescencia dorada de la vid,es
1321,PHYP64,Grapevine flavescence dorée phytoplasma,flavescenza dorata de la vite,it
1322,PHYP64,Grapevine flavescence dorée phytoplasma,Фитоплазма золотистого пожелтения винограда,ru


In [87]:
conc_list = list(prep_dbs.values())
conc_list[-1] = conc_list[-1].head()

In [88]:
concat_df = pd.concat(
    conc_list,
    ignore_index=True)
concat_df.to_csv('concatenated_glossary.csv')
len(concat_df)

9551

In [89]:
concat_df.head()

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language,EFSA_PHT,re,match_language,NCBI_TAXID
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en,,,,
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr,,,,
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr,,,,
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es,,,,
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en,,,,


#### Create empty DF with col names

In [90]:
structured_db = pd.DataFrame(columns=anno_gloss_column_names)
display(structured_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language


df1 = pd.DataFrame([[3,4],[5,6]], columns=['a','b'])
df2 = pd.DataFrame([[5,6],[5,6]], columns=['a','c'])
display(df1)
pd.concat([df1,df2], ignore_index=True)

#### Add EPPO info to final DB

In [91]:
db_names = list(str(x) for x in prep_dbs.keys()) # names of EPPO databases

In [92]:
# copy into a new db to which eppo information will be added 
ncbi_eppo_db = structured_db.copy()
prep_ncbi_db = prep_dbs['ncbi']

## open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True)
display(prep_eppo_db.head())

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en


In [93]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    syn = str(prep_eppo_db['synonym'][prep_eppo_db_i]) 
    lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
    key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    prep_ncbi_syn_rows = prep_ncbi_db[prep_ncbi_db['synonym']==syn]
    prep_ncbi_sci_rows = prep_ncbi_db[(prep_ncbi_db['preferred_name']==sci_name) | (prep_ncbi_db['preferred_name']=="'"+sci_name+"'")]
    
    # rows in NCBI that have a matching synonym or scientific name
    prep_ncbi_matching_rows = pd.concat([prep_ncbi_syn_rows, prep_ncbi_sci_rows], ignore_index=True)
    
    taxid_col = (prep_ncbi_matching_rows['NCBI_TAXID'])
    sci_name_col = prep_ncbi_matching_rows['preferred_name']
    
    if len(prep_ncbi_matching_rows) != 0:
        taxid = taxid_col[0]
        sci_name = sci_name_col[0]
    else:
        taxid = ''
        sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    new_eppo_row = pd.DataFrame([[taxid, sci_name, syn, lang, key]], columns=['NCBI_TAXID', 'preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
    
    ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)

100%|██████████| 1988/1988 [13:55<00:00,  2.38it/s]


In [94]:
ncbi_eppo_db.to_csv('output.csv')

for i in range(10):
    print('i is:'+str(i))
    for j in range(10):
        print(j)
        if j%5 == 0 and j>0:
            print("break")
            break

#### Add EFSA info to final DB

In [95]:

# copy into a new db to which eppo information will be added 
final_db = ncbi_eppo_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True)
display(prep_efsa_db.head())
len(prep_efsa_db)

Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,


7558

In [96]:
match_found = False
## loop over res i n EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    for final_db_i in range(len(final_db)):
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        syn = final_db['synonym'][final_db_i]
        
        ## if re matches a syn, add: re, efsa key, language if None
        if re.match(str(pattern), str(syn)):
            final_db['re'][final_db_i] = prep_efsa_db['re'][prep_efsa_db_i]
            final_db['EFSA_PHT'][final_db_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            final_db['match_language'][final_db_i] = prep_efsa_db['match_language'][prep_efsa_db_i]
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        #syn = prep_efsa_db['synonym'][prep_efsa_db_i]
        key = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
        lang = prep_efsa_db['match_language'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, syn, lang, key]], columns=['re', 'synonym', 'match_language', 'EFSA_PHT'])
        
        final_db = pd.concat([final_db, new_efsa_row], ignore_index=True)
            
final_db.head()

100%|██████████| 7558/7558 [04:45<00:00, 26.50it/s]


Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,


#### add NCBI info in incomplete EFSA lines

In [122]:
## Iterate over final db lines 
for final_db_i in tqdm(range(len(final_db))):
    pattern = final_db.at[final_db_i,'re']
    # check if row has only efsa info
    if pattern and pd.isna(final_db.at[final_db_i,'NCBI_TAXID']):
        # iterate through NCBI to find an item that corresponds to pattern
        for prep_ncbi_db_i in range(len(prep_ncbi_db)):
            syn = prep_ncbi_db.at[prep_ncbi_db_i,'synonym']
            sci_name = prep_ncbi_db.at[prep_ncbi_db_i,'preferred_name']
            taxid = prep_ncbi_db.at[prep_ncbi_db_i,'NCBI_TAXID']
            if re.match(str(pattern), str(syn)):
                dbg(syn,'MATCH:'+pattern)
                final_db.at[final_db_i,'synonym'] = syn
            if re.match(str(pattern), str(syn)) or re.match(str(pattern), str(sci_name)):
                dbg(sci_name,'MATCH:'+pattern)
                final_db.at[final_db_i,'NCBI_TAXID'] = taxid
                final_db.at[final_db_i,'preferred_name'] = sci_name
                break
        print('checked all NCBI for '+pattern)
final_db.head()

 92%|█████████▏| 1989/2164 [00:40<00:03, 49.43it/s]

checked all NCBI for acalolepta sejuncta


 92%|█████████▏| 1990/2164 [01:21<00:08, 20.13it/s]

checked all NCBI for acalymma vittata


 92%|█████████▏| 1991/2164 [02:02<00:15, 10.87it/s]

checked all NCBI for acalymma vittatum


 92%|█████████▏| 1992/2164 [02:44<00:26,  6.56it/s]

checked all NCBI for chrysom\wle\w* ray\we du concombre


 92%|█████████▏| 1993/2164 [03:24<00:40,  4.22it/s]

checked all NCBI for cistela melanocephala


 92%|█████████▏| 1994/2164 [04:05<01:00,  2.81it/s]

checked all NCBI for cizgili hiyar bocegi


 92%|█████████▏| 1995/2164 [04:47<01:30,  1.87it/s]

checked all NCBI for crioceris vittata


 92%|█████████▏| 1996/2164 [05:34<02:16,  1.23it/s]

checked all NCBI for cryptocephalus americanus


 92%|█████████▏| 1997/2164 [06:16<03:13,  1.16s/it]

checked all NCBI for cryptocephalus stolatus


 92%|█████████▏| 1998/2164 [06:56<04:30,  1.63s/it]

checked all NCBI for diabrotica americana


 92%|█████████▏| 1999/2164 [07:38<06:19,  2.30s/it]

checked all NCBI for diabrotica melanocephala


 92%|█████████▏| 2000/2164 [08:20<08:49,  3.23s/it]

checked all NCBI for diabrotica vittata


 92%|█████████▏| 2001/2164 [09:01<12:08,  4.47s/it]

checked all NCBI for diabrotica vittatae


 93%|█████████▎| 2002/2164 [09:45<16:44,  6.20s/it]

checked all NCBI for escarabajo de las cucurbit\wceas


 93%|█████████▎| 2003/2164 [10:29<22:47,  8.49s/it]

checked all NCBI for galeruca cucumeris


 93%|█████████▎| 2004/2164 [11:09<29:09, 10.93s/it]

checked all NCBI for galeruca vittata


 93%|█████████▎| 2005/2164 [11:50<36:52, 13.92s/it]

checked all NCBI for galleruca americana


 93%|█████████▎| 2006/2164 [12:30<45:26, 17.26s/it]

checked all NCBI for galleruca pallipes


 93%|█████████▎| 2007/2164 [12:51<46:41, 17.85s/it]

[35mMATCH:striped cucumber beetle\w*[32msyn: [0mstriped cucumber beetle
[35mMATCH:striped cucumber beetle\w*[32msci_name: [0mAcalymma vittatum
checked all NCBI for striped cucumber beetle\w*


 93%|█████████▎| 2008/2164 [13:32<56:53, 21.88s/it]

checked all NCBI for tortuguilla\w* de las cucurbit\wceas


 93%|█████████▎| 2009/2164 [14:12<1:06:17, 25.66s/it]

checked all NCBI for acanthococcus pohutukawa


 93%|█████████▎| 2010/2164 [14:52<1:14:23, 28.99s/it]

checked all NCBI for acantholyda nipponica


 93%|█████████▎| 2011/2164 [15:32<1:20:08, 31.43s/it]

checked all NCBI for lyda nipponica


 93%|█████████▎| 2012/2164 [16:09<1:23:52, 33.11s/it]

checked all NCBI for acaphylla theae


 93%|█████████▎| 2013/2164 [16:48<1:26:42, 34.45s/it]

checked all NCBI for acarien couleur chair


 93%|█████████▎| 2014/2164 [17:26<1:28:38, 35.46s/it]

checked all NCBI for acarien rose


 93%|█████████▎| 2015/2164 [18:03<1:29:38, 36.10s/it]

checked all NCBI for pink mite


 93%|█████████▎| 2016/2164 [18:41<1:30:05, 36.52s/it]

checked all NCBI for pink tea mite


 93%|█████████▎| 2017/2164 [19:18<1:29:32, 36.54s/it]

checked all NCBI for rosafarbene teegallmilbe


 93%|█████████▎| 2018/2164 [19:55<1:29:57, 36.97s/it]

checked all NCBI for roze theemijt


 93%|█████████▎| 2019/2164 [20:33<1:29:59, 37.24s/it]

checked all NCBI for vleeschkleurige theemijt


 93%|█████████▎| 2020/2164 [21:11<1:29:38, 37.35s/it]

checked all NCBI for acarien du figuier


 93%|█████████▎| 2021/2164 [21:49<1:29:28, 37.54s/it]

checked all NCBI for eriof\wdeo\S*\s?da\S*\s?figueira


 93%|█████████▎| 2022/2164 [22:27<1:29:32, 37.83s/it]

checked all NCBI for feigengallmilbe


 93%|█████████▎| 2023/2164 [23:05<1:28:38, 37.72s/it]

checked all NCBI for fig blister mite


 94%|█████████▎| 2024/2164 [23:43<1:28:09, 37.78s/it]

checked all NCBI for fig mite


 94%|█████████▎| 2025/2164 [24:20<1:27:09, 37.62s/it]

checked all NCBI for fig rust mite


 94%|█████████▎| 2026/2164 [24:58<1:26:26, 37.58s/it]

checked all NCBI for aceria kuko


 94%|█████████▎| 2027/2164 [25:36<1:26:20, 37.81s/it]

checked all NCBI for goji gall mite\w*


 94%|█████████▎| 2028/2164 [26:16<1:27:07, 38.44s/it]

checked all NCBI for goji-gallmilbe


 94%|█████████▍| 2029/2164 [26:55<1:26:41, 38.53s/it]

checked all NCBI for vlnovn\wk kuko


 94%|█████████▍| 2030/2164 [27:33<1:26:17, 38.64s/it]

checked all NCBI for галообразуващ ериофиден акар


 94%|█████████▍| 2031/2164 [28:13<1:26:06, 38.84s/it]

checked all NCBI for \wcaro erinoso de lichi


 94%|█████████▍| 2032/2164 [28:55<1:27:40, 39.85s/it]

checked all NCBI for aceria litchii


 94%|█████████▍| 2033/2164 [29:35<1:27:14, 39.96s/it]

checked all NCBI for eriophyes litchii


 94%|█████████▍| 2034/2164 [30:16<1:27:01, 40.16s/it]

checked all NCBI for litchi mite\w*


 94%|█████████▍| 2035/2164 [30:55<1:25:51, 39.94s/it]

checked all NCBI for vlnovn\wk li\wiov\w


 94%|█████████▍| 2036/2164 [31:35<1:24:52, 39.79s/it]

checked all NCBI for escargot g\want africain


 94%|█████████▍| 2037/2164 [32:13<1:23:33, 39.47s/it]

checked all NCBI for acatina africana


 94%|█████████▍| 2038/2164 [32:49<1:20:33, 38.36s/it]

checked all NCBI for acatina dell' Isola maurizio


 94%|█████████▍| 2039/2164 [33:25<1:18:29, 37.67s/it]

checked all NCBI for achatina fulica


 94%|█████████▍| 2040/2164 [34:01<1:16:33, 37.04s/it]

checked all NCBI for achatine de madagascar


 94%|█████████▍| 2041/2164 [34:36<1:14:59, 36.58s/it]

checked all NCBI for achatine foulque


 94%|█████████▍| 2042/2164 [35:12<1:13:54, 36.35s/it]

checked all NCBI for achatine mauritanienne


 94%|█████████▍| 2043/2164 [35:48<1:12:49, 36.11s/it]

checked all NCBI for african giant snail


 94%|█████████▍| 2044/2164 [36:23<1:11:50, 35.92s/it]

checked all NCBI for afrikaanse reuzenslak


 95%|█████████▍| 2045/2164 [36:59<1:11:05, 35.85s/it]

checked all NCBI for afrikanische riesenschnecke


 95%|█████████▍| 2046/2164 [37:35<1:10:35, 35.89s/it]

checked all NCBI for agaatslak


 95%|█████████▍| 2047/2164 [38:10<1:09:49, 35.81s/it]

checked all NCBI for babosa gigante africana


 95%|█████████▍| 2048/2164 [38:47<1:09:23, 35.89s/it]

checked all NCBI for caracol gigante africano


 95%|█████████▍| 2049/2164 [39:22<1:08:45, 35.87s/it]

checked all NCBI for caramujo gigante


 95%|█████████▍| 2050/2164 [39:58<1:07:49, 35.69s/it]

checked all NCBI for escargot g\want africain


 95%|█████████▍| 2051/2164 [40:33<1:06:53, 35.52s/it]

checked all NCBI for giant african snail


 95%|█████████▍| 2052/2164 [41:08<1:06:11, 35.46s/it]

checked all NCBI for gran caracol africano


 95%|█████████▍| 2053/2164 [41:44<1:05:42, 35.52s/it]

checked all NCBI for große achatschnecke


 95%|█████████▍| 2054/2164 [42:19<1:05:06, 35.52s/it]

checked all NCBI for grote afrikaanse slak


 95%|█████████▍| 2055/2164 [42:55<1:04:28, 35.49s/it]

checked all NCBI for kalutara snail


 95%|█████████▌| 2056/2164 [43:31<1:04:06, 35.61s/it]

checked all NCBI for lissachatina fulica


 95%|█████████▌| 2057/2164 [44:06<1:03:27, 35.59s/it]

checked all NCBI for vanlig afrikansk jättelandsnäcka


 95%|█████████▌| 2058/2164 [44:41<1:02:40, 35.48s/it]

checked all NCBI for гигантская африканская улитка


 95%|█████████▌| 2059/2164 [45:17<1:02:12, 35.55s/it]

checked all NCBI for eurycreon rantalis


 95%|█████████▌| 2060/2164 [45:52<1:01:27, 35.46s/it]

checked all NCBI for garden webworm\w*


 95%|█████████▌| 2061/2164 [46:28<1:01:04, 35.58s/it]

checked all NCBI for loxostege rantalis


 95%|█████████▌| 2062/2164 [47:04<1:00:32, 35.61s/it]

checked all NCBI for phlyctaenodes rantalis


 95%|█████████▌| 2063/2164 [47:40<1:00:05, 35.69s/it]

checked all NCBI for pseudomonas avenae ssp citrulli


 95%|█████████▌| 2064/2164 [48:16<59:33, 35.73s/it]  

checked all NCBI for acidovorax avenae subsp. citrulli


 95%|█████████▌| 2065/2164 [48:51<58:54, 35.70s/it]

checked all NCBI for pseudomonas avenae subsp. citrulli


 95%|█████████▌| 2066/2164 [49:26<58:05, 35.56s/it]

checked all NCBI for pseudomonas pseudoalcaligenes subsp. citrulli


 96%|█████████▌| 2067/2164 [50:02<57:32, 35.59s/it]

checked all NCBI for pseudomonas sp. nrrl b\w12227


 96%|█████████▌| 2068/2164 [50:37<56:42, 35.44s/it]

checked all NCBI for pseudomonas sp. nrrlb12227


 96%|█████████▌| 2069/2164 [51:13<56:11, 35.49s/it]

checked all NCBI for acidivorax citrulli


 96%|█████████▌| 2070/2164 [51:48<55:28, 35.41s/it]

checked all NCBI for acidovorax avenae subsp\w* citrulli


 96%|█████████▌| 2071/2164 [52:23<54:53, 35.42s/it]

checked all NCBI for acidovorax citrulli


 96%|█████████▌| 2072/2164 [52:59<54:12, 35.36s/it]

checked all NCBI for atcc 29625


 96%|█████████▌| 2073/2164 [53:34<53:40, 35.39s/it]

checked all NCBI for ccug 17393


 96%|█████████▌| 2074/2164 [54:10<53:18, 35.53s/it]

checked all NCBI for cfbp 4459


 96%|█████████▌| 2075/2164 [54:45<52:37, 35.47s/it]

checked all NCBI for cip 106436


 96%|█████████▌| 2076/2164 [55:21<52:08, 35.55s/it]

checked all NCBI for ibsbf 1851


 96%|█████████▌| 2077/2164 [55:57<51:35, 35.58s/it]

checked all NCBI for icmp 7500


 96%|█████████▌| 2078/2164 [56:33<51:06, 35.66s/it]

checked all NCBI for icpb 30064


 96%|█████████▌| 2079/2164 [57:08<50:20, 35.53s/it]

checked all NCBI for lmg 5376


 96%|█████████▌| 2080/2164 [57:43<49:30, 35.36s/it]

checked all NCBI for mancha bacteriana de los frutos


 96%|█████████▌| 2081/2164 [58:19<49:06, 35.49s/it]

checked all NCBI for ncppb 3679


 96%|█████████▌| 2082/2164 [58:54<48:36, 35.56s/it]

checked all NCBI for pseudomonas avenae s\w* citrulli


 96%|█████████▋| 2083/2164 [59:29<47:50, 35.43s/it]

checked all NCBI for pseudomonas avenae subsp\w* citrulli


 96%|█████████▋| 2084/2164 [1:00:05<47:27, 35.59s/it]

checked all NCBI for pseudomonas pseudoalcaligenes citrulli


 96%|█████████▋| 2085/2164 [1:00:41<46:51, 35.59s/it]

checked all NCBI for pseudomonas pseudoalcaligenes subsp\w* citrulli


 96%|█████████▋| 2086/2164 [1:01:17<46:22, 35.67s/it]

checked all NCBI for pseudomonas sp\w* nrrl\S*\s?b\S*\s?12227


 96%|█████████▋| 2087/2164 [1:01:52<45:36, 35.53s/it]

checked all NCBI for strain fc\w247


 96%|█████████▋| 2088/2164 [1:02:28<45:08, 35.64s/it]

checked all NCBI for бактериални петна по динята


 97%|█████████▋| 2089/2164 [1:03:03<44:29, 35.60s/it]

checked all NCBI for Бактериальная пятнистость тыквенных культур


 97%|█████████▋| 2090/2164 [1:03:39<43:58, 35.65s/it]

checked all NCBI for aclerda berlesii


 97%|█████████▋| 2091/2164 [1:04:15<43:19, 35.61s/it]

checked all NCBI for a\w* gloveran\w*


 97%|█████████▋| 2092/2164 [1:04:51<42:48, 35.68s/it]

checked all NCBI for acleris gloverana


 97%|█████████▋| 2093/2164 [1:05:26<42:07, 35.60s/it]

checked all NCBI for oruga cabecinegra occidental de las yemas


 97%|█████████▋| 2094/2164 [1:06:01<41:24, 35.49s/it]

checked all NCBI for western black\wheaded bud worm\w*


 97%|█████████▋| 2095/2164 [1:06:37<40:45, 35.44s/it]

checked all NCBI for western blackheaded budworm\w*


 97%|█████████▋| 2096/2164 [1:07:12<40:05, 35.38s/it]

checked all NCBI for западная черноголовая листовертка


 97%|█████████▋| 2097/2164 [1:07:47<39:30, 35.39s/it]

checked all NCBI for acleris issikii


 97%|█████████▋| 2098/2164 [1:08:23<38:57, 35.42s/it]

checked all NCBI for a. minuta


 97%|█████████▋| 2099/2164 [1:08:58<38:16, 35.34s/it]

checked all NCBI for acalla minuta


 97%|█████████▋| 2100/2164 [1:09:33<37:45, 35.40s/it]

checked all NCBI for acleris minuta


 97%|█████████▋| 2101/2164 [1:10:09<37:05, 35.32s/it]

checked all NCBI for lesser apple\w* leaf\w* folder\w*


 97%|█████████▋| 2102/2164 [1:10:44<36:26, 35.26s/it]

checked all NCBI for peronea minuta


 97%|█████████▋| 2103/2164 [1:11:19<36:01, 35.43s/it]

checked all NCBI for tordeuse\w* \w t\wte jaune


 97%|█████████▋| 2104/2164 [1:11:55<35:22, 35.38s/it]

checked all NCBI for yellow\S*\s?headed cranberr\w* worm\w*


 97%|█████████▋| 2105/2164 [1:12:30<34:50, 35.44s/it]

checked all NCBI for yellow\S*\s?headed fire worm\w*


 97%|█████████▋| 2106/2164 [1:13:06<34:15, 35.43s/it]

checked all NCBI for acleris nishidai


 97%|█████████▋| 2107/2164 [1:13:42<33:47, 35.57s/it]

checked all NCBI for acleris nivisellana


 97%|█████████▋| 2108/2164 [1:14:17<33:11, 35.56s/it]

checked all NCBI for acleris robinsoniana


 97%|█████████▋| 2109/2164 [1:14:52<32:28, 35.43s/it]

checked all NCBI for acleris semipurpurana


 98%|█████████▊| 2110/2164 [1:15:28<31:52, 35.42s/it]

checked all NCBI for croesia semipurpurana


 98%|█████████▊| 2111/2164 [1:16:03<31:20, 35.48s/it]

checked all NCBI for oak leaf ti\w*


 98%|█████████▊| 2112/2164 [1:16:39<30:44, 35.47s/it]

checked all NCBI for oak leafshredd\w*


 98%|█████████▊| 2113/2164 [1:17:14<30:09, 35.48s/it]

checked all NCBI for oak leaft\w*


 98%|█████████▊| 2114/2164 [1:17:50<29:35, 35.50s/it]

checked all NCBI for acleris senescens


 98%|█████████▊| 2115/2164 [1:18:25<28:56, 35.45s/it]

checked all NCBI for a\w* variana


 98%|█████████▊| 2116/2164 [1:19:00<28:17, 35.37s/it]

checked all NCBI for acleris variana


 98%|█████████▊| 2117/2164 [1:19:36<27:40, 35.34s/it]

checked all NCBI for eastern black\S*\s?headed budworm\w*


 98%|█████████▊| 2118/2164 [1:20:11<27:11, 35.46s/it]

checked all NCBI for hemlock budworm\w*


 98%|█████████▊| 2119/2164 [1:20:47<26:36, 35.47s/it]

checked all NCBI for oruga cabecinegra de las yemas de la picea


 98%|█████████▊| 2120/2164 [1:21:22<26:00, 35.46s/it]

checked all NCBI for tordeuse\w* \w t\wte noire d\S*\s?\wpinette


 98%|█████████▊| 2121/2164 [1:21:58<25:23, 35.44s/it]

checked all NCBI for восточная черноголовая листовертка


 98%|█████████▊| 2122/2164 [1:22:34<24:54, 35.58s/it]

checked all NCBI for a citrina


 98%|█████████▊| 2123/2164 [1:23:10<24:24, 35.71s/it]

checked all NCBI for acrogonia citrina


 98%|█████████▊| 2124/2164 [1:23:45<23:50, 35.75s/it]

checked all NCBI for a bicolor


 98%|█████████▊| 2125/2164 [1:24:21<23:15, 35.79s/it]

checked all NCBI for a virescens


 98%|█████████▊| 2126/2164 [1:24:57<22:40, 35.80s/it]

checked all NCBI for acrogonia bicolor


 98%|█████████▊| 2127/2164 [1:25:32<21:59, 35.68s/it]

checked all NCBI for acrogonia virescens


 98%|█████████▊| 2128/2164 [1:26:08<21:22, 35.63s/it]

checked all NCBI for actinidia virus c


 98%|█████████▊| 2129/2164 [1:26:44<20:46, 35.60s/it]

checked all NCBI for \wcaro de la agalla de la fuchsia


 98%|█████████▊| 2130/2164 [1:27:19<20:05, 35.45s/it]

checked all NCBI for aculops fuchsiae


 98%|█████████▊| 2131/2164 [1:27:54<19:29, 35.44s/it]

checked all NCBI for brazilian fuchsia mite


 99%|█████████▊| 2132/2164 [1:28:30<18:56, 35.52s/it]

checked all NCBI for fuchsia\S*\s?galmijt


 99%|█████████▊| 2133/2164 [1:29:05<18:21, 35.53s/it]

checked all NCBI for fuchsia gall mite\w*


 99%|█████████▊| 2134/2164 [1:29:41<17:46, 35.56s/it]

checked all NCBI for fuchsia mite


 99%|█████████▊| 2135/2164 [1:30:17<17:11, 35.59s/it]

checked all NCBI for galle du fuchsia


 99%|█████████▊| 2136/2164 [1:30:52<16:35, 35.55s/it]

checked all NCBI for phytopte\w* d\w* fuchsia


 99%|█████████▉| 2137/2164 [1:31:28<16:00, 35.57s/it]

checked all NCBI for phytopte du fuchsia


 99%|█████████▉| 2138/2164 [1:32:03<15:23, 35.51s/it]

checked all NCBI for vlnovn\wk fuchsiov\w


 99%|█████████▉| 2139/2164 [1:32:39<14:50, 35.62s/it]

checked all NCBI for acute oak decline


 99%|█████████▉| 2140/2164 [1:33:14<14:14, 35.59s/it]

checked all NCBI for ad\wlgido lanudo del falso abeto


 99%|█████████▉| 2141/2164 [1:33:49<13:34, 35.42s/it]

checked all NCBI for adelges funitectus


 99%|█████████▉| 2142/2164 [1:34:25<13:02, 35.58s/it]

checked all NCBI for adelges tsugae


 99%|█████████▉| 2143/2164 [1:35:01<12:29, 35.69s/it]

checked all NCBI for aphrastasia funitecta


 99%|█████████▉| 2144/2164 [1:35:38<11:57, 35.86s/it]

checked all NCBI for chermes tsugae


 99%|█████████▉| 2145/2164 [1:36:13<11:18, 35.71s/it]

checked all NCBI for hemlock woolly aphid


 99%|█████████▉| 2146/2164 [1:36:49<10:42, 35.68s/it]

checked all NCBI for hemlock wooly adelgid


 99%|█████████▉| 2147/2164 [1:37:24<10:06, 35.68s/it]

checked all NCBI for hemlockstannen


 99%|█████████▉| 2148/2164 [1:38:00<09:28, 35.56s/it]

checked all NCBI for hemlockstannenlaus


 99%|█████████▉| 2149/2164 [1:38:35<08:53, 35.57s/it]

checked all NCBI for korovnice jedlovcov\w


 99%|█████████▉| 2150/2164 [1:39:11<08:17, 35.51s/it]

checked all NCBI for puceron lanig\wre de la pruche


 99%|█████████▉| 2151/2164 [1:39:46<07:43, 35.62s/it]

checked all NCBI for pulg\wn lan\wgero de la tsuga


 99%|█████████▉| 2152/2164 [1:40:22<07:05, 35.47s/it]

checked all NCBI for pulg\wn lan\wgero del abeto


 99%|█████████▉| 2153/2164 [1:40:58<06:34, 35.82s/it]

checked all NCBI for tannenlaus


100%|█████████▉| 2154/2164 [1:41:34<05:58, 35.84s/it]

checked all NCBI for aeolesthes sarta


100%|█████████▉| 2155/2164 [1:42:10<05:23, 35.90s/it]

checked all NCBI for city longhorn beetle


100%|█████████▉| 2156/2164 [1:42:46<04:46, 35.84s/it]

checked all NCBI for sart longhorn beetle


100%|█████████▉| 2157/2164 [1:43:21<04:09, 35.64s/it]

checked all NCBI for town longhorn beetle


100%|█████████▉| 2158/2164 [1:43:57<03:34, 35.78s/it]

checked all NCBI for uzbek longhorn beetle


100%|█████████▉| 2159/2164 [1:44:33<02:59, 35.80s/it]

checked all NCBI for agonoxena argaula


100%|█████████▉| 2160/2164 [1:45:09<02:22, 35.74s/it]

checked all NCBI for coconut flat moth\w*


100%|█████████▉| 2161/2164 [1:45:44<01:46, 35.66s/it]

checked all NCBI for coconut leaf miner\w*


100%|█████████▉| 2162/2164 [1:46:19<01:10, 35.48s/it]

checked all NCBI for papillon\w* des spathes du cocotier


100%|█████████▉| 2163/2164 [1:46:54<00:35, 35.38s/it]

checked all NCBI for polilla plana del coco


100%|██████████| 2164/2164 [1:47:29<00:00,  2.98s/it]

checked all NCBI for small coconut leaf moth\w*





Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
...,...,...,...,...,...,...,...,...
2159,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,coconut flat moth\w*,,
2160,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,coconut leaf miner\w*,,
2161,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,papillon\w* des spathes du cocotier,,
2162,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,polilla plana del coco,,


In [166]:
column_names = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID','name']
column_names = ['a','b','c','name']
df = pd.DataFrame([['x','y',None,'tizio'],[None,'y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
df_out = pd.DataFrame([['x','y','z','tizio'],['x','y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
print('INPUT:')
display(df)
print('The output I want:')
display(df_out)
print('Output of my attempt:')
df.groupby(['a','b','c'], as_index=False).agg({'a' : 'max', 'b' : 'max', 'c' : 'max'})

INPUT:


Unnamed: 0,a,b,c,name
0,x,y,,tizio
1,,y,z,caio
2,w,,,sempronio


The output I want:


Unnamed: 0,a,b,c,name
0,x,y,z,tizio
1,x,y,z,caio
2,w,,,sempronio


Output of my attempt:


Unnamed: 0,a,b,c


In [227]:
column_names = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID','name']
df_sample = pd.DataFrame([['x','y',None,'tizio'],[None,'y','z','caio'],['w',None,None,'sempronio']], columns=column_names)
#display(df_sample)
## Copy NCBI info to all EFSA terms representing the same entity (same EFSA PHT)

cc = ['EPPO_CODE','EFSA_PHT','NCBI_TAXID']
indices = list(range(len(cc)))

#for _ in indices:
for col_i in indices:
    empty_col_indices = indices.copy()
    empty_col_indices.remove(col_i)
    
    ref_col_name = cc[col_i]
    blank_col_names = [cc[x] for x in empty_col_indices] 
#dbg(ref_col_name, '\nNEW ITERATION: ')
    ## Iterate over final db lines 
    for final_db_i in tqdm(range(len(final_db))):
#print('extracting info from row:', final_db_i)
        ## check if row has a value in the reference column 
        ref_col_value = final_db.at[final_db_i,ref_col_name]
#dbg(ref_col_value)
        if ref_col_value:
            ## store id values from other columns
            id_values = {col_name:final_db.at[final_db_i,col_name] for col_name in blank_col_names}
#dbg(id_values, ref_col_value)
#print('all rows with',ref_col_value,'will be completed with',list(id_values.values()))
            for edit_db_i in (range(len(final_db))):
                for col_name in blank_col_names:
#dbg(col_name, 'column')
#dbg(edit_db_i, 'row')
                    if not final_db.at[edit_db_i,col_name] and final_db.at[edit_db_i,ref_col_name]==ref_col_value:
#print('EDITING')
#print(col_name,'none, replace with',id_values[col_name])
                        final_db.at[edit_db_i,col_name] = id_values[col_name]
final_db

100%|██████████| 2164/2164 [01:07<00:00, 31.89it/s]
100%|██████████| 2164/2164 [01:04<00:00, 33.73it/s]
100%|██████████| 2164/2164 [00:41<00:00, 51.81it/s] 


Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,ncbi:34020,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,ncbi:34020,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,ncbi:34020,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,ncbi:34020,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
...,...,...,...,...,...,...,...,...
2159,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,coconut flat moth\w*,,
2160,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,coconut leaf miner\w*,,
2161,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,papillon\w* des spathes du cocotier,,
2162,,AgonoxenaArgaula-PHT,,,Tomato leaf curl New Delhi begomovirus,polilla plana del coco,,


In [228]:
final_db.to_csv(LR_filepath)

In [229]:
sci_name = 'Ralstonia pseudosolanacearum'

prep_ncbi_db[(prep_ncbi_db['synonym']=='bacteria') | (prep_ncbi_db['synonym']=="'"+sci_name+"'")]

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
1,ncbi:2,Bacteria,bacteria
