#  Lexical Resource (LR) for Annotation

#### Imports

In [1]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

[nltk_data] Downloading package words to /home/elubrini/nltk_data...
[nltk_data]   Package words is already up-to-date!


#### Paths

In [2]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.xlsx'),
    eppo_sci = get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.xlsx'),
    efsa = get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('taxa+id_full.txt'),
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
annotation_path = os.path.join(data_path, 'LRs_for_annotation')
anno_gloss_path = os.path.join(data_path, 'annotation_glossary.csv')
LR_filepath = 'LR.csv'

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


#### Definitions

Defining functions for preprocessing of original data

In [3]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

#### Parameters

In [4]:
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','undefined','also_undefined']

## name of the desired coluimns in the LR
anno_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [5]:
## column in each of the original datasets corresponding to the output LR column 
cc = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                None, 'Unnamed: 3'],
    ncbi = [None, None, 'taxid',
            'sci_name', 'synonym', None,
            None, None],
    )

#### Check Data

In [6]:
for path in db_paths.values():
    display(describe_csv(path))
pd.read_csv(db_paths['ncbi'], names=ncbi_header, index_col=0)

Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
count,7558,7558,21,3
unique,1168,7393,5,3
top,Cronartium-PHT,margarodes,Au/Cabi,Fr
freq,85,3,16,1


Unnamed: 0.1,Unnamed: 0
count,100.0
mean,49.5
std,29.011492
min,0.0
25%,24.75
50%,49.5
75%,74.25
max,99.0


Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,undefined,also_undefined
,Bacteria,ncbi:2,Bacteria.1,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,Unnamed: 6,Unnamed: 7
0.0,bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NNS,superkingdom,,
1.0,eubacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NN,superkingdom,,
2.0,Monera,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
3.0,Procaryotae,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
...,...,...,...,...,...,...,...,...
95.0,Dictyoglomus Saiki et al. 1985,ncbi:13,Dictyoglomus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:68297/ncbi:203...,NP,genus,,
96.0,Dictyoglomus,ncbi:13,Dictyoglomus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:68297/ncbi:203...,NP,genus,,
97.0,Dictyoglomi,ncbi:13,Dictyoglomus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:68297/ncbi:203...,NP,genus,,
98.0,ATCC 35947,ncbi:14,Dictyoglomus thermophilum,/ncbi:1/ncbi:131567/ncbi:2/ncbi:68297/ncbi:203...,NP,species,ncbi:14,Dictyoglomus thermophilum


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

In [7]:
## Display examples of input datasets
display(pd.read_csv(db_paths['eppo_sci']).describe())
pd.read_csv(db_paths['eppo_com']).describe()

Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


In [8]:

db = dict() # create dictionary of databases

for db_name, columns in cc.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database in db dictionary
    cc_zip = zip(columns, anno_gloss_column_names)  # match source-db column names to final-db column names

    for orig_col, targ_col in cc_zip: # loop over matches

        if isinstance(orig_col, str): # if only column name specified, take column as is
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType): # else, if column must not to be selected, continue
            continue
        else:    # else, if a preprocessing function has been specified, use it when extracting column
            fun = orig_col[1]
            orig_col = orig_col[0]
        # sep = tab (for ncbi)
        if 'ncbi' in db_name:
            ds_col = pd.read_csv(db_paths['ncbi'], names=ncbi_header, index_col=0)[orig_col]
        else:    
            ds_col = pd.read_csv(db_paths[db_name])[orig_col] 
            
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = old_db col
    
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(annotation_path, db_name+'_table.csv'), index=False)


### (2) Join Datasets

In [9]:
prep_dbs = dict()
for db_name in cc.keys():
    prep_dbs[db_name] = pd.read_csv(os.path.join(annotation_path, db_name + '_table.csv'))

#### Base new DB off NCBI

In [10]:
base_db = prep_dbs['ncbi']

structured_db = base_db.copy()
for col_name in anno_gloss_column_names:
    if col_name not in structured_db.columns:
        structured_db[col_name] = [None]*len(structured_db) # create empty column with desired col name
display(structured_db.head())

Unnamed: 0,NCBI_TAXID,preferred_name,synonym,EPPO_CODE,EFSA_PHT,re,ds_language,match_language
0,ncbi:2,Bacteria.1,Bacteria,,,,,
1,ncbi:2,Bacteria,bacteria,,,,,
2,ncbi:2,Bacteria,eubacteria,,,,,
3,ncbi:2,Bacteria,Monera,,,,,
4,ncbi:2,Bacteria,Procaryotae,,,,,


df1 = pd.DataFrame([[3,4],[5,6]], columns=['a','b'])
df2 = pd.DataFrame([[5,6],[5,6]], columns=['a','c'])
display(df1)
pd.concat([df1,df2], ignore_index=True)

#### Add EPPO info to final DB

In [11]:
db_names = list(str(x) for x in prep_dbs.keys())

In [12]:
# copy into a new db to which eppo information will be added 
ncbi_eppo_db = structured_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True)
display(prep_eppo_db.head())

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en


In [13]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    for ncbi_eppo_db_i in range(len(ncbi_eppo_db)):
        match_found = False
        
        ## if EPPO syn == structured_db.syn: add language, and key to row
        if prep_eppo_db['synonym'][prep_eppo_db_i] == ncbi_eppo_db['synonym'][ncbi_eppo_db_i]:
            ncbi_eppo_db['ds_language'][ncbi_eppo_db_i] = prep_eppo_db['ds_language'][prep_eppo_db_i]
            ncbi_eppo_db['EPPO_CODE'][ncbi_eppo_db_i] = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
            match_found = True
        
    ## else, create new row with: sci_name, syn, lang, key
    if not match_found:
        sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
        syn = prep_eppo_db['synonym'][prep_eppo_db_i]
        lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
        key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
        
        new_eppo_row = pd.DataFrame([[sci_name, syn, lang, key]], columns=['preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
        
        ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)
    else:
        dbg(prep_eppo_db['preferred_name'][prep_eppo_db_i], len(ncbi_eppo_db))

## loop over sci_names in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    for ncbi_eppo_db_i in range(len(ncbi_eppo_db)):
        
        ## if EPPO sci_name == structured_db.syn: add language, and key
        if prep_eppo_db['synonym'][prep_eppo_db_i] == ncbi_eppo_db['preferred_name'][ncbi_eppo_db_i]:
            ncbi_eppo_db['ds_language'][ncbi_eppo_db_i] = prep_eppo_db['ds_language'][prep_eppo_db_i]
            ncbi_eppo_db['EPPO_CODE'][ncbi_eppo_db_i] = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    
            match_found = True
        
    ## else, create new row with: sci_name, syn, lang, key
    if not match_found and prep_eppo_db['preferred_name'][prep_eppo_db_i] not in ncbi_eppo_db['preferred_name']:
        syn = sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
        lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
        key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
        
        new_eppo_row = pd.DataFrame([[sci_name, syn, lang, key]], columns=['preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
        
        ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)

  0%|          | 0/1988 [00:00<?, ?it/s]  0%|          | 9/1988 [00:00<00:23, 84.52it/s]  1%|          | 18/1988 [00:00<00:24, 79.83it/s]  1%|▏         | 27/1988 [00:00<00:25, 77.35it/s]  2%|▏         | 35/1988 [00:00<00:26, 74.54it/s]  2%|▏         | 43/1988 [00:00<00:27, 72.00it/s]  3%|▎         | 51/1988 [00:00<00:27, 71.15it/s]  3%|▎         | 59/1988 [00:00<00:27, 69.08it/s]  3%|▎         | 66/1988 [00:00<00:28, 66.62it/s]  4%|▎         | 73/1988 [00:01<00:29, 65.09it/s]  4%|▍         | 80/1988 [00:01<00:29, 64.54it/s]  4%|▍         | 87/1988 [00:01<00:30, 62.09it/s]  5%|▍         | 94/1988 [00:01<00:32, 59.13it/s]  5%|▌         | 101/1988 [00:01<00:31, 60.43it/s]  5%|▌         | 108/1988 [00:01<00:30, 61.59it/s]  6%|▌         | 115/1988 [00:01<00:32, 58.32it/s]  6%|▌         | 122/1988 [00:01<00:31, 58.93it/s]  6%|▋         | 128/1988 [00:01<00:31, 58.18it/s]  7%|▋         | 134/1988 [00:02<00:36, 50.29it/s]  7%|▋         | 140/1988 [00:02<00:36, 49.97it/s]  7

[35m540[32msci_name: [0mBemisia tabaci


 23%|██▎       | 449/1988 [00:10<00:43, 35.75it/s] 23%|██▎       | 453/1988 [00:10<00:43, 35.40it/s] 23%|██▎       | 457/1988 [00:10<00:44, 34.67it/s] 23%|██▎       | 461/1988 [00:10<00:47, 32.48it/s] 23%|██▎       | 465/1988 [00:10<00:47, 32.11it/s] 24%|██▎       | 469/1988 [00:10<00:47, 31.96it/s] 24%|██▍       | 473/1988 [00:10<00:49, 30.81it/s] 24%|██▍       | 477/1988 [00:11<00:49, 30.35it/s] 24%|██▍       | 481/1988 [00:11<00:50, 29.68it/s] 24%|██▍       | 485/1988 [00:11<00:49, 30.38it/s] 25%|██▍       | 489/1988 [00:11<00:48, 30.82it/s] 25%|██▍       | 493/1988 [00:11<00:48, 30.91it/s] 25%|██▌       | 497/1988 [00:11<00:48, 30.85it/s] 25%|██▌       | 501/1988 [00:11<00:47, 31.05it/s] 25%|██▌       | 505/1988 [00:12<00:48, 30.73it/s] 26%|██▌       | 509/1988 [00:12<00:48, 30.43it/s] 26%|██▌       | 513/1988 [00:12<00:48, 30.21it/s] 26%|██▌       | 517/1988 [00:12<00:48, 30.06it/s] 26%|██▌       | 521/1988 [00:12<00:48, 30.20it/s] 26%|██▋       | 525/1988 [00:1

[35m640[32msci_name: [0mHelicoverpa zea


 28%|██▊       | 548/1988 [00:13<00:53, 26.92it/s] 28%|██▊       | 551/1988 [00:13<00:54, 26.52it/s] 28%|██▊       | 554/1988 [00:13<00:55, 26.04it/s] 28%|██▊       | 557/1988 [00:13<00:55, 25.68it/s] 28%|██▊       | 560/1988 [00:14<00:56, 25.50it/s] 28%|██▊       | 563/1988 [00:14<00:55, 25.48it/s] 28%|██▊       | 566/1988 [00:14<00:55, 25.46it/s] 29%|██▊       | 569/1988 [00:14<00:55, 25.43it/s] 29%|██▉       | 572/1988 [00:14<00:56, 24.99it/s] 29%|██▉       | 575/1988 [00:14<00:57, 24.73it/s] 29%|██▉       | 578/1988 [00:14<00:56, 24.84it/s] 29%|██▉       | 581/1988 [00:14<00:57, 24.64it/s] 29%|██▉       | 584/1988 [00:14<00:57, 24.54it/s] 30%|██▉       | 587/1988 [00:15<00:57, 24.46it/s] 30%|██▉       | 590/1988 [00:15<00:55, 25.17it/s] 30%|██▉       | 593/1988 [00:15<00:56, 24.50it/s]

[35m688[32msci_name: [0mMargarodes vitis


 30%|██▉       | 596/1988 [00:15<00:59, 23.46it/s] 30%|███       | 599/1988 [00:15<01:01, 22.55it/s] 30%|███       | 602/1988 [00:15<01:01, 22.45it/s] 30%|███       | 605/1988 [00:15<01:04, 21.39it/s] 31%|███       | 608/1988 [00:16<01:05, 21.07it/s] 31%|███       | 611/1988 [00:16<01:03, 21.52it/s] 31%|███       | 614/1988 [00:16<01:03, 21.58it/s] 31%|███       | 617/1988 [00:16<00:59, 22.98it/s] 31%|███       | 621/1988 [00:16<00:53, 25.50it/s]

[35m715[32msci_name: [0mNeoleucinodes elegantalis
[35m716[32msci_name: [0mNeoleucinodes elegantalis


 31%|███▏      | 624/1988 [00:16<00:52, 25.80it/s] 32%|███▏      | 627/1988 [00:16<00:52, 25.88it/s] 32%|███▏      | 630/1988 [00:16<00:52, 25.99it/s] 32%|███▏      | 633/1988 [00:17<00:51, 26.23it/s] 32%|███▏      | 636/1988 [00:17<00:51, 26.31it/s] 32%|███▏      | 639/1988 [00:17<00:51, 26.26it/s] 32%|███▏      | 642/1988 [00:17<00:51, 26.19it/s] 32%|███▏      | 645/1988 [00:17<00:51, 26.12it/s] 33%|███▎      | 648/1988 [00:17<00:51, 25.92it/s] 33%|███▎      | 651/1988 [00:17<00:51, 25.90it/s] 33%|███▎      | 654/1988 [00:17<00:51, 25.84it/s] 33%|███▎      | 657/1988 [00:17<00:51, 25.94it/s] 33%|███▎      | 660/1988 [00:18<00:51, 25.91it/s] 33%|███▎      | 664/1988 [00:18<00:48, 27.58it/s] 34%|███▎      | 668/1988 [00:18<00:44, 29.64it/s]

[35m757[32mmess: [0mPseudopityophthorus pruinosus
[35m761[32msci_name: [0mRhynchophorus palmarum


 34%|███▍      | 671/1988 [00:18<00:44, 29.63it/s] 34%|███▍      | 674/1988 [00:18<00:44, 29.44it/s] 34%|███▍      | 678/1988 [00:18<00:42, 30.90it/s] 34%|███▍      | 682/1988 [00:18<00:40, 31.98it/s]

[35m770[32msci_name: [0mRhynchophorus palmarum


 35%|███▍      | 686/1988 [00:18<00:40, 32.49it/s] 35%|███▍      | 690/1988 [00:18<00:39, 32.51it/s] 35%|███▍      | 694/1988 [00:19<00:39, 32.87it/s] 35%|███▌      | 698/1988 [00:19<00:38, 33.62it/s] 35%|███▌      | 702/1988 [00:19<00:37, 34.22it/s] 36%|███▌      | 706/1988 [00:19<00:36, 34.75it/s] 36%|███▌      | 710/1988 [00:19<00:36, 34.57it/s] 36%|███▌      | 714/1988 [00:19<00:36, 34.56it/s] 36%|███▌      | 718/1988 [00:19<00:36, 35.15it/s] 36%|███▋      | 722/1988 [00:19<00:35, 35.66it/s] 37%|███▋      | 726/1988 [00:20<00:35, 35.87it/s] 37%|███▋      | 730/1988 [00:20<00:35, 35.63it/s] 37%|███▋      | 734/1988 [00:20<00:35, 35.65it/s] 37%|███▋      | 738/1988 [00:20<00:36, 34.08it/s] 37%|███▋      | 742/1988 [00:20<00:41, 30.31it/s] 38%|███▊      | 746/1988 [00:20<00:44, 27.92it/s] 38%|███▊      | 749/1988 [00:20<00:46, 26.67it/s] 38%|███▊      | 752/1988 [00:20<00:52, 23.52it/s] 38%|███▊      | 755/1988 [00:21<00:51, 24.12it/s] 38%|███▊      | 759/1988 [00:2

[35m869[32msci_name: [0mAnastrepha fraterculus


 40%|███▉      | 787/1988 [00:21<00:34, 35.10it/s] 40%|███▉      | 791/1988 [00:22<00:34, 34.82it/s] 40%|███▉      | 795/1988 [00:22<00:34, 34.83it/s] 40%|████      | 799/1988 [00:22<00:33, 35.33it/s] 40%|████      | 803/1988 [00:22<00:32, 36.47it/s] 41%|████      | 807/1988 [00:22<00:32, 36.04it/s]

[35m894[32msci_name: [0mBactrocera dorsalis


 41%|████      | 811/1988 [00:22<00:32, 35.69it/s] 41%|████      | 815/1988 [00:22<00:33, 34.91it/s] 41%|████      | 819/1988 [00:22<00:33, 34.93it/s] 41%|████▏     | 823/1988 [00:23<00:32, 35.38it/s] 42%|████▏     | 827/1988 [00:23<00:32, 35.82it/s] 42%|████▏     | 831/1988 [00:23<00:32, 35.55it/s] 42%|████▏     | 835/1988 [00:23<00:32, 35.79it/s] 42%|████▏     | 839/1988 [00:23<00:32, 35.73it/s] 42%|████▏     | 843/1988 [00:23<00:32, 35.22it/s] 43%|████▎     | 847/1988 [00:23<00:32, 35.14it/s] 43%|████▎     | 851/1988 [00:23<00:32, 35.20it/s] 43%|████▎     | 855/1988 [00:23<00:32, 35.11it/s] 43%|████▎     | 859/1988 [00:24<00:32, 35.03it/s] 43%|████▎     | 863/1988 [00:24<00:32, 34.95it/s] 44%|████▎     | 867/1988 [00:24<00:32, 34.73it/s] 44%|████▍     | 871/1988 [00:24<00:32, 34.80it/s] 44%|████▍     | 875/1988 [00:24<00:32, 34.04it/s] 44%|████▍     | 879/1988 [00:24<00:32, 33.89it/s] 44%|████▍     | 883/1988 [00:24<00:32, 33.76it/s] 45%|████▍     | 887/1988 [00:2

[35m1063[32mmess: [0mCitrus leprosis virus C2
[35m1063[32mmess: [0mHibiscus green spot virus 2


 49%|████▉     | 983/1988 [00:27<00:31, 32.39it/s] 50%|████▉     | 987/1988 [00:28<00:30, 32.43it/s] 50%|████▉     | 991/1988 [00:28<00:30, 32.53it/s] 50%|█████     | 995/1988 [00:28<00:30, 32.98it/s] 50%|█████     | 999/1988 [00:28<00:29, 33.01it/s] 50%|█████     | 1003/1988 [00:28<00:29, 32.92it/s] 51%|█████     | 1007/1988 [00:28<00:29, 32.88it/s] 51%|█████     | 1011/1988 [00:28<00:29, 32.74it/s] 51%|█████     | 1015/1988 [00:28<00:29, 32.87it/s] 51%|█████▏    | 1019/1988 [00:29<00:29, 32.90it/s] 51%|█████▏    | 1023/1988 [00:29<00:29, 32.61it/s] 52%|█████▏    | 1027/1988 [00:29<00:29, 32.58it/s] 52%|█████▏    | 1031/1988 [00:29<00:29, 32.51it/s] 52%|█████▏    | 1035/1988 [00:29<00:29, 32.59it/s] 52%|█████▏    | 1039/1988 [00:29<00:29, 32.39it/s] 52%|█████▏    | 1043/1988 [00:29<00:29, 32.37it/s] 53%|█████▎    | 1047/1988 [00:29<00:29, 32.14it/s] 53%|█████▎    | 1051/1988 [00:30<00:29, 31.84it/s] 53%|█████▎    | 1055/1988 [00:30<00:30, 30.74it/s] 53%|█████▎    | 

[35m1322[32msci_name: [0mPopillia japonica


 62%|██████▏   | 1240/1988 [00:37<00:29, 25.50it/s] 63%|██████▎   | 1243/1988 [00:37<00:28, 25.85it/s] 63%|██████▎   | 1246/1988 [00:37<00:28, 26.38it/s] 63%|██████▎   | 1249/1988 [00:38<00:27, 26.52it/s] 63%|██████▎   | 1252/1988 [00:38<00:27, 27.08it/s] 63%|██████▎   | 1255/1988 [00:38<00:27, 26.72it/s]

[35m1338[32msci_name: [0mAphis citricidus


 63%|██████▎   | 1258/1988 [00:38<00:27, 26.39it/s] 63%|██████▎   | 1261/1988 [00:38<00:27, 26.05it/s] 64%|██████▎   | 1264/1988 [00:38<00:27, 26.35it/s] 64%|██████▎   | 1267/1988 [00:38<00:27, 26.50it/s] 64%|██████▍   | 1270/1988 [00:38<00:26, 26.66it/s] 64%|██████▍   | 1273/1988 [00:38<00:26, 26.74it/s] 64%|██████▍   | 1276/1988 [00:39<00:26, 27.33it/s] 64%|██████▍   | 1279/1988 [00:39<00:26, 27.23it/s]

[35m1361[32msci_name: [0mBursaphelenchus xylophilus


 64%|██████▍   | 1282/1988 [00:39<00:26, 26.45it/s] 65%|██████▍   | 1285/1988 [00:39<00:26, 26.07it/s] 65%|██████▍   | 1288/1988 [00:39<00:26, 26.25it/s] 65%|██████▍   | 1291/1988 [00:39<00:26, 26.35it/s] 65%|██████▌   | 1294/1988 [00:39<00:26, 26.33it/s] 65%|██████▌   | 1297/1988 [00:39<00:27, 25.45it/s] 65%|██████▌   | 1300/1988 [00:39<00:27, 25.25it/s] 66%|██████▌   | 1303/1988 [00:40<00:26, 25.68it/s] 66%|██████▌   | 1306/1988 [00:40<00:26, 25.87it/s] 66%|██████▌   | 1309/1988 [00:40<00:27, 24.54it/s] 66%|██████▌   | 1312/1988 [00:40<00:31, 21.67it/s] 66%|██████▌   | 1315/1988 [00:40<00:34, 19.54it/s] 66%|██████▋   | 1318/1988 [00:40<00:36, 18.27it/s] 66%|██████▋   | 1320/1988 [00:41<00:36, 18.13it/s] 66%|██████▋   | 1322/1988 [00:41<00:36, 18.44it/s] 67%|██████▋   | 1325/1988 [00:41<00:33, 20.09it/s] 67%|██████▋   | 1328/1988 [00:41<00:30, 21.72it/s] 67%|██████▋   | 1331/1988 [00:41<00:28, 22.96it/s] 67%|██████▋   | 1334/1988 [00:41<00:27, 23.86it/s] 67%|██████▋

#### Add EFSA info to final DB

In [14]:
# copy into a new db to which eppo information will be added 
final_db = ncbi_eppo_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True)
display(prep_efsa_db.head())

Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,


In [15]:
## loop over res i n EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    for final_db_i in range(len(final_db)):
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        syn = final_db['synonym'][final_db_i]
        
        ## if re matches a syn, add: re, efsa key, language if None
        if re.match(str(pattern), str(syn)):
            final_db['re'][final_db_i] = prep_efsa_db['re'][prep_efsa_db_i]
            final_db['EFSA_PHT'][final_db_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            final_db['match_language'][final_db_i] = prep_efsa_db['match_language'][prep_efsa_db_i]
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        #syn = prep_efsa_db['synonym'][prep_efsa_db_i]
        key = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
        lang = prep_efsa_db['match_language'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, syn, lang, key]], columns=['re', 'syn', 'match_language', 'EFSA_PHT'])
        
        final_db = pd.concat(final_db, new_efsa_row, ignore_index=True)
            
final_db.head()

  0%|          | 0/7558 [00:00<?, ?it/s]  0%|          | 1/7558 [00:00<16:07,  7.81it/s]  0%|          | 2/7558 [00:00<14:13,  8.85it/s]  0%|          | 3/7558 [00:00<13:46,  9.14it/s]  0%|          | 4/7558 [00:00<13:35,  9.26it/s]  0%|          | 5/7558 [00:00<13:35,  9.26it/s]  0%|          | 6/7558 [00:00<13:20,  9.43it/s]  0%|          | 7/7558 [00:00<13:37,  9.24it/s]  0%|          | 8/7558 [00:00<13:58,  9.00it/s]  0%|          | 9/7558 [00:01<14:15,  8.82it/s]  0%|          | 10/7558 [00:01<14:14,  8.83it/s]  0%|          | 11/7558 [00:01<14:40,  8.57it/s]  0%|          | 12/7558 [00:01<15:38,  8.04it/s]  0%|          | 13/7558 [00:01<15:57,  7.88it/s]  0%|          | 14/7558 [00:01<16:08,  7.79it/s]  0%|          | 15/7558 [00:01<16:34,  7.58it/s]  0%|          | 16/7558 [00:01<16:55,  7.43it/s]  0%|          | 17/7558 [00:02<17:04,  7.36it/s]  0%|          | 18/7558 [00:02<17:15,  7.28it/s]  0%|          | 19/7558 [00:02<17:29,  7.19it/s]  0%|          | 2

Unnamed: 0,NCBI_TAXID,preferred_name,synonym,EPPO_CODE,EFSA_PHT,re,ds_language,match_language
0,ncbi:2,Bacteria.1,Bacteria,,,,,
1,ncbi:2,Bacteria,bacteria,,,,,
2,ncbi:2,Bacteria,eubacteria,,,,,
3,ncbi:2,Bacteria,Monera,,,,,
4,ncbi:2,Bacteria,Procaryotae,,,,,


In [16]:
final_db.to_csv(LR_filepath)