In [1]:
SAMPLE = False
SAMPLE_SUFFIX = '_dummy' # for input and output files when sample mode is active

#  Lexical Resource (LR) for output

#### Imports

In [2]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

[nltk_data] Downloading package words to /home/elubrini/nltk_data...
[nltk_data]   Package words is already up-to-date!


#### Paths

In [3]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.csv'),
    eppo_sci = get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.csv'),
    efsa = get_datafile_path('efsa_keyword_match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('original/taxa+id_full.txt'), # or taxa+id_microbes+insects.txt
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
output_path = os.path.join(data_path, 'output')
output_gloss_path = os.path.join(data_path, 'output_glossary.csv')
LR_filepath = 'LR.csv'

#### Definitions

Defining functions for preprocessing of original data

In [4]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

def fill_values_cross_db(main_df,
                        lookup_df,
                        matching_columns = ('',''),
                        lookup_col = ''
                        ):
    if lookup_col not in main_df.columns:
        main_df[lookup_col] = ['']*len(main_df)
     
    for row_i in range(len(main_df)):
        main_df_key = main_df.at[row_i,matching_columns[0]]
        for lookup_row_i in range(len(lookup_df)):
            lookup_df_key = lookup_df.at[lookup_row_i,matching_columns[1]]
            if len(main_df_key) and len(lookup_df_key):
                if main_df_key == lookup_df_key or re.fullmatch(main_df_key, lookup_df_key):
                    cell_value = lookup_df.at[lookup_row_i,lookup_col]
                    main_df.at[row_i,lookup_col] = cell_value
    
    return main_df

#### Parameters

In [5]:
## parameters to read ncbi file
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','_', '__']

## which ncbi file to use (specifying extension)
ncbi_ext = '.csv'
db_paths['ncbi'] = db_paths['ncbi'][:-(len(ncbi_ext))]+ncbi_ext

## use a shorter sample file for testing 
if SAMPLE:
    for k in db_paths.keys():
        db_paths[k] = db_paths[k][:-4]+SAMPLE_SUFFIX+db_paths[k][-4:]
    LR_filepath = LR_filepath[:-4]+SAMPLE_SUFFIX+LR_filepath[-4:]

## name of the desired coluimns in the output LR
output_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [6]:
## column in each of the original datasets corresponding to the output LR column 
db_names_to_relevant_col_names = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                None, 'Unnamed: 3'],
    ncbi = [None, None, 'taxid',
            'sci_name', 'synonym', None,
            None, None],
    )

#### Check Data

In [7]:
db_paths['eppo_com']

'/home/elubrini/GitHub/bio-corpus-translation/data/original/2022-09-02_COMMONnames_EPPO_OQ.csv'

In [8]:
for db_name, path in db_paths.items():
    dbg(len(describe_csv(path).columns))
    if (len(describe_csv(path).columns))<=2:
        dbg(path)
        display(describe_csv(path))
        print('starting csv format standardisation:')
        normalised_df = pd.read_csv(path, names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep, keep_default_na=False).reset_index(level=0)
        print('standardised:')
        display(normalised_df.head())
        new_path = path[:-4]+'.csv'
        
        normalised_df.to_csv(new_path, index=False,)
        db_paths[db_name] = new_path
    display(pd.read_csv(path, keep_default_na=False).head())

[35m[32mmess: [0m5


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
0,greening of citrus,English,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
1,greening des agrumes,French,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
2,virescence des agrumes,French,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
3,enverdecimiento de los cítricos,Spanish,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
4,Brazilian citrus greening,English,LIBEAM,'Candidatus Liberibacter americanus',"Teixeira, Saillard, Eveillard, Danet, da Costa..."


[35m[32mmess: [0m5


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
0,Citrus greening bacterium (heat-sensitive strain),,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
1,Liberibacter africanum,,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
2,Liberibacter africanus,,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
3,Liberibacter americanus,,LIBEAM,'Candidatus Liberibacter americanus',"Teixeira, Saillard, Eveillard, Danet, da Costa..."
4,Citrus greening bacterium (heat-tolerant strain),,LIBEAS,'Candidatus Liberibacter asiaticus',"Jagoueix, Bové & Garnier"


[35m[32mmess: [0m4


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
0,AcaloleptaSejuncta-PHT,acalolepta+sejuncta,Au/Cabi,
1,AcalymmaVittatum-PHT,acalymma+vittata,Au/Cabi,
2,AcalymmaVittatum-PHT,acalymma+vittatum,Au/Cabi,
3,AcalymmaVittatum-PHT,chrysom_le%+ray_e+du+concombre,EOL/Cabi,Fr
4,AcalymmaVittatum-PHT,cistela+melanocephala,Au/Cabi,


[35m[32mmess: [0m8


Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,undefined,also_undefined
0,Bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
1,bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NNS,superkingdom,,
2,eubacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NN,superkingdom,,
3,Monera,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
4,Procaryotae,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

#### create dictionary of databases
Each database in the dictionary corresponds to one of the original databases, but containing only the relevant columns, renamed to match the output datatabase

In [9]:
db = dict() 

for db_name, columns in db_names_to_relevant_col_names.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database (in dict of columns format) in db dictionary
    colum_renames_zip = zip(columns, output_gloss_column_names)  # match source-db column names to final-db column names

    ## iterate through columns in source dbs, preprocess them and use them to populate corresponding preprocessed database
    for orig_col, targ_col in colum_renames_zip: # loop over column renaming pairs
        if isinstance(orig_col, str): # if only column name specified (as opposed to col name + function), take column as is
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType): # else, if column must not be selected, continue
            continue
        else:    # else, if a preprocessing function has been specified, together with column name, use it when extracting column
            fun = orig_col[1]
            orig_col = orig_col[0]
            
        # adjust reading parameters to ncbi format, if needed 
        try:
            ds_col = pd.read_csv(db_paths[db_name], on_bad_lines='skip', keep_default_na=False)[orig_col]
        except:
            ds_col = pd.read_csv(db_paths[db_name], names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep, keep_default_na=False).reset_index(level=0)[orig_col]
        
        ## add column to the corresponding preprocessed db
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = preprocessed old_db col
    
    ## save each db
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(output_path, db_name+'_table.csv'), index=False)


In [10]:
## Load previously saved datasets 
prep_dbs = dict()

for db_name in db_names_to_relevant_col_names.keys():
    prep_dbs[db_name] = pd.read_csv(os.path.join(output_path, db_name + '_table.csv'), on_bad_lines='skip', keep_default_na=False)

### (2) Link prep dataset terms to NCBI entities

In [11]:
db_names = list(str(x) for x in prep_dbs.keys()) # names of EPPO databases

### (3) Join Preprocessed Datasets

In [12]:
## prep ncbi db entities will be used to complete other database rows 
prep_ncbi_db = prep_dbs['ncbi']

In [13]:
## [EPPO-ONLY STEP] open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True).fillna('')
display(prep_eppo_db.head())
len(prep_eppo_db)

## concatenate all dbs with matching name (normally only one in the case of EFSA)
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True).fillna('')
display(prep_efsa_db.head())
len(prep_efsa_db)

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en


Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,


7558

In [14]:
ncbi_enriched_dbs = dict(
    efsa = fill_values_cross_db(prep_efsa_db,
                                    prep_ncbi_db,
                                    matching_columns=('re','synonym'),
                                    lookup_col='NCBI_TAXID'
                                    ),

    eppo = fill_values_cross_db(prep_eppo_db,
                                    prep_ncbi_db,
                                    matching_columns=('synonym','synonym'),
                                    lookup_col='NCBI_TAXID'
                                    )
)

In [15]:
for db_name,db in ncbi_enriched_dbs.items():
    db.to_csv(os.path.join(output_path, db_name+'_ncbi_prep.csv'), index=False)
    #prep_dbs[db_name] = db
    display(db)

Unnamed: 0,EFSA_PHT,preferred_name,re,match_language,NCBI_TAXID
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr,
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,,
...,...,...,...,...,...
7553,ZeugodacusDepressus-PHT,zeugodacus depressus,paradacus depressus,,
7554,ZeugodacusDepressus-PHT,zeugodacus depressus,pumkin fruit fly,,
7555,ZeugodacusDepressus-PHT,zeugodacus depressus,zeugodacus depressus,,
7556,ZeugodacusMadhpuri-PHT,zeugodacus madhpuri,zeugodacus madhpuri,,


Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language,NCBI_TAXID
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en,
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr,
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr,
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es,
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en,
...,...,...,...,...,...
1983,PHYP64,Grapevine flavescence dorée phytoplasma,,,
1984,TOLCND,Tomato leaf curl New Delhi virus,BGYVV,,
1985,TOLCND,Tomato leaf curl New Delhi virus,Bitter gourd yellow vein virus,,
1986,TOLCND,Tomato leaf curl New Delhi virus,ToLCNDV,,


In [16]:
## check they loaded correctly (e.g. ncbi)
prep_dbs['ncbi'].head()

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
0,ncbi:2,Bacteria,Bacteria
1,ncbi:2,Bacteria,bacteria
2,ncbi:2,Bacteria,eubacteria
3,ncbi:2,Bacteria,Monera
4,ncbi:2,Bacteria,Procaryotae


#### Create empty DF with col names

In [17]:
empty_db = pd.DataFrame(columns=output_gloss_column_names)
display(empty_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language


#### Add EPPO info to final DB

In [18]:
prep_eppo_db = ncbi_enriched_dbs['eppo']

In [19]:
# copy into a new db to which eppo information will be added 
eppo_final_db = empty_db.copy()

In [20]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    syn = str(prep_eppo_db['synonym'][prep_eppo_db_i]) 
    lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
    key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    taxid = prep_eppo_db['NCBI_TAXID'][prep_eppo_db_i]
    
    ## add collected info from NCBI and add it to row
    new_eppo_row = pd.DataFrame([[taxid, sci_name, syn, lang, key]], columns=['NCBI_TAXID', 'preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
    
    ## add row to 
    eppo_final_db = pd.concat([eppo_final_db, new_eppo_row], ignore_index=True).fillna('')

100%|██████████| 1988/1988 [00:08<00:00, 224.50it/s]


In [21]:
eppo_final_db

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,,,'Candidatus Liberibacter africanus',greening of citrus,,en,
1,LIBEAF,,,'Candidatus Liberibacter africanus',greening des agrumes,,fr,
2,LIBEAF,,,'Candidatus Liberibacter africanus',virescence des agrumes,,fr,
3,LIBEAF,,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
...,...,...,...,...,...,...,...,...
1983,PHYP64,,,Grapevine flavescence dorée phytoplasma,,,,
1984,TOLCND,,,Tomato leaf curl New Delhi virus,BGYVV,,,
1985,TOLCND,,,Tomato leaf curl New Delhi virus,Bitter gourd yellow vein virus,,,
1986,TOLCND,,,Tomato leaf curl New Delhi virus,ToLCNDV,,,


In [22]:
eppo_final_db.to_csv('output.csv')

#### Add EFSA info to final DB

EFSA is made of reg expression, while EPPO is made of synonyms.
In orfer to add EFSA information, to the database that is already populated with EPPO data, the regexes will be applied to the synonyms. If a match is found, the EFSA data will complete the previously EPPO-only row, else a new row will be created.

In [23]:
# copy into a new db to which eppo information will be added 
eppo_efsa_final_db = eppo_final_db.copy()
display(eppo_efsa_final_db)

prep_efsa_db = ncbi_enriched_dbs['efsa']
display(prep_efsa_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,,,'Candidatus Liberibacter africanus',greening of citrus,,en,
1,LIBEAF,,,'Candidatus Liberibacter africanus',greening des agrumes,,fr,
2,LIBEAF,,,'Candidatus Liberibacter africanus',virescence des agrumes,,fr,
3,LIBEAF,,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
...,...,...,...,...,...,...,...,...
1983,PHYP64,,,Grapevine flavescence dorée phytoplasma,,,,
1984,TOLCND,,,Tomato leaf curl New Delhi virus,BGYVV,,,
1985,TOLCND,,,Tomato leaf curl New Delhi virus,Bitter gourd yellow vein virus,,,
1986,TOLCND,,,Tomato leaf curl New Delhi virus,ToLCNDV,,,


Unnamed: 0,EFSA_PHT,preferred_name,re,match_language,NCBI_TAXID
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr,
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,,
...,...,...,...,...,...
7553,ZeugodacusDepressus-PHT,zeugodacus depressus,paradacus depressus,,
7554,ZeugodacusDepressus-PHT,zeugodacus depressus,pumkin fruit fly,,
7555,ZeugodacusDepressus-PHT,zeugodacus depressus,zeugodacus depressus,,
7556,ZeugodacusMadhpuri-PHT,zeugodacus madhpuri,zeugodacus madhpuri,,


In [24]:
prep_efsa_db.keys()

Index(['EFSA_PHT', 'preferred_name', 're', 'match_language', 'NCBI_TAXID'], dtype='object')

In [25]:
## loop over regexes in EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    match_found = False
    
    pattern = prep_efsa_db['re'][prep_efsa_db_i]
    taxid = prep_efsa_db['NCBI_TAXID'][prep_efsa_db_i]
    efsa_pht = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
    match_lang =  prep_efsa_db['match_language'][prep_efsa_db_i]
    #synonym = prep_efsa_db['synonym'][prep_efsa_db_i]
    
    ## iterate over final db rows
    for eppo_efsa_final_db_i in range(len(eppo_efsa_final_db)):
        # info to be matched
        eppo_synonym = eppo_efsa_final_db['synonym'][eppo_efsa_final_db_i]
        eppo_taxid = eppo_efsa_final_db['NCBI_TAXID'][eppo_efsa_final_db_i]
        eppo_sci_name = eppo_efsa_final_db['preferred_name'][eppo_efsa_final_db_i]
        
        ## matching conditions
        #match_1 = eppo_synonym and eppo_synonym == synonym
        match_2= re.fullmatch(str(pattern), str(eppo_synonym))
        match_3 = eppo_taxid and taxid == eppo_taxid
    
        ## if re matches, add: re, efsa key, language, taxid
        sci_name = prep_efsa_db['preferred_name'][prep_efsa_db_i]
        
        ## check it's the same entity first      
        #if eppo_sci_name == sci_name:
        if match_2 or match_3:
            
            # assign info to matching row
            eppo_efsa_final_db['re'][eppo_efsa_final_db_i] = pattern
            eppo_efsa_final_db['EFSA_PHT'][eppo_efsa_final_db_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            eppo_efsa_final_db['match_language'][eppo_efsa_final_db_i] = match_lang
            
            eppo_efsa_final_db['NCBI_TAXID'][eppo_efsa_final_db_i] = max(taxid, eppo_taxid)
            eppo_efsa_final_db['synonym'][eppo_efsa_final_db_i] = eppo_synonym
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language, taxid
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, match_lang, efsa_pht, taxid]], columns=['re', 'match_language', 'EFSA_PHT', 'NCBI_TAXID'])
        eppo_efsa_final_db = pd.concat([eppo_efsa_final_db, new_efsa_row], ignore_index=True).fillna('')

eppo_efsa_final_db.head(10)

100%|██████████| 7558/7558 [23:57<00:00,  5.26it/s]


Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
5,LIBEAS,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter asiaticus',blotchy mottle disease of citrus,blotchy mottle disease of citrus,en,
6,LIBEAS,,,'Candidatus Liberibacter asiaticus',citrus dieback,,en,
7,LIBEAS,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter asiaticus',decline of citrus,decline of citrus,en,
8,LIBEAS,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter asiaticus',greening of citrus,greening of citrus,en,
9,LIBEAS,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter asiaticus',huanglongbing,huanglongbin\w*,en,


#### Copy complementary info to all terms representing the same entity

In [26]:
crossed_final_db = eppo_efsa_final_db.copy()

In [27]:
entity_spec_cols = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', 'preferred_name']

for col_i in tqdm(range(len(entity_spec_cols))):
    matching_col = entity_spec_cols[col_i]
    filling_cols = entity_spec_cols.copy()
    filling_cols.remove(matching_col)
    
    for i in range(len(crossed_final_db)): # row to be filled
        for j in range(len(crossed_final_db)): # row to take complementary info
            if i == j:
                break
            
            ## if matching values on matching_col (and not empty)
            if crossed_final_db.at[i,matching_col] and crossed_final_db.at[i,matching_col] == crossed_final_db.at[j,matching_col]:
                
                ## fill with values from complementary row
                for filling_col in filling_cols:
                    crossed_final_db.at[i,filling_col] = max(
                        crossed_final_db.at[i,filling_col],
                        crossed_final_db.at[j,filling_col]
                    )
display(crossed_final_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,
...,...,...,...,...,...,...,...,...
8667,,ZeugodacusDepressus-PHT,,,,paradacus depressus,,
8668,,ZeugodacusDepressus-PHT,,,,pumkin fruit fly,,
8669,,ZeugodacusDepressus-PHT,,,,zeugodacus depressus,,
8670,,ZeugodacusMadhpuri-PHT,,,,zeugodacus madhpuri,,


In [28]:
crossed_final_db.to_csv(LR_filepath)