#  Lexical Resource (LR) for Annotation

#### Imports

In [35]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

#### Paths

In [36]:
## input: one original database file per source
db_paths = {
    'eppo_com': get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.xlsx'),
    'eppo_sci': get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.xlsx'),
    'efsa': get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv'),
    'ncbi': get_datafile_path('taxa+id_full.txt'),
}

## input: glossaries
glossary_paths = {
    'efsa_glossary': get_datafile_path('efsa_glossary.csv'),
    'eppo_glossary': get_datafile_path('eppo_glossary.csv'),
}

## Output: everything except LR_filepath lives under the data directory
data_path = get_data_path()
full_eppo_path, annotation_path, anno_gloss_path = (
    os.path.join(data_path, leaf)
    for leaf in ('eppo_glossary.csv', 'LRs_for_annotation', 'annotation_glossary.csv')
)
LR_filepath = 'LR.csv'  # relative path: resolved against the current working directory

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


#### Definitions

Defining functions for preprocessing of original data

In [37]:
def PHT_to_lower_taxon(PHT_code):
    """Turn a CamelCase code such as ``'AcalymmaVittatum-PHT'`` into ``'acalymma vittatum'``.

    The last four characters of ``PHT_code`` are dropped unconditionally, a space is
    inserted in front of every capital letter that is not the first character, and the
    result is lower-cased.
    """
    stem = PHT_code[:-4]
    spaced = re.sub(r'(?<!^)(?=[A-Z])', ' ', stem)
    return spaced.lower()


# sanity check on a known code
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r"""Translate a keyword written with ``+``, ``_`` and ``%`` wildcards into a regex string.

    Rewrites applied, in this order:

    * ``+``  -> a single space (word separator)
    * ``_``  -> ``\w`` (exactly one word character)
    * a ``%`` at the very end, or directly before a space -> ``\w*`` (rest of the word)
    * any remaining ``%`` -> ``\S*\s?`` (rest of the token plus an optional whitespace)

    All other characters are copied through unchanged; in particular, regex
    metacharacters already present in ``syn`` are NOT escaped.

    Parameters
    ----------
    syn : str
        Keyword in the wildcard notation described above.

    Returns
    -------
    str
        The regular-expression source string.
    """
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    # The order matters: the two specific '%' cases must be consumed before the generic one.
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r


# The expected value is a raw string: '\w' inside a normal literal is an invalid escape sequence.
assert to_pattern('chrysom_le%+ray_e+du+concombre') == r'chrysom\wle\w* ray\we du concombre'

#### Parameters

In [38]:
## how the raw NCBI dump is parsed (see the "Check Data" cell) and how its normalised copy is named
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','undefined','also_undefined']
ncbi_ext = '.csv'

## swap the extension of the NCBI path for `ncbi_ext`, whatever the length of the original extension
db_paths['ncbi'] = os.path.splitext(db_paths['ncbi'])[0] + ncbi_ext

## names of the desired columns in the LR
anno_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [39]:
## For each original dataset: the source column feeding each LR column, listed in the same
## order as `anno_gloss_column_names`.
##   * None               -> the dataset has nothing for that LR column
##   * 'column name'      -> the column is copied as is
##   * ['column name', f] -> the column is copied after applying f to every value
cc = {
    'eppo_com': [
        'CodeEOPP', None, None,
        'PreferredName', 'CommonName', None,
        ['Language', get_lang_code], None,
    ],
    'eppo_sci': [
        'CodeEOPP', None, None,
        'PreferredName', 'OtherScientificNames', None,
        None, None,
    ],
    'efsa': [
        None, 'Category (pest name)', None,
        ['Category (pest name)', PHT_to_lower_taxon], None, ['Keywords', to_pattern],
        None, 'Unnamed: 3',
    ],
    'ncbi': [
        None, None, 'taxid',
        'sci_name', 'synonym', None,
        None, None,
    ],
}

#### Check Data

In [40]:
# Check that every input file parses into several columns. A file that parses into a single
# column is re-read with the NCBI separator and header, saved as .csv, and its entry in
# `db_paths` is pointed at the new file.
for db_name, path in list(db_paths.items()):  # snapshot: db_paths is updated inside the loop
    description = describe_csv(path)  # computed once per file instead of on every use
    dbg(len(description.columns))
    if len(description.columns) == 1:
        dbg(path)
        display(description)
        print('starting normalisation:')
        normalised_df = pd.read_csv(path, names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep).reset_index(level=0)
        print('normalised:')
        display(normalised_df.head())
        new_path = os.path.splitext(path)[0] + '.csv'
        normalised_df.to_csv(new_path, index=False)
        db_paths[db_name] = new_path
        # describe the normalised file, so the summary below is not the raw one shown again
        description = describe_csv(new_path)
    display(description)

[35m[32mmess: [0m5


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


[35m[32mmess: [0m5


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


[35m[32m_23: [0m4


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
count,7558,7558,21,3
unique,1168,7393,5,3
top,Cronartium-PHT,margarodes,Au/Cabi,Fr
freq,85,3,16,1


[35m[32mmess: [0m8


Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,undefined,also_undefined
count,1607725,1607726,1607726,1607726,1607726,1607726,1515574,1515573
unique,1474816,242820,242457,242820,3,40,180141,180140
top,mitosporic Ascomycota,ncbi:1888,Streptomyces albus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:1783272/ncbi:2...,NP,species,ncbi:11320,Influenza A virus
freq,48,532,532,532,1583049,1446117,4350,4350


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

In [41]:
## Display examples of input datasets: summary statistics of the two EPPO tables
eppo_sci_summary = pd.read_csv(db_paths['eppo_sci']).describe()
display(eppo_sci_summary)
pd.read_csv(db_paths['eppo_com']).describe()

Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


In [42]:

db = dict() # dictionary of preprocessed databases: db[db_name][target column] -> list of values

# the per-dataset tables are written here; make sure the directory exists before the first to_csv
os.makedirs(annotation_path, exist_ok=True)

for db_name, columns in cc.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database in db dictionary
    source_df = pd.read_csv(db_paths[db_name])  # read each file once, not once per column

    # match source-db column names to final-db column names
    for orig_col, targ_col in zip(columns, anno_gloss_column_names):

        if orig_col is None: # the dataset has nothing for this target column
            continue

        if isinstance(orig_col, str): # only a column name specified: take the column as is
            db[db_name][targ_col] = list(source_df[orig_col])
        else:    # [column name, function]: apply the function to every value of the column
            fun = orig_col[1]
            db[db_name][targ_col] = list(map(fun, source_df[orig_col[0]]))

    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(annotation_path, db_name+'_table.csv'), index=False)


### (2) Join Datasets

In [43]:
# reload every per-dataset table from the annotation directory, keyed by dataset name
prep_dbs = {
    db_name: pd.read_csv(os.path.join(annotation_path, f'{db_name}_table.csv'))
    for db_name in cc
}

#### A concatenated version

In [44]:
# number of preprocessed tables
len(prep_dbs)


4

In [45]:
# first preprocessed table, in insertion order
next(iter(prep_dbs.values()))

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en
...,...,...,...,...
1319,PHYP64,Grapevine flavescence dorée phytoplasma,rougeau,fr
1320,PHYP64,Grapevine flavescence dorée phytoplasma,flavescencia dorada de la vid,es
1321,PHYP64,Grapevine flavescence dorée phytoplasma,flavescenza dorata de la vite,it
1322,PHYP64,Grapevine flavescence dorée phytoplasma,Фитоплазма золотистого пожелтения винограда,ru


In [57]:
# every preprocessed table, with the last one cut down to its head()
*leading_tables, last_table = prep_dbs.values()
conc_list = [*leading_tables, last_table.head()]

In [59]:
# stack the tables on top of each other (columns a table lacks are filled with NaN) and save
concat_df = pd.concat(conc_list, ignore_index=True)
concat_df.to_csv('concatenated_glossary.csv')
concat_df.shape[0]

9551

In [60]:
# first rows of the concatenated table
concat_df.head(n=5)

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language,EFSA_PHT,re,match_language,NCBI_TAXID
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en,,,,
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr,,,,
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr,,,,
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es,,,,
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en,,,,


#### Base new DB off NCBI

In [None]:
# the preprocessed NCBI table is the base; work on a copy so prep_dbs['ncbi'] stays untouched
base_db = prep_dbs['ncbi']
structured_db = base_db.copy()

# add, as all-None columns, every LR column the base table does not have yet
missing_columns = [name for name in anno_gloss_column_names if name not in structured_db.columns]
for name in missing_columns:
    structured_db[name] = [None] * len(structured_db)
display(structured_db.head())

Unnamed: 0,NCBI_TAXID,preferred_name,synonym,EPPO_CODE,EFSA_PHT,re,ds_language,match_language
0,ncbi:2,Bacteria.1,Bacteria,,,,,
1,ncbi:2,Bacteria,bacteria,,,,,
2,ncbi:2,Bacteria,eubacteria,,,,,
3,ncbi:2,Bacteria,Monera,,,,,
4,ncbi:2,Bacteria,Procaryotae,,,,,


df1 = pd.DataFrame([[3,4],[5,6]], columns=['a','b'])
df2 = pd.DataFrame([[5,6],[5,6]], columns=['a','c'])
display(df1)
pd.concat([df1,df2], ignore_index=True)

#### Add EPPO info to final DB

In [None]:
# dataset names, as strings, in the order of prep_dbs
db_names = [str(name) for name in prep_dbs]

In [None]:
# copy into a new db to which eppo information will be added 
ncbi_eppo_db = structured_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True)
display(prep_eppo_db.head())

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en


In [None]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):    
    for ncbi_eppo_db_i in range(len(ncbi_eppo_db)):
        match_found = False
        
        ## if EPPO syn == structured_db.syn: add language, and key to row
        if prep_eppo_db['synonym'][prep_eppo_db_i] == ncbi_eppo_db['synonym'][ncbi_eppo_db_i]:
            ncbi_eppo_db['ds_language'][ncbi_eppo_db_i] = prep_eppo_db['ds_language'][prep_eppo_db_i]
            ncbi_eppo_db['EPPO_CODE'][ncbi_eppo_db_i] = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
            match_found = True
            break
    ## else, create new row with: sci_name, syn, lang, key
    if not match_found:
        sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
        syn = prep_eppo_db['synonym'][prep_eppo_db_i]
        lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
        key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
        
        new_eppo_row = pd.DataFrame([[sci_name, syn, lang, key]], columns=['preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
        
        ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)
    else:
        dbg(prep_eppo_db['preferred_name'][prep_eppo_db_i], len(ncbi_eppo_db))

## loop over sci_names in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    for ncbi_eppo_db_i in range(len(ncbi_eppo_db)):
        
        ## if EPPO sci_name == structured_db.syn: add language, and key
        if prep_eppo_db['synonym'][prep_eppo_db_i] == ncbi_eppo_db['preferred_name'][ncbi_eppo_db_i]:
            ncbi_eppo_db['ds_language'][ncbi_eppo_db_i] = prep_eppo_db['ds_language'][prep_eppo_db_i]
            ncbi_eppo_db['EPPO_CODE'][ncbi_eppo_db_i] = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    
            match_found = True
            break
    ## else, create new row with: sci_name, syn, lang, key
    if not match_found and prep_eppo_db['preferred_name'][prep_eppo_db_i] not in ncbi_eppo_db['preferred_name']:
        syn = sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
        lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
        key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
        
        new_eppo_row = pd.DataFrame([[sci_name, syn, lang, key]], columns=['preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
        
        ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)

for i in range(10):
    print('i is:'+str(i))
    for j in range(10):
        print(j)
        if j%5 == 0 and j>0:
            print("break")
            break

#### Add EFSA info to final DB

In [None]:
# copy into a new db to which eppo information will be added 
final_db = ncbi_eppo_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True)
display(prep_efsa_db.head())

Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,


In [None]:
## loop over res i n EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    for final_db_i in range(len(final_db)):
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        syn = final_db['synonym'][final_db_i]
        
        ## if re matches a syn, add: re, efsa key, language if None
        if re.match(str(pattern), str(syn)):
            final_db['re'][final_db_i] = prep_efsa_db['re'][prep_efsa_db_i]
            final_db['EFSA_PHT'][final_db_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            final_db['match_language'][final_db_i] = prep_efsa_db['match_language'][prep_efsa_db_i]
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        #syn = prep_efsa_db['synonym'][prep_efsa_db_i]
        key = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
        lang = prep_efsa_db['match_language'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, syn, lang, key]], columns=['re', 'syn', 'match_language', 'EFSA_PHT'])
        
        final_db = pd.concat(final_db, new_efsa_row, ignore_index=True)
            
final_db.head()

  0%|          | 0/7558 [00:00<?, ?it/s]  0%|          | 1/7558 [00:00<16:07,  7.81it/s]  0%|          | 2/7558 [00:00<14:13,  8.85it/s]  0%|          | 3/7558 [00:00<13:46,  9.14it/s]  0%|          | 4/7558 [00:00<13:35,  9.26it/s]  0%|          | 5/7558 [00:00<13:35,  9.26it/s]  0%|          | 6/7558 [00:00<13:20,  9.43it/s]  0%|          | 7/7558 [00:00<13:37,  9.24it/s]  0%|          | 8/7558 [00:00<13:58,  9.00it/s]  0%|          | 9/7558 [00:01<14:15,  8.82it/s]  0%|          | 10/7558 [00:01<14:14,  8.83it/s]  0%|          | 11/7558 [00:01<14:40,  8.57it/s]  0%|          | 12/7558 [00:01<15:38,  8.04it/s]  0%|          | 13/7558 [00:01<15:57,  7.88it/s]  0%|          | 14/7558 [00:01<16:08,  7.79it/s]  0%|          | 15/7558 [00:01<16:34,  7.58it/s]  0%|          | 16/7558 [00:01<16:55,  7.43it/s]  0%|          | 17/7558 [00:02<17:04,  7.36it/s]  0%|          | 18/7558 [00:02<17:15,  7.28it/s]  0%|          | 19/7558 [00:02<17:29,  7.19it/s]  0%|          | 2

Unnamed: 0,NCBI_TAXID,preferred_name,synonym,EPPO_CODE,EFSA_PHT,re,ds_language,match_language
0,ncbi:2,Bacteria.1,Bacteria,,,,,
1,ncbi:2,Bacteria,bacteria,,,,,
2,ncbi:2,Bacteria,eubacteria,,,,,
3,ncbi:2,Bacteria,Monera,,,,,
4,ncbi:2,Bacteria,Procaryotae,,,,,


In [None]:
# save the lexical resource; the row index is written too, as the first (unnamed) column
final_db.to_csv(path_or_buf=LR_filepath)