In [1]:
SAMPLE = False

#  Lexical Resource (LR) for Annotation

#### Imports

In [2]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

[nltk_data] Downloading package words to /home/elubrini/nltk_data...
[nltk_data]   Package words is already up-to-date!


#### Paths

In [3]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('original/2022-09-02_COMMONnames_EPPO_OQ.csv'),
    eppo_sci = get_datafile_path('original/2022-09-02_SCIENTIFICnames_EPPO_OQ.csv'),
    efsa = get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('taxa+id_full.txt'),
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
annotation_path = os.path.join(data_path, 'LRs_for_annotation')
anno_gloss_path = os.path.join(data_path, 'annotation_glossary.csv')
LR_filepath = 'LR.csv'

#### Definitions

Defining functions for preprocessing of original data

In [4]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

#### Parameters

In [5]:
ncbi_sep = '\t'
ncbi_header = ['synonym','taxid','sci_name','tax_path','POS','rank','undefined','also_undefined']
ncbi_ext = '.csv'

db_paths['ncbi'] = db_paths['ncbi'][:-4]+ncbi_ext
if SAMPLE:
    for k in db_paths.keys():
        db_paths[k] = db_paths[k][:-4]+' copy'+db_paths[k][-4:]
    LR_filepath = LR_filepath[:-4]+'_sample'+LR_filepath[-4:]

## name of the desired coluimns in the LR
anno_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [6]:
## column in each of the original datasets corresponding to the output LR column 
cc = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                None, 'Unnamed: 3'],
    ncbi = [None, None, 'taxid',
            'sci_name', 'synonym', None,
            None, None],
    )

#### Check Data

In [7]:
for db_name, path in db_paths.items():
    dbg(len(describe_csv(path).columns))
    if (len(describe_csv(path).columns))==1:
        dbg(path)
        display(describe_csv(path))
        print('starting normalisation:')
        normalised_df = pd.read_csv(path, names=ncbi_header, index_col=0, on_bad_lines='skip', sep=ncbi_sep).reset_index(level=0)
        print('normalised:')
        display(normalised_df.head())
        new_path = path[:-4]+'.csv'
        normalised_df.to_csv(new_path, index=False)
        db_paths[db_name] = new_path
    display(describe_csv(path))

[35m[32mmess: [0m5


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


[35m[32mmess: [0m5


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


[35m[32mmess: [0m4


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
count,7558,7558,21,3
unique,1168,7393,5,3
top,Cronartium-PHT,margarodes,Au/Cabi,Fr
freq,85,3,16,1


[35m[32mmess: [0m8


Unnamed: 0,synonym,taxid,sci_name,tax_path,POS,rank,undefined,also_undefined
count,1607725,1607726,1607726,1607726,1607726,1607726,1515574,1515573
unique,1474816,242820,242457,242820,3,40,180141,180140
top,mitosporic Ascomycota,ncbi:1888,Streptomyces albus,/ncbi:1/ncbi:131567/ncbi:2/ncbi:1783272/ncbi:2...,NP,species,ncbi:11320,Influenza A virus
freq,48,532,532,532,1583049,1446117,4350,4350


# Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

## Display examples of input datasets
display(pd.read_csv(db_paths['eppo_sci']).describe())
pd.read_csv(db_paths['eppo_com']).describe()

In [7]:

db = dict() # create dictionary of databases

for db_name, columns in cc.items(): # loop over db names and respective relevant columns
    db[db_name] = dict()    # new database in db dictionary
    cc_zip = zip(columns, anno_gloss_column_names)  # match source-db column names to final-db column names

    for orig_col, targ_col in cc_zip: # loop over matches

        if isinstance(orig_col, str): # if only column name specified, take column as is
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType): # else, if column must not to be selected, continue
            continue
        else:    # else, if a preprocessing function has been specified, use it when extracting column
            fun = orig_col[1]
            orig_col = orig_col[0]
        # sep = tab (for ncbi)
        ds_col = pd.read_csv(db_paths[db_name])[orig_col] 
            
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = old_db col
    
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(annotation_path, db_name+'_table.csv'), index=False)


### (2) Join Datasets

In [8]:
prep_dbs = dict()
for db_name in cc.keys():
    prep_dbs[db_name] = pd.read_csv(os.path.join(annotation_path, db_name + '_table.csv'))

#### A concatenated version

In [9]:
len(list(prep_dbs.items()))

4

In [10]:
list(prep_dbs.items())[0][1]

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en
...,...,...,...,...
1319,PHYP64,Grapevine flavescence dorée phytoplasma,rougeau,fr
1320,PHYP64,Grapevine flavescence dorée phytoplasma,flavescencia dorada de la vid,es
1321,PHYP64,Grapevine flavescence dorée phytoplasma,flavescenza dorata de la vite,it
1322,PHYP64,Grapevine flavescence dorée phytoplasma,Фитоплазма золотистого пожелтения винограда,ru


In [11]:
conc_list = list(prep_dbs.values())
conc_list[-1] = conc_list[-1].head()

In [12]:
concat_df = pd.concat(
    conc_list,
    ignore_index=True)
concat_df.to_csv('concatenated_glossary.csv')
len(concat_df)

9551

In [13]:
concat_df.head()

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language,EFSA_PHT,re,match_language,NCBI_TAXID
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en,,,,
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr,,,,
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr,,,,
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es,,,,
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en,,,,


#### Create empty DF with col names

In [14]:
structured_db = pd.DataFrame(columns=anno_gloss_column_names)
display(structured_db)

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language


df1 = pd.DataFrame([[3,4],[5,6]], columns=['a','b'])
df2 = pd.DataFrame([[5,6],[5,6]], columns=['a','c'])
display(df1)
pd.concat([df1,df2], ignore_index=True)

#### Add EPPO info to final DB

In [15]:
db_names = list(str(x) for x in prep_dbs.keys()) # names of EPPO databases

In [16]:
# copy into a new db to which eppo information will be added 
ncbi_eppo_db = structured_db.copy()
prep_ncbi_db = prep_dbs['ncbi']

## open both eppo (sci and com) dbs and concatenate them
prep_eppo_dbs = [prep_dbs[db_name] for db_name in db_names if 'eppo' in db_name]
prep_eppo_db = pd.concat(prep_eppo_dbs, ignore_index=True)
display(prep_eppo_db.head())

Unnamed: 0,EPPO_CODE,preferred_name,synonym,ds_language
0,LIBEAF,'Candidatus Liberibacter africanus',greening of citrus,en
1,LIBEAF,'Candidatus Liberibacter africanus',greening des agrumes,fr
2,LIBEAF,'Candidatus Liberibacter africanus',virescence des agrumes,fr
3,LIBEAF,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,es
4,LIBEAM,'Candidatus Liberibacter americanus',Brazilian citrus greening,en


In [17]:
## loop over syns in EPPO db
for prep_eppo_db_i in tqdm(range(len(prep_eppo_db))):
    syn = str(prep_eppo_db['synonym'][prep_eppo_db_i]) 
    lang = prep_eppo_db['ds_language'][prep_eppo_db_i]
    key = prep_eppo_db['EPPO_CODE'][prep_eppo_db_i]
    sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    prep_ncbi_syn_rows = prep_ncbi_db[prep_ncbi_db['synonym']==syn]
    prep_ncbi_sci_rows = prep_ncbi_db[(prep_ncbi_db['preferred_name']==sci_name) | (prep_ncbi_db['preferred_name']=="'"+sci_name+"'")]
    
    # rows in NCBI that have a matching synonym or scientific name
    prep_ncbi_matching_rows = pd.concat([prep_ncbi_syn_rows, prep_ncbi_sci_rows], ignore_index=True)
    
    taxid_col = (prep_ncbi_matching_rows['NCBI_TAXID'])
    sci_name_col = prep_ncbi_matching_rows['preferred_name']
    
    if len(prep_ncbi_matching_rows) != 0:
        taxid = taxid_col[0]
        sci_name = sci_name_col[0]
    else:
        taxid = ''
        sci_name = prep_eppo_db['preferred_name'][prep_eppo_db_i]
    
    new_eppo_row = pd.DataFrame([[taxid, sci_name, syn, lang, key]], columns=['NCBI_TAXID', 'preferred_name', 'synonym', 'ds_language', 'EPPO_CODE'])
    
    ncbi_eppo_db = pd.concat([ncbi_eppo_db, new_eppo_row], ignore_index=True)

  0%|          | 0/1988 [00:00<?, ?it/s]  0%|          | 1/1988 [00:00<13:46,  2.40it/s]  0%|          | 2/1988 [00:00<13:37,  2.43it/s]  0%|          | 3/1988 [00:01<13:34,  2.44it/s]  0%|          | 4/1988 [00:01<13:34,  2.44it/s]  0%|          | 5/1988 [00:02<13:32,  2.44it/s]  0%|          | 6/1988 [00:02<13:34,  2.43it/s]  0%|          | 7/1988 [00:02<13:33,  2.43it/s]  0%|          | 8/1988 [00:03<13:33,  2.43it/s]  0%|          | 9/1988 [00:03<13:31,  2.44it/s]  1%|          | 10/1988 [00:04<13:31,  2.44it/s]  1%|          | 11/1988 [00:04<13:29,  2.44it/s]  1%|          | 12/1988 [00:04<13:29,  2.44it/s]  1%|          | 13/1988 [00:05<13:26,  2.45it/s]  1%|          | 14/1988 [00:05<13:26,  2.45it/s]  1%|          | 15/1988 [00:06<13:26,  2.45it/s]  1%|          | 16/1988 [00:06<13:26,  2.45it/s]  1%|          | 17/1988 [00:06<13:25,  2.45it/s]  1%|          | 18/1988 [00:07<13:25,  2.45it/s]  1%|          | 19/1988 [00:07<13:23,  2.45it/s]  1%|          | 2

ncbi_eppo_db.to_csv('output.csv')

for i in range(10):
    print('i is:'+str(i))
    for j in range(10):
        print(j)
        if j%5 == 0 and j>0:
            print("break")
            break

#### Add EFSA info to final DB

In [18]:
# copy into a new db to which eppo information will be added 
final_db = ncbi_eppo_db.copy()

## open both eppo (sci and com) dbs and concatenate them
prep_efsa_dbs = [prep_dbs[db_name] for db_name in db_names if 'efsa' in db_name]
prep_efsa_db = pd.concat(prep_efsa_dbs, ignore_index=True)
display(prep_efsa_db.head())
len(prep_efsa_db)

Unnamed: 0,EFSA_PHT,preferred_name,re,match_language
0,AcaloleptaSejuncta-PHT,acalolepta sejuncta,acalolepta sejuncta,
1,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittata,
2,AcalymmaVittatum-PHT,acalymma vittatum,acalymma vittatum,
3,AcalymmaVittatum-PHT,acalymma vittatum,chrysom\wle\w* ray\we du concombre,Fr
4,AcalymmaVittatum-PHT,acalymma vittatum,cistela melanocephala,


7558

In [19]:
match_found = False
## loop over res i n EFSA db
for prep_efsa_db_i in tqdm(range(len(prep_efsa_db))):
    for final_db_i in range(len(final_db)):
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        syn = final_db['synonym'][final_db_i]
        
        ## if re matches a syn, add: re, efsa key, language if None
        if re.match(str(pattern), str(syn)):
            final_db['re'][final_db_i] = prep_efsa_db['re'][prep_efsa_db_i]
            final_db['EFSA_PHT'][final_db_i] = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
            final_db['match_language'][final_db_i] = prep_efsa_db['match_language'][prep_efsa_db_i]
            match_found = True
        
    ## else, if no matches, add new line = re, syn, key, language
    if not match_found:
        pattern = prep_efsa_db['re'][prep_efsa_db_i]
        #syn = prep_efsa_db['synonym'][prep_efsa_db_i]
        key = prep_efsa_db['EFSA_PHT'][prep_efsa_db_i]
        lang = prep_efsa_db['match_language'][prep_efsa_db_i]
        
        new_efsa_row = pd.DataFrame([[pattern, syn, lang, key]], columns=['re', 'syn', 'match_language', 'EFSA_PHT'])
        
        final_db = pd.concat([final_db, new_efsa_row], ignore_index=True)
            
final_db.head()

  0%|          | 0/7558 [00:00<?, ?it/s]  0%|          | 2/7558 [00:00<07:29, 16.80it/s]  0%|          | 5/7558 [00:00<05:54, 21.29it/s]  0%|          | 8/7558 [00:00<05:18, 23.70it/s]  0%|          | 11/7558 [00:00<05:03, 24.86it/s]  0%|          | 14/7558 [00:00<04:53, 25.68it/s]  0%|          | 17/7558 [00:00<04:47, 26.23it/s]  0%|          | 20/7558 [00:00<04:43, 26.63it/s]  0%|          | 23/7558 [00:00<04:40, 26.82it/s]  0%|          | 26/7558 [00:01<04:39, 26.99it/s]  0%|          | 29/7558 [00:01<04:37, 27.09it/s]  0%|          | 32/7558 [00:01<04:41, 26.77it/s]  0%|          | 35/7558 [00:01<04:39, 26.87it/s]  1%|          | 38/7558 [00:01<04:39, 26.86it/s]  1%|          | 41/7558 [00:01<04:40, 26.79it/s]  1%|          | 44/7558 [00:01<04:40, 26.83it/s]  1%|          | 47/7558 [00:01<04:39, 26.84it/s]  1%|          | 50/7558 [00:01<04:39, 26.83it/s]  1%|          | 53/7558 [00:02<04:39, 26.85it/s]  1%|          | 56/7558 [00:02<04:39, 26.86it/s]  1%|       

Unnamed: 0,EPPO_CODE,EFSA_PHT,NCBI_TAXID,preferred_name,synonym,re,ds_language,match_language,syn
0,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening of citrus,greening of citrus,en,,
1,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',greening des agrumes,greening des agrumes,fr,,
2,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',virescence des agrumes,vir\wscence des agrumes,fr,,
3,LIBEAF,LiberibacterAfricanusAsiaticusAmericanus-PHT,,'Candidatus Liberibacter africanus',enverdecimiento de los cítricos,enverdecimiento de los c\wtricos,es,,
4,LIBEAM,,,'Candidatus Liberibacter americanus',Brazilian citrus greening,,en,,


#### add NCBI info in incomplete EFSA lines

In [21]:
## Iterate over lines that have only efsa info
for final_db_i in tqdm(range(len(final_db))):
    pattern = final_db.loc[final_db_i,'re']
    if pattern and pd.isna(final_db.loc[final_db_i,'NCBI_TAXID']) is False:
        for prep_ncbi_db_i in range(len(prep_ncbi_db)):
            syn = prep_ncbi_db.loc[prep_ncbi_db_i,'synonym']
            sci_name = prep_ncbi_db.loc[prep_ncbi_db_i,'preferred_name']
            taxid = prep_ncbi_db.loc[prep_ncbi_db_i,'NCBI_TAXID']
            
            if re.match(str(pattern), str(syn)) or re.match(str(pattern), str(sci_name)):
                final_db.loc[final_db_i,'NCBI_TAXID'] = taxid
                final_db.loc[final_db_i,'preferred_name'] = sci_name
            if re.match(str(pattern), str(syn)):
                final_db.loc[final_db_i,'synonym'] = syn
final_db

  0%|          | 0/2164 [00:00<?, ?it/s]  0%|          | 1/2164 [01:03<38:13:59, 63.63s/it]  0%|          | 2/2164 [02:06<37:58:39, 63.24s/it]  0%|          | 3/2164 [03:09<37:52:37, 63.10s/it]  0%|          | 4/2164 [04:12<37:53:41, 63.16s/it]  0%|          | 5/2164 [05:20<38:46:01, 64.64s/it]  0%|          | 6/2164 [06:24<38:44:39, 64.63s/it]  0%|          | 7/2164 [07:30<39:02:11, 65.15s/it]  0%|          | 8/2164 [08:33<38:34:16, 64.40s/it]  0%|          | 9/2164 [09:36<38:10:32, 63.77s/it]  0%|          | 10/2164 [10:38<37:57:14, 63.43s/it]  1%|          | 11/2164 [11:40<37:39:19, 62.96s/it]  1%|          | 12/2164 [12:43<37:37:51, 62.95s/it]  1%|          | 13/2164 [13:45<37:24:39, 62.61s/it]  1%|          | 14/2164 [14:46<37:10:14, 62.24s/it]  1%|          | 15/2164 [15:50<37:24:13, 62.66s/it]  1%|          | 16/2164 [16:54<37:40:02, 63.13s/it]  1%|          | 17/2164 [17:58<37:41:39, 63.20s/it]  1%|          | 18/2164 [19:02<37:54:10, 63.58s/it]  1%|         

In [None]:
final_db.to_csv(LR_filepath)

In [None]:
sci_name = 'Ralstonia pseudosolanacearum'

prep_ncbi_db[(prep_ncbi_db['synonym']=='bacteria') | (prep_ncbi_db['synonym']=="'"+sci_name+"'")]

Unnamed: 0,NCBI_TAXID,preferred_name,synonym
1,ncbi:2,Bacteria,bacteria
