#  Lexical Resource (LR) for Annotation

#### Imports

In [1]:
# interface
from tqdm import tqdm
from dsutils.de.files import dbg

## Files and filesystem
import os
import json
import csv
import glob

## Data management
import numpy as np
import pandas as pd

## DS tools
import re
from types import NoneType
from dsutils.de.files import describe_csv, get_csv_head, xls_to_csv, get_data_path, get_datafile_path
from dsutils.nlp.language import get_lang_code

[nltk_data] Error loading words: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


#### Paths

In [2]:
## input
db_paths = dict(
    eppo_com = get_datafile_path('2022-09-02_COMMONnames_EPPO_OQ.xlsx'),
    eppo_sci = get_datafile_path('2022-09-02_SCIENTIFICnames_EPPO_OQ.xlsx'),
    efsa = get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv'),
    ncbi = get_datafile_path('taxa+id_full.txt'),
)

glossary_paths = dict(
    efsa_glossary = get_datafile_path('efsa_glossary.csv'),
    eppo_glossary = get_datafile_path('eppo_glossary.csv'),
)

## Output
data_path = get_data_path()
full_eppo_path = os.path.join(data_path, 'eppo_glossary.csv')
annotation_path = os.path.join(data_path, 'annotation_glossary')
anno_gloss_path = os.path.join(data_path, 'annotation_glossary.csv')
LR_filepath = 'LR.csv'

FileNotFoundError: [Errno 2] No such file or directory: '/home/elubrini/GitHub/bio-corpus-translation/data/2022-09-02_COMMONnames_EPPO_OQ.xlsx'

#### Definitions

Defining functions for preprocessing of original data

In [None]:
def PHT_to_lower_taxon(PHT_code):
    taxon = re.sub(r'(?<!^)(?=[A-Z])', ' ', PHT_code[:-4]).lower()
    return taxon
# print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test
assert PHT_to_lower_taxon('AcalymmaVittatum-PHT') == 'acalymma vittatum'

def to_pattern(syn):
    r = syn.replace('+', ' ')
    r = r.replace('_', '\\w')
    if r.endswith('%'):
        r = r[:-1] + '\\w*'
    r = r.replace('% ', '\\w* ')
    r = r.replace('%', '\\S*\\s?')
    return r
assert to_pattern(r'chrysom_le%+ray_e+du+concombre') == 'chrysom\wle\w* ray\we du concombre'

#### Parameters

In [None]:
ncbi_sep = '\t'

## name of the desired coluimns in the LR
anno_gloss_column_names = ['EPPO_CODE', 'EFSA_PHT', 'NCBI_TAXID', # 3 partial keys to relative dataset
                           'preferred_name', 'synonym', 're',# terms
                           'ds_language', 'match_language'] # languages

In [None]:
## column in each of the original datasets corresponding to the output LR column 
cc = dict(
    eppo_com = ['CodeEOPP', None, None,
                'PreferredName', 'CommonName', None,
                ['Language', get_lang_code], None], # [obj (col name), fun] tuple when information is extracted via a function
    eppo_sci = ['CodeEOPP', None, None,
                'PreferredName', 'OtherScientificNames', None,
                None, None],
    efsa = [None, 'Category (pest name)', None,
                ['Category (pest name)', PHT_to_lower_taxon], None,  ['Keywords', to_pattern],
                'Unnamed: 3', None],
    ncbi = [None, None, 'ncbi:2',
            'Bacteria.1', 'Bacteria', None,
            None, None],
    )

#### Check Data

In [None]:
## USE SMALLER VERSION OF NCBI
ncbi_small = pd.read_csv(db_paths['ncbi'], on_bad_lines='skip', sep='\t').head(100)
ncbi_small.head()
ncbi_small.to_csv(db_paths['ncbi'])

In [None]:
for path in db_paths.values():
    display(describe_csv(path))

Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
count,7558,7558,21,3
unique,1168,7393,5,3
top,Cronartium-PHT,margarodes,Au/Cabi,Fr
freq,85,3,16,1


Unnamed: 0.1,Unnamed: 0
count,100.0
mean,49.5
std,29.011492
min,0.0
25%,24.75
50%,49.5
75%,74.25
max,99.0


In [None]:
pd.read_csv(db_paths['ncbi'])


Unnamed: 0.1,Unnamed: 0,Bacteria,ncbi:2,Bacteria.1,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,Unnamed: 6,Unnamed: 7
0,0,bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NNS,superkingdom,,
1,1,eubacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NN,superkingdom,,
2,2,Monera,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
3,3,Procaryotae,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,
4,4,Prokaryotae,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,


## Create LR

### (1) Preprocess original datasets

Extract relevant information from each dataset and store it in table form (one `csv` per original dataset)

In [None]:
## Display examples of input datasets
display(pd.read_csv(db_paths['eppo_sci']).describe())
pd.read_csv(db_paths['eppo_com']).describe()

Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
count,616,453,664,664,502
unique,611,324,239,239,169
top,Tomato leaf curl New Delhi begomovirus,Corbett,BEMITA,Bemisia tabaci,(Gennadius)
freq,2,6,16,16,16


Unnamed: 0,CommonName,Language,CodeEOPP,PreferredName,AuthorityPreferredName
count,1276,1276,1324,1324,1142
unique,1235,38,239,239,169
top,citrus leprosis,English,HELIZE,Helicoverpa zea,(Boddie)
freq,4,477,30,30,30


In [None]:
db = dict()
for db_name, columns in cc.items():
    db[db_name] = dict()
    cc_zip = zip(columns, anno_gloss_column_names)
    for orig_col, targ_col in cc_zip:
        if isinstance(orig_col, str):
            def fun(x):
                return x
        elif isinstance(orig_col,NoneType):
            continue
        else:
            fun = orig_col[1]
            orig_col = orig_col[0]    
            
        # sep = tab (for ncbi)
        if 'ncbi' in db_name:
            ds_col = pd.read_csv(db_paths[db_name], on_bad_lines='skip', sep='\t')[orig_col]
        else:    
            ds_col = pd.read_csv(db_paths[db_name])[orig_col]
            
        db[db_name][targ_col] = list(map(fun, ds_col)) # new_db col = old_db col
    pd.DataFrame.from_dict(db[db_name]).to_csv(os.path.join(annotation_path, db_name+'_table.csv'), index=False)


KeyError: 'ncbi:2'

### (2) Join Datasets

#### Merge, Case Insensitive

In [None]:
annotation_path = os.path.join(data_path, 'annotation_glossary')
all_files = glob.glob(os.path.join(annotation_path, "*.csv"))
print(all_files)
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
print(df.shape)
df.head()

['/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/re_LR.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/eppo_com_table.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/eppo_sci_table.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/efsa_table.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/annotation_LR.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/ncbi_table.csv', '/home/elubrini/GitHub/bio-corpus-translation/data/annotation_glossary/LR.csv']


  df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)


(17419180, 7)


Unnamed: 0,preferred_name,EPPO_CODE,synonym,ds_language,EFSA_PHT,re,NCBI_TAXID
0,'Abelmoschus esculentus' bunchy top phytoplasma,,'Abelmoschus esculentus' bunchy top phytoplasma,,,,ncbi:926989
1,'Acacia arabica' phytoplasma,,'Acacia arabica' phytoplasma,,,,ncbi:2849181
2,'Acacia farnesiana' leaflet phytoplasma,,'Acacia farnesiana' leaflet phytoplasma,,,,ncbi:2654099
3,'Acacia mangium' little leaf and yellowing phy...,,'Acacia mangium' little leaf and yellowing phy...,,,,ncbi:2530013
4,'Acacia melanoxylon' phytoplasma,,'Acacia melanoxylon' phytoplasma,,,,ncbi:1477896


In [None]:
df = df.groupby(['preferred_name']).last().reset_index().astype(str)
df.shape

(2474128, 7)

In [None]:
EFSA = len((df[df.EFSA_PHT != 'None'].EFSA_PHT))
EPPO = len(((df[df.EPPO_CODE != 'None'].EPPO_CODE)))
EPPO_EFSA = len((df[(df.EPPO_CODE != 'None') & (df.EFSA_PHT != 'None')].EFSA_PHT))
NCBI = len((df[(df.EPPO_CODE != 'None') & (df.EFSA_PHT != 'None')].EFSA_PHT))

dbg(EFSA)
dbg(EPPO)
dbg(EPPO_EFSA)

[35m[32mEFSA: [0m2100
[35m[32mEPPO: [0m239
[35m[32mEPPO_EFSA: [0m164


In [None]:
df.head()

Unnamed: 0,preferred_name,EPPO_CODE,synonym,ds_language,EFSA_PHT,re,NCBI_TAXID
0,'Abelmoschus esculentus' bunchy top phytoplasma,,'Abelmoschus esculentus' bunchy top phytoplasma,,,,ncbi:926989
1,'Acacia arabica' phytoplasma,,'Acacia arabica' phytoplasma,,,,ncbi:2849181
2,'Acacia farnesiana' leaflet phytoplasma,,'Acacia farnesiana' leaflet phytoplasma,,,,ncbi:2654099
3,'Acacia mangium' little leaf and yellowing phy...,,'Acacia mangium' little leaf and yellowing phy...,,,,ncbi:2530013
4,'Acacia melanoxylon' phytoplasma,,'Acacia melanoxylon' phytoplasma,,,,ncbi:1477896


#### Add case information from sibling datasets
Look for elements in list which have case information and add it to lowercase values by replacing them

In [None]:
def find_indices(list_, element):
    indices = [i for i, x in enumerate(list_) if x == element]
    return indices

In [None]:
def add_case_info(mix_list):
    low_list = list(map(lambda x: x.lower(),mix_list))
    match_list = [i==j for i, j in zip(mix_list, low_list)]
    for i in tqdm(range(len(mix_list))):
        if match_list[i]:
            low_term = low_list[i]
            candidates = [t for t in mix_list # candidates for substitution are...
                          if (t.lower()==low_term # (1) the same term as the one being substituted (case insensitive)
                              and not match_list[find_indices(mix_list, t)[0]])] # (2) and not lowercase
            #dbg(mix_list[i])
            mix_list[i] = candidates[0] if len(candidates) else mix_list[i]
            #dbg(mix_list[i])
            #print()
    return mix_list

## test
my_list = ['apple', 'Apple', 'banana', 'BANANA', 'Plum', 'plum', 'berries']
add_case_info(my_list)

  0%|          | 0/7 [00:00<?, ?it/s]100%|██████████| 7/7 [00:00<00:00, 27594.11it/s]


['Apple', 'Apple', 'BANANA', 'BANANA', 'Plum', 'Plum', 'berries']

In [None]:
print(df.shape)
df = df.groupby(['preferred_name']).last().reset_index().astype(str)
print(df.shape)

(2474128, 7)
(2474128, 7)


In [None]:
df['preferred_name'] = add_case_info(df['preferred_name'])

  0%|          | 0/2474128 [00:00<?, ?it/s]  0%|          | 1212/2474128 [00:00<33:23, 1234.22it/s]  0%|          | 1336/2474128 [00:04<2:37:28, 261.72it/s]  9%|▉         | 233962/2474128 [00:04<00:25, 86384.14it/s] 20%|██        | 500181/2474128 [00:04<00:09, 215784.10it/s] 32%|███▏      | 788330/2474128 [00:04<00:04, 394281.17it/s] 44%|████▍     | 1090126/2474128 [00:04<00:02, 623341.04it/s] 57%|█████▋    | 1412106/2474128 [00:04<00:01, 911260.74it/s] 70%|███████   | 1733496/2474128 [00:04<00:00, 1226135.89it/s] 84%|████████▎ | 2067831/2474128 [00:04<00:00, 1572127.16it/s] 84%|████████▎ | 2067831/2474128 [00:23<00:00, 1572127.16it/s] 94%|█████████▎| 2315052/2474128 [00:24<00:03, 46360.12it/s]   94%|█████████▎| 2315052/2474128 [00:43<00:03, 46360.12it/s] 94%|█████████▎| 2315113/2474128 [00:43<00:08, 19138.26it/s] 94%|█████████▎| 2315114/2474128 [00:44<00:08, 18374.63it/s] 94%|█████████▎| 2315114/2474128 [01:03<00:08, 18374.63it/s] 94%|█████████▎| 2315134/2474128 [01:03

In [None]:
df.to_csv(LR_filepath, index=False)
print(df.shape)

NameError: name 'df' is not defined

### Add RegEx information to EPPO and NCBI lines 

In [None]:
df = pd.read_csv(LR_filepath).astype(str)

EFSA = len(set(df[df.EFSA_PHT != 'nan'].EFSA_PHT))
EPPO = len((df[df.EPPO_CODE != 'nan'].EPPO_CODE))
EPPO_EFSA = len((df[(df.EPPO_CODE != 'nan') & (df.EFSA_PHT != 'nan')].EFSA_PHT))

dbg(EFSA)
dbg(EPPO)
dbg(EPPO_EFSA)

In [None]:
## For regex in list of regex:
for pattern, code in tqdm(zip(df['re'], df['EFSA_PHT'])):
    if not isinstance(pattern, str):
        continue
    #try:
    ## For synonym in db:
    for i in range(len(df)):
        ## If there's no regex in line i yet AND there's a match...
        if re.search(str(pattern), str(df['synonym'][i])):   ###df['re'][i] == None and 
            print(df['synonym'][i], pattern)
            df['re'][i] = pattern #...add regex pattern to row in the 're' column 
            df['EFSA_PHT'][i] = code #...add EFSA code to row in the 'EFSA_PHT' column 
            
    #except TypeError:
    #    dbg(type(pattern))

In [None]:
df.to_csv('re_code_'+LR_filepath, index=False)

# Display data

In [None]:
df = pd.read_csv('re_code_'+LR_filepath).astype(str)

EFSA = len(set(df[df.EFSA_PHT != 'nan'].EFSA_PHT))
EPPO = len((df[df.EPPO_CODE != 'nan'].EPPO_CODE))
EPPO_EFSA = len((df[(df.EPPO_CODE != 'nan') & (df.EFSA_PHT != 'nan')].EFSA_PHT))

dbg(EFSA)
dbg(EPPO)
dbg(EPPO_EFSA)

NameError: name 'pd' is not defined

In [None]:
df = df.astype(str)

In [None]:
df[df.EFSA_PHT != 'nan'].head()

Unnamed: 0,preferred_name,EPPO_CODE,synonym,ds_language,EFSA_PHT,re,NCBI_TAXID
124,'Borassus aethiopum' palm lethal yellowing phy...,,'Borassus aethiopum' palm lethal yellowing phy...,,PalmLethalYellowingMycoplasm-PHT,palm lethal yellowing phytoplasma,ncbi:1705292
64409,Aleurocanthus camelliae,,camellia spiny whitefly,,Aleurocanthus-PHT,spiny whitefl\w*,ncbi:1000661
64417,Aleurocanthus spiniferus,ALECSN,citrus spiny whitefly,ja,Aleurocanthus-PHT,spiny whitefl\w*,ncbi:593793
65946,Algerian watermelon mosaic virus,,Algerian watermelon mosaic virus,,WatermelonMosaicVirus-PHT,watermelon mosaic virus,ncbi:515575
85805,American plum line pattern virus,APLPV0,American plum line pattern virus,pt,AmericanPlumLinePatternVirus-PHT,plum line pattern virus,ncbi:134632
