# Preprocessing

In [1]:
from dsutils.de.files import get_data_path, get_datafile_path, xls_to_csv, sample_csv, get_csv_head
import re
import pandas as pd
import csv
import json
import os
import sys
import collections
from tqdm import tqdm
import numpy as np

In [2]:
# Corpus columns whose text is joined and searched for EFSA keywords
# (see the "Extract texts from corpus" cell).
TEXT_FIELDS = ('title_extraction', 'abstract_extraction', )

## Input
# All input paths are resolved by the project helper `get_datafile_path`.
# NOTE(review): two inputs are .xlsx files, yet `eppo_sci_path` is later read
# with `pd.read_csv`; presumably the helper converts workbooks to CSV (the
# openpyxl warnings in this cell's output suggest so) — confirm.
corpus_path = get_datafile_path('dataset_tiers-esv.csv')
# NOTE(review): `eppo_path` is not used by any cell visible in this notebook.
eppo_path = get_datafile_path('2022-09-02_COMMONnames_EPPO_OQ.xlsx')
re_path = get_datafile_path('EFSA-keyword-match/FichierMotsClesMagaliLarenaudie.csv')
ncbi_path = get_datafile_path('taxa+id_full.txt')
eppo_sci_path = get_datafile_path('2022-09-02_SCIENTIFICnames_EPPO_OQ.xlsx')

## Output
# Both output files are written inside the directory returned by `get_data_path`.
data_path = get_data_path()
efsa_match_json_path = os.path.join(data_path, 'efsa_matches.json')
efsa_glossary_path = os.path.join(data_path, 'efsa_glossary.csv')

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
# Read the whole corpus CSV into a DataFrame.
with open(corpus_path) as corpus_file:
    CORPUS = pd.read_csv(corpus_file)

In [4]:
#sample_path = sample_csv(corpus_path, in_col_select_rows={'id':['61664']})

#with open(sample_path) as f:
#    SAMPLE_CORPUS = pd.read_csv(f)

## Extract EFSA term lang info

### Match with Corpus Language 

In [5]:
# Read the EFSA keyword table (category code + keyword pattern per row).
with open(re_path) as keyword_file:
    EFSA = pd.read_csv(keyword_file, sep=',')
EFSA.head()

Unnamed: 0,Category (pest name),Keywords,Unnamed: 2,Unnamed: 3
0,AcaloleptaSejuncta-PHT,acalolepta+sejuncta,Au/Cabi,
1,AcalymmaVittatum-PHT,acalymma+vittata,Au/Cabi,
2,AcalymmaVittatum-PHT,acalymma+vittatum,Au/Cabi,
3,AcalymmaVittatum-PHT,chrysom_le%+ray_e+du+concombre,EOL/Cabi,Fr
4,AcalymmaVittatum-PHT,cistela+melanocephala,Au/Cabi,


In [6]:
# Preview the corpus CSV through the project helper `get_csv_head`
# (called with size=3; the output below shows three rows).
get_csv_head(corpus_path, size=3)

Unnamed: 0,id,date_publication,titre,auteurs,journal,url,pertinence,sujet,pays_journal,fiabilite,...,class,lang_ft,lang_ft_confidence,fold,chi2_lang,chi2_cat,norm_title_extraction,norm_abstract_extraction,norm_translation_title,norm_translation_abstract
0,59183,2021/09/01,Bureau n° 7583/2021,,,https://dre.pt/web/guest/home/-/dre/168760770/...,3.0,2481.0,175.0,+++,...,1,ht,0.987593,8,99,991,,,,
1,59184,2021/09/01,Qu'est-ce que la bactérie Xylella fastidiosa ?,,,https://agriculture.gouv.fr/quest-ce-que-la-ba...,3.0,2482.0,77.0,+++,...,1,fr,0.97694,7,fr,fr1,qu est ce que la bactérie xylella fastidiosa,info 07 07 2021 santé protection des végétaux ...,xylella fastidiosa bacteria,info 07 07 2021 health plant protection xylell...
2,59186,2021/09/03,Introgression among North American wild grapes...,"A Morales-Cruz, JA Aguirre-Liguori, Y Zhou, A ...",Genome Biol,https://www.ncbi.nlm.nih.gov/pubmed/34479604,0.0,,,,...,0,en,0.800591,8,en,en0,introgression among north american wild grapes...,abstract background introgressive hybridizatio...,introgression among north american wild grapes...,abstract background introgressive hybridizatio...


#### Extract texts from corpus

In [7]:
## USE SAMPLE
#corpus_path = sample_csv(corpus_path, in_col_select_rows={'norm_title_extraction':['huanglongbing']})
#re_path = sample_csv(re_path, in_col_select_rows={'Keywords':['candidatus']})
#print_csv_head(corpus_path, size=6)

In [8]:
# Raise the csv field-size cap so very long text fields do not abort the read.
csv.field_size_limit(sys.maxsize)

# TEXT maps document id -> concatenated text of the TEXT_FIELDS columns;
# LANG maps document id -> value of the `lang_ft` column.
TEXT = {}
LANG = {}

# newline='' is what the csv module requires for files handed to a reader:
# without it, newlines embedded in quoted fields may be interpreted incorrectly.
with open(corpus_path, newline='') as f:
    for cols in csv.DictReader(f, delimiter=','):
        doc_id = cols['id']
        TEXT[doc_id] = ' '.join(cols[tf].strip() for tf in TEXT_FIELDS)
        LANG[doc_id] = cols['lang_ft'].strip()

#### Define helper functions

In [9]:
def to_pattern(syn):
    """Translate an EFSA keyword expression into a regular-expression string.

    The keyword syntax uses three wildcards:

    * ``+`` separates words and becomes a single space;
    * ``_`` stands for exactly one word character (``\\w``);
    * ``%`` stands for ``\\w*`` when it ends a word (last character, or
      directly followed by a word separator) and for ``\\S*\\s?`` anywhere else.

    Every other character is regex-escaped so that it is matched literally.
    Without escaping, characters such as ``(``, ``.`` or ``?`` in a keyword
    would be read as regex syntax: a parenthesis creates a group (which makes
    ``findall`` return the group instead of the whole match) and an unbalanced
    one raises ``re.error``.

    Parameters
    ----------
    syn : str
        Keyword expression, e.g. ``'chrysom_le%+ray_e+du+concombre'``.

    Returns
    -------
    str
        Pattern source suitable for ``re.compile``.
    """
    separators = '+ '
    last_index = len(syn) - 1
    pieces = []
    for index, char in enumerate(syn):
        if char in separators:
            pieces.append(' ')
        elif char == '_':
            pieces.append('\\w')
        elif char == '%':
            ends_word = index == last_index or syn[index + 1] in separators
            pieces.append('\\w*' if ends_word else '\\S*\\s?')
        else:
            pieces.append(re.escape(char))
    return ''.join(pieces)

# Accumulator filled by match_keyword: matched string -> {'langs', 'PHT', 'doc_ids'}.
DATA = dict()
def match_keyword(pht,s):
    """Search every corpus text for keyword ``s`` and record the hits in DATA.

    The keyword is converted with ``to_pattern`` and matched case-insensitively
    against each entry of the module-level ``TEXT`` mapping. Each distinct
    matched string becomes a key of ``DATA`` whose value collects the languages
    (from ``LANG``) and ids of the documents where it was found.

    Parameters
    ----------
    pht : str
        Category code stored under ``'PHT'`` when a matched string is seen for
        the first time; later calls never overwrite it.
    s : str
        Keyword expression in the syntax understood by ``to_pattern``.

    Returns
    -------
    None
        Results are written to the module-level ``DATA`` dict.
    """
    pat = re.compile(to_pattern(s), flags=re.IGNORECASE)
    for did, txt in TEXT.items():
        # finditer + group(0) always yields the full matched text; findall
        # would return sub-groups (or tuples) if the pattern contained groups.
        for hit in pat.finditer(txt):
            m = hit.group(0)
            if not m:
                # A pattern that can match the empty string must not create
                # an empty glossary term.
                continue
            if m not in DATA:
                DATA[m] = {'langs': set(),
                           'PHT' : pht,
                           'doc_ids' : set()}
            DATA[m]['langs'].add(str(LANG[did]))
            DATA[m]['doc_ids'].add(did)
    return

#### Match patterns in corpus

In [10]:
#sys.stderr.write('matching\n')
# Run every EFSA keyword against the corpus; results accumulate in DATA.
# newline='' is the documented way to open a file for csv.reader.
with open(re_path, newline='') as f:
    READER = csv.reader(f, delimiter=',')
    next(READER, None)  # skip the header row (no-op on an empty file)
    for cols in tqdm(READER):
        # Blank or truncated rows have no (category, keyword) pair to match.
        if len(cols) < 2:
            continue
        key, syn = cols[0], cols[1]
        # An empty keyword would compile to a pattern matching everywhere.
        if not syn.strip():
            continue
        match_keyword(key, syn)

7559it [4:10:32,  1.99s/it]


In [11]:
## SAVE
# JSON has no set type, so every non-string field ('langs', 'doc_ids') is
# converted to a list in place. Sorting makes the written file reproducible:
# plain list(set) order depends on string hashing and changes between runs.
for m, mdata in DATA.items():
    for mdat, setdat in mdata.items():
        if not isinstance(setdat, str):
            DATA[m][mdat] = sorted(setdat)

with open(efsa_match_json_path, 'w') as f:
    json.dump(DATA, f)

In [12]:
# Reload the matches from disk so DATA holds plain JSON types (lists, not sets).
with open(efsa_match_json_path, 'r') as match_file:
    DATA = json.load(match_file)
list(DATA.items())[0]

('Acalymma vittatum',
 {'langs': ['en'],
  'PHT': 'AcalymmaVittatum-PHT',
  'doc_ids': ['61664', '65355', '65991']})

In [13]:
## Caracol
#list(DATA.items())[8:12]

### Match with Google Search

## Extract EFSA Scientific Term from NCBI

### Load Data

In [14]:
## Load EPPO_SCI
with open(eppo_sci_path, 'r') as f:
    EPPO_SCI = pd.read_csv(f, on_bad_lines='skip', sep=',')#index_col=0 )
EPPO_SCI.head()

Unnamed: 0,OtherScientificNames,Authority,CodeEOPP,PreferredName,AuthorityPreferredName
0,Citrus greening bacterium (heat-sensitive strain),,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
1,Liberibacter africanum,,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
2,Liberibacter africanus,,LIBEAF,'Candidatus Liberibacter africanus',"Jagoueix, Bové & Garnier"
3,Liberibacter americanus,,LIBEAM,'Candidatus Liberibacter americanus',"Teixeira, Saillard, Eveillard, Danet, da Costa..."
4,Citrus greening bacterium (heat-tolerant strain),,LIBEAS,'Candidatus Liberibacter asiaticus',"Jagoueix, Bové & Garnier"


In [15]:
## Load NCBI
col_names = ['term','ID','kingdom', 'taxon_path', 'POS', 'taxon_ranking', 'NA1', 'NA2', 'lang']
with open(ncbi_path, 'r') as f:
    NCBI = pd.read_csv(f, on_bad_lines='skip', sep='\t', names=col_names )#index_col=0 )
NCBI.head()

Unnamed: 0,term,ID,kingdom,taxon_path,POS,taxon_ranking,NA1,NA2,lang
0,Bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,,
1,bacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NNS,superkingdom,,,
2,eubacteria,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NN,superkingdom,,,
3,Monera,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,,
4,Procaryotae,ncbi:2,Bacteria,/ncbi:1/ncbi:131567/ncbi:2,NP,superkingdom,,,


In [16]:
## Load EFSA json
with open(efsa_match_json_path, 'r') as f:
    DATA = json.load(f)
list(DATA.items())[0]

('Acalymma vittatum',
 {'langs': ['en'],
  'PHT': 'AcalymmaVittatum-PHT',
  'doc_ids': ['61664', '65355', '65991']})

### Helper functions

In [17]:
def PHT_to_lower_taxon(PHT_code):
    """Turn a CamelCase category code into a lower-case, space-separated name.

    The last four characters of ``PHT_code`` are dropped (``'-PHT'`` in the
    example below), a space is inserted before every capital letter except a
    leading one, and the result is lower-cased.
    """
    camel_case_name = PHT_code[:-4]
    spaced_name = re.sub(r'(?<!^)(?=[A-Z])', ' ', camel_case_name)
    return spaced_name.lower()
print('AcalymmaVittatum-PHT:',PHT_to_lower_taxon('AcalymmaVittatum-PHT')) # test

AcalymmaVittatum-PHT: acalymma vittatum


### Align EFSA taxa with NCBI and EPPO

Add translation data (the `sci_term` field) to the match `json` by aligning each EFSA taxon with the NCBI taxa.

In [18]:
# Distinct term strings from each reference table (values are cast to str
# first, then deduplicated through a set).
ncbi_terms = [*set(NCBI['term'].astype(str))]
eppo_sci_terms = [*set(EPPO_SCI['OtherScientificNames'].astype(str))]

In [19]:
# Attach to every match the NCBI terms that contain its EFSA taxon name
# (case-insensitive substring test).
#
# Two loop-invariant costs are hoisted out of the loop:
#   * each NCBI term is lower-cased once instead of once per match;
#   * matches sharing a PHT code share a taxon name, so the scan over the
#     NCBI terms is done once per distinct taxon and then reused.
ncbi_terms_lowered = [(t, str(t).lower()) for t in ncbi_terms]
ncbi_terms_by_taxon = {}
for m, md in tqdm(DATA.items()):
    efsa_taxon = PHT_to_lower_taxon(md['PHT'])
    if efsa_taxon not in ncbi_terms_by_taxon:
        ncbi_terms_by_taxon[efsa_taxon] = [
            t for t, lowered in ncbi_terms_lowered if efsa_taxon in lowered
        ]
    # Copy so that entries never share one mutable list.
    md['sci_term'] = list(ncbi_terms_by_taxon[efsa_taxon])
list(DATA.items())[0]

100%|██████████| 1960/1960 [49:08:00<00:00, 90.25s/it]        


('Acalymma vittatum',
 {'langs': ['en'],
  'PHT': 'AcalymmaVittatum-PHT',
  'doc_ids': ['61664', '65355', '65991'],
  'sci_term': ['Acalymma vittatum (Fabricius)',
   'Acalymma vittatum Fabricius',
   'Acalymma vittatum (Fabricius, 1775)',
   'Acalymma vittatum']})

In [None]:
# Override the corpus-derived languages for matches that are known terms:
# a match listed in EPPO's scientific names is tagged 'la'; otherwise a match
# listed in the NCBI terms is tagged 'en'. Other matches are left unchanged.
# Sets give constant-time membership tests; the term lists would be scanned
# in full for every match.
ncbi_term_set = set(ncbi_terms)
eppo_sci_term_set = set(eppo_sci_terms)
for m, md in tqdm(DATA.items()):
    if m in eppo_sci_term_set:
        md['langs'] = ['la'] # TODO differentiate between scientific name and vernacular
    elif m in ncbi_term_set:
        md['langs'] = ['en']
list(DATA.items())[0]

In [20]:
# Number of entries in DATA (one per distinct matched string).
len(DATA)

1960

In [21]:
# Overwrite the matches file with the enriched DATA.
with open(efsa_match_json_path, 'w') as match_file:
    json.dump(DATA, match_file)

## Make EFSA Glossary

Use extracted info to create a translation glossary

In [22]:
## Load EFSA json
with open(efsa_match_json_path, 'r') as f:
    DATA = json.load(f)
list(DATA.items())[0]

('Acalymma vittatum',
 {'langs': ['en'],
  'PHT': 'AcalymmaVittatum-PHT',
  'doc_ids': ['61664', '65355', '65991'],
  'sci_term': ['Acalymma vittatum (Fabricius)',
   'Acalymma vittatum Fabricius',
   'Acalymma vittatum (Fabricius, 1775)',
   'Acalymma vittatum']})

In [23]:
# One glossary row per (matched term, language) pair. The scientific term is
# the longest candidate in the entry's 'sci_term' list, or None when the list
# is empty.
glossary_rows = [
    (
        term,
        term_lang,
        max(list(info['sci_term']), key=len) if info['sci_term'] else None,
    )
    for term, info in DATA.items()
    for term_lang in info['langs']
]

efsa_glossary = {
    'term': [row[0] for row in glossary_rows],
    'lang': [row[1] for row in glossary_rows],
    'sci_term': [row[2] for row in glossary_rows],
}
efsa_glossary_df = pd.DataFrame(efsa_glossary)

In [24]:
# Preview the first glossary rows (columns: term, lang, sci_term).
efsa_glossary_df.head()

Unnamed: 0,term,lang,sci_term
0,Acalymma vittatum,en,"Acalymma vittatum (Fabricius, 1775)"
1,Chrysomèle rayée du concombre,fr,"Acalymma vittatum (Fabricius, 1775)"
2,chrysomèle rayée du concombre,fr,"Acalymma vittatum (Fabricius, 1775)"
3,striped cucumber beetles,en,"Acalymma vittatum (Fabricius, 1775)"
4,Striped cucumber beetles,en,"Acalymma vittatum (Fabricius, 1775)"


### Save EFSA Glossary

In [25]:
# Write the glossary as CSV without the DataFrame index column.
with open(efsa_glossary_path, 'w') as glossary_file:
    efsa_glossary_df.to_csv(glossary_file, index=False)