# Program to query the IATE database through its REST API and save the results to a SQLite database
see : https://iate.europa.eu/developers

and : https://documenter.getpostman.com/view/4028985/RztoMTwn?version=latest#api-keys


see also (EU responses to covid-19) : https://eur-lex.europa.eu/homepage.html?locale=en

and : https://op.europa.eu/en/web/eudatathon/covid-19

and : https://eur-lex.europa.eu/homepage.html

http://inmyownterms.com/covid-19-glossaries-dictionaries-terminology/


# A. Database / tables creation (sqlite3)

In [31]:
# create database and tables
# https://www.sqlitetutorial.net/

import sqlite3
    
conn = sqlite3.connect('db/iate-covid19.db')

c = conn.cursor()
# Languages for which the concepts table keeps one definition column (<lang>_def)
langs = ['cs','da','de','en','es','fi','fr','it','nl','is','no','pt']

# Every table is dropped before being created, so this cell can be re-run to reset the database.

# Create table concepts
c.execute('''DROP TABLE IF EXISTS concepts''')
c.execute('''CREATE TABLE concepts
             (id INT PRIMARY KEY, en_lexemes TEXT, cs_def TEXT,da_def TEXT,de_def TEXT,en_def TEXT,es_def TEXT,fi_def TEXT,fr_def TEXT,it_def TEXT,nl_def TEXT,is_def TEXT,no_def TEXT,pt_def TEXT)''')
# Create table concepts_relations
c.execute('''DROP TABLE IF EXISTS concepts_relations''')
c.execute('''CREATE TABLE concepts_relations
             (id_concept1 INT, id_concept2 INT, relation INT, UNIQUE(id_concept1,id_concept2, relation))''')
# Create table concepts_relations_types
# (the column list was previously missing its closing parenthesis, which made the statement invalid SQL)
c.execute('''DROP TABLE IF EXISTS concepts_relations_types''')
c.execute('''CREATE TABLE concepts_relations_types
             (id INT PRIMARY KEY, label VARCHAR(50))''')
# Create table concepts_domains
c.execute('''DROP TABLE IF EXISTS concepts_domains''')
c.execute('''CREATE TABLE concepts_domains
             (id_concept INT, id_domain INT, UNIQUE(id_concept,id_domain))''')
# Create table domains
c.execute('''DROP TABLE IF EXISTS domains''')
c.execute('''CREATE TABLE domains
             (id TEXT PRIMARY KEY, label TEXT, level INT, parent TEXT, label_var TEXT)''')
# Create table languages
c.execute('''DROP TABLE IF EXISTS languages''')
c.execute('''CREATE TABLE languages
             (id VARCHAR(10) PRIMARY KEY, label TEXT)''')
# Create table lexemes
c.execute('''DROP TABLE IF EXISTS lexemes''')
c.execute('''CREATE TABLE lexemes
             (id INT PRIMARY KEY, value VARCHAR(255), query VARCHAR(255), id_concept INT, lang VARCHAR(10), type INT, context TEXT)''')
# Create table lexemes_types
c.execute('''DROP TABLE IF EXISTS lexemes_types''')
c.execute('''CREATE TABLE lexemes_types
             (id INT PRIMARY KEY, label VARCHAR(50))''')
# select examples (the tables were just created, so these print None / [] / nothing)
c.execute("SELECT * FROM concepts")
print(c.fetchone())
print(c.fetchall())
for row in c.execute('SELECT * FROM lexemes'):
    print(row)

# Save (commit) the changes
conn.commit()
conn.close()


None
[]


## Get available languages and populate database / table languages

In [3]:
# get languages
import requests

url = "https://iate.europa.eu/em-api/inventories/_languages?trans_lang=en&expand=true&limit=300&offset=0"
langs = ['cs','da','de','en','es','fi','fr','it','nl','is','no','pt']

payload = {}
headers = {
  'Accept': 'application/json'
}

resp = requests.request("GET", url, headers=headers, data = payload)

# Start from an empty list: when the request fails nothing is inserted, instead of
# hitting a NameError (or re-inserting a stale `res` left over from another cell).
res = []
if resp.status_code == 200:
    # one (language code, language name) row per inventory item
    res = [(item['code'], item['name']) for item in resp.json()['items']]
else:
    print(resp.status_code)

conn = sqlite3.connect('db/iate-covid19.db')
try:
    c = conn.cursor()
    c.executemany('INSERT INTO languages VALUES (?,?)', res)
    conn.commit()
finally:
    # release the database file even when the insert fails
    conn.close()

## Get term types and save them to database / table lexemes_types

In [4]:
# term types
import requests

url = "https://iate.europa.eu/em-api/inventories/_term-types?trans_lang=en&expand=true&limit=20&offset=0"
payload = {}
headers = {
    'Accept': 'application/json'
}

resp = requests.request("GET", url, headers=headers, data = payload)

# Start from an empty list: when the request fails nothing is inserted, instead of
# hitting a NameError (or re-inserting a stale `res` left over from another cell).
res = []
if resp.status_code == 200:
    for item in resp.json()['items']:
        print(item['code'], item['name'])
        # one (term type code, term type name) row per inventory item
        res.append((item['code'],item['name']))
else:
    print(resp.status_code)

conn = sqlite3.connect('db/iate-covid19.db')
try:
    c = conn.cursor()
    c.executemany('INSERT INTO lexemes_types VALUES (?,?)', res)
    conn.commit()
finally:
    # release the database file even when the insert fails
    conn.close()

def get_id_labels(table, db):
    '''
    Load the (id, label) pairs of a lookup table from the sqlite database.

        Parameters:
                    table (str): name of a table that has `id` and `label` columns. It is spliced
                                 into the SQL text (a table name cannot be bound as a parameter),
                                 so it must be a trusted identifier.
                    db (str): path to the sqlite3 database file

        Returns:
                    res (dict): maps the id of each row to its label
    '''
    conn = sqlite3.connect(db)
    try:
        # each row is an (id, label) pair, which is exactly what dict() consumes
        return dict(conn.execute('SELECT id,label FROM ' + table))
    finally:
        # close the connection even when the query fails (e.g. unknown table)
        conn.close()


# Module-level lookup of term types (id -> label) read from the lexemes_types table;
# query_id() uses it to translate the numeric type of each term entry.
termtypes = get_id_labels(table='lexemes_types', db='db/iate-covid19.db')
print(termtypes)

0 abbrev
1 formula
2 phrase
3 short form
4 term
5 lookup form
6 appellation
{0: 'abbrev', 1: 'formula', 2: 'phrase', 3: 'short form', 4: 'term', 5: 'lookup form', 6: 'appellation'}


## Get domains and populate database / table domains

In [5]:
# fill domains
# get domains
import requests
import pickle

def parse_tree_domains(domain, parent, parentname, res):
    '''
    Flatten one node of the domain tree, then recursively all its subdomains, into `res`.

        Parameters:
                    domain (dict): a tree node with 'code', 'name' and 'level' keys and optional
                                   'lookups' (list of str) and 'subdomains' (list of nodes) keys
                    parent (str): the code of the parent node ('' for a root node)
                    parentname (str): the label path of the parent node ('' for a root node)
                    res (list): output list, extended in place with one tuple per node:
                                (code, label path, level, parent code, comma-joined lookups)

        Returns:
                    None
    '''
    lookup = ",".join(domain['lookups']) if 'lookups' in domain else ''
    # A root node, or a child of the placeholder root, starts a fresh label path
    if parentname in ('', 'Domain code not specified'):
        path = domain['name']
    else:
        path = parentname + ' > ' + domain['name']
    res.append((domain['code'], path, domain['level'], parent, lookup))
    for child in domain.get('subdomains', ()):
        parse_tree_domains(child, domain['code'], path, res)
        
def query_domains():
    '''
    Download the IATE domain tree and flatten it with parse_tree_domains().

        Returns:
                    res (list): one tuple per domain (code, label path, level, parent code, lookups).
                                An empty list is returned when the request does not answer with
                                HTTP 200 (the status code is printed), so that the result can
                                always be handed to executemany().
    '''
    url = "https://iate.europa.eu/em-api/domains/_tree"
    headers = {
      'Accept': 'application/json'
    }
    resp = requests.request("GET", url, headers=headers, data = {})
    if resp.status_code != 200:
        # previously the function fell through and returned None here
        print(resp.status_code)
        return []
    res = []
    for d in resp.json()['items']:
        # each top-level item is a root of the tree: no parent code, no parent label
        parse_tree_domains(d, '','', res)
    return res
                    
# Populate the domains table: flatten the IATE domain tree and bulk-insert
# one 5-field row per domain.
conn = sqlite3.connect('db/iate-covid19.db')
c = conn.cursor()
res = query_domains()
c.executemany(
    'INSERT INTO domains VALUES (?,?,?,?,?)',
    res,
)
conn.commit()
conn.close()


In [6]:
# Sanity check: display one domain stored at depth 5 of the domain tree.
conn = sqlite3.connect('db/iate-covid19.db')
c = conn.cursor()
sample_row = c.execute("SELECT * FROM domains where level=5").fetchone()
print(sample_row)
conn.commit()
conn.close()


def get_label(id, table, db):
    '''
    Return the label(s) stored for a given id in a lookup table of the sqlite database.

        Parameters:
                    id: the value to look for in the `id` column
                    table (str): name of a table that has `id` and `label` columns. It is spliced
                                 into the SQL text (a table name cannot be bound as a parameter),
                                 so it must be a trusted identifier.
                    db (str): path to the sqlite3 database file

        Returns:
                    res (list): a list of 1-tuples (label,), empty if the id is unknown
    '''
    conn = sqlite3.connect(db)
    try:
        # The id is bound as a parameter. It used to be pasted between double quotes, which
        # SQLite reads as an identifier first: an id equal to a column name (e.g. "label")
        # compared two columns, and an id containing a quote broke the statement.
        query = 'SELECT label FROM ' + table + ' where id=?'
        return conn.execute(query, (id,)).fetchall()
    finally:
        # close the connection even when the query fails
        conn.close()

def get_domains(table, db):
    '''
    Load the (id, label) pairs of the domains table from the sqlite database.

        Parameters:
                    table (str): name of a table that has `id` and `label` columns. It is spliced
                                 into the SQL text (a table name cannot be bound as a parameter),
                                 so it must be a trusted identifier.
                    db (str): path to the sqlite3 database file

        Returns:
                    res (dict): maps the id of each row to its label
    '''
    conn = sqlite3.connect(db)
    try:
        # each row is an (id, label) pair, which is exactly what dict() consumes
        return dict(conn.execute('SELECT id,label FROM ' + table))
    finally:
        # close the connection even when the query fails (e.g. unknown table)
        conn.close()


# Try a single label lookup (its result is discarded), then load the id -> label map of
# all domains into the module-level `domains` dict that query_id() reads.
get_label(id='9A4F05026F3245BD95BE7DFCE54764AC', table='domains', db='db/iate-covid19.db')
domains = get_domains(table='domains', db='db/iate-covid19.db')
# debug: uncomment to list every domain
#for d in domains.keys():
#    print(d,domains[d])

('5C182C28AE7A4E578AC83588DDCC4235', 'POLITICS > political framework > political philosophy > democracy > deliberative democracy', 5, '8CA089860329450C9C521843B6F7032B', 'discursive democracy,democratic deliberation')


## Populate concepts and lexemes tables

In [20]:
import pprint,pickle, re
from stop_words import get_stop_words

# Pretty-printer configured with a 4-space indent; presumably kept for ad-hoc inspection
# of nested API results (no visible cell calls it).
pp = pprint.PrettyPrinter(indent=4)


def query_code_lexemes(query,lang='en'):
    '''
    Query IATE db for a concept code and return the lexical items in the requested language.
    Only the first entry returned by the search is considered.
    
        Parameters:
                    query (str): the concept code in IATE
                    lang (str): the language code

        Returns:
                    res (list): the list of lexemes (term values); empty when the search returns
                                no entry or the first entry has no terms in `lang`.
                                False is returned when the request does not answer with
                                HTTP 200 (the status code is printed).
    
    '''
    resp = requests.post('https://iate.europa.eu/em-api/entries/_search?expand=true&limit=5', 
                     json={'query':query,'search_in_fields':[8], 'source':'en'})
    if resp.status_code != 200:
        print(resp.status_code)
        return False

    # parse the body once instead of once per access
    payload = resp.json()
    if 'items' not in payload:
        print("Strange behavior : no lexemes for this code :", payload)
        return []
    items = payload['items']
    if not items:
        # an empty result list used to raise IndexError on items[0]
        return []

    # just get first one
    item = items[0]
    if 'language' in item and lang in item['language']:
        return [entry['term_value'] for entry in item['language'][lang]['term_entries']]
    return []
     
def query_code_id(query):
    '''
    Query IATE db for a concept code and return the id of the concept.
    Only the first entry returned by the search is considered.
    
        Parameters:
                    query (str): the concept code in IATE

        Returns:
                    id: the id of the concept; '' when the search returns no entry.
                        False is returned when the request does not answer with
                        HTTP 200 (the status code is printed).
    
    '''
    resp = requests.post('https://iate.europa.eu/em-api/entries/_search?expand=true&limit=5', 
                     json={'query':query,'search_in_fields':[8], 'source':'en'})
    if resp.status_code != 200:
        print(resp.status_code)
        return False

    # parse the body once instead of once per access
    payload = resp.json()
    # a missing 'items' key and an empty 'items' list are handled alike
    # (the empty list used to raise IndexError on items[0])
    if not payload.get('items'):
        print("Strange behavior : no items for this code")
        print(payload)
        return ''
    # just get first one
    return payload['items'][0]['id']
  

def query_id(query, langs):
    '''
    Query IATE db for a concept id and return all the linked information (concept, concept relations, concept domains, lexemes).

    Relies on two module-level lookups: `termtypes` (term type id -> label) and `domains` (domain code -> label).
    
        Parameters:
                    query (str): the concept id in IATE
                    langs (list): the list of languages to be queried for lexemes

        Returns:
                    allitems, concept,concept_rels,concept_domains, lexemes
                    allitems (list): one dict per entry returned by the search, holding all information retrieved (debug/display)
                    concept (list): the list of information for the concept (id, list of entries in English, definitions in all required languages, if exist);
                                    [query, ''] when no English entries were found
                    concept_rels (list): the list of the concept relations to other concepts (id_concept1, id_concept2, relation_type)
                    concept_domains (list): the list of the domains the concept belongs to (id_concept, id_domain)
                    lexemes (list): the list of lexemes linked to the concept, for all required languages (id_lexeme,lexeme_value,lexeme_query,id_concept, lang, lexeme_type,context)
                    Five empty lists are returned when the request fails or the answer has no 'items' key.
                    When the search returns several entries, concept_rels and concept_domains come from the last one.
    '''

    resp = requests.post('https://iate.europa.eu/em-api/entries/_search?expand=true&limit=30', 
                     json={'query':query,'search_in_fields':[9], 'source':'en', 'targets':langs})
    if resp.status_code != 200:
        print(resp.status_code)
        return [],[],[],[],[]

    # parse the body once instead of once per access
    payload = resp.json()
    if 'items' not in payload:
        print(payload)
        return [], [],[],[], []
    items = payload['items']
    print(str(len(items)) + " entries")

    # Every returned value is initialised up front, so an empty result list or an entry
    # without 'language' data can no longer raise UnboundLocalError at the return.
    allitems = []
    concept = [query, '']   # default concept: no English lexemes
    concept_rels = []
    concept_domains = []
    lexemes = []

    for item in items:
        # structure for displaying results (debug)
        itemdata = {'id': query}

        # crossrefs: one extra IATE request per related concept for its lexemes, one for its id
        if 'crossrefs' in item:
            itemdata['crossrefs'] = [(query_code_lexemes(crossref['code']),crossref['type']) for crossref in item['crossrefs']]
            concept_rels = [(query, query_code_id(crossref['code']),crossref['type']) for crossref in item['crossrefs']]
        else:
            concept_rels = []

        # domains: labels for display (the raw code when unknown), raw codes for the db
        if 'domains' in item:
            itemdata['domains'] = [domains[dom['code']]  if dom['code'] in domains else dom['code'] for dom in item['domains']]
            concept_domains = [(query, dom['code']) for dom in item['domains']]
        else:
            concept_domains = []

        # language entries
        if 'language' in item:
            # structure for saving to db/table
            lexemes = []
            for lang in item['language']:
                if lang not in langs:
                    continue
                lang_data = item['language'][lang]
                # structure for display/debug purposes
                itemdata[lang] = {'entries': []}
                # definition : we store it at the concept level
                if 'definition' in lang_data:
                    itemdata[lang + '_def'] = lang_data['definition'] + " - " + lang_data['definition_references'][0]['text']
                else:
                    itemdata[lang + '_def'] = ''

                for entry in lang_data['term_entries']:
                    # patch for mapping Czech iso code
                    stopw = get_stop_words('cz' if lang == 'cs' else lang)
                    # Split on every non-word character. re.I used to be passed positionally,
                    # i.e. as maxsplit=2, so only the first two separators were split on.
                    # Empty tokens (consecutive separators) are dropped as well.
                    tokens = re.split(r"\W", entry['term_value'])
                    itementry = {
                        'value': entry['term_value'],
                        'query': " ".join(w for w in tokens if w and w not in stopw),
                        'type': termtypes[entry['type']],
                    }
                    if 'contexts' in entry:
                        # just keep first context
                        first_context = entry['contexts'][0]
                        itementry['context'] = first_context['context']
                        itementry['context_ref'] = first_context['reference']['text']
                        itementry['language_usage'] = first_context.get('language_usage', '')
                        itementry['regional_usage'] = first_context.get('regional_usage', '')
                    else:
                        itementry['context'] = ''
                        itementry['context_ref'] = ''
                        itementry['language_usage'] = ''
                        itementry['regional_usage'] = ''

                    # same field order as the columns of the lexemes table
                    lexemes.append((entry['id'], itementry['value'], itementry['query'], query, lang, entry['type'], itementry['context']))
                    itemdata[lang]['entries'].append(itementry)

            # concept with english samples and one definition per requested language
            if 'en' in itemdata:
                entries = ", ".join([itementry['value'] for itementry in itemdata['en']['entries']])
                concept = [query, entries]
                concept.extend([itemdata[code + '_def'] if code + '_def' in itemdata else '' for code in langs])

        # each entry is recorded once (it used to be appended once per matching language)
        allitems.append(itemdata)

    return allitems, concept,concept_rels,concept_domains, lexemes

def save_iate_to_db(data, table, conn):
    '''
    Save data retrieved from IATE to sqlite database/table.
    The insertion is all-or-nothing: if any row is rejected, the rows of this call
    that were already inserted are rolled back.
    
        Parameters:
            data(list) : a list of tuples to save to db (all of the same length)
            table (str): the name of the table to save to (spliced into the SQL text,
                         so it must be a trusted identifier)
            conn (obj): the connection to the sqlite3 db file (left open)
            
        Returns:
            True|False (bool) : True or False depending on success
                                (False also when data is empty)
    '''
    # check data is not empty
    if len(data)==0:
        print('Empty data')
        return False
    # first get number of fields in data elements
    nb = len(data[0])
    placeholders = ",".join('?' * nb)
    print(str(nb) + ' fields')
    # generate query, e.g. 'INSERT INTO concepts VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    query = 'INSERT INTO ' + table + ' VALUES (' + placeholders + ')'
    print(query)

    try:
        c = conn.cursor()
        c.executemany(query,data)
        conn.commit()
        # total_changes is cumulative over the lifetime of the connection
        print("Total insertions into " + table + ": ",conn.total_changes)
        return True
    except sqlite3.Error as e:
        # Discard the rows inserted before the failing one; otherwise they would stay
        # pending and be committed by the next commit on this connection.
        conn.rollback()
        print("Error : " + str(e))
        return False

def save_to_pickle(data,pathdir):  
    '''
    Serialize data to a pickle file.

        Parameters:
            data (obj): any picklable object
            pathdir (str): the path of the file to write (overwritten if it exists)
    '''
    # the context manager closes the file even when pickle.dump raises
    with open(pathdir,'wb') as outfile:
        pickle.dump(data,outfile)

# main : trial run on one given concept id (query IATE, pickle each result, insert into the db)
langs = ['cs','da','de','en','es','fi','fr','it','nl','is','no','pt']
id_concept = '3588006'
save_dir = './save/'
res, concept,concept_rels,concept_domains, lexemes = query_id(id_concept, langs)

# report the size of every retrieved structure
for _caption, _rows in (("concept fields : ", concept),
                        ("concept relations : ", concept_rels),
                        ("concept domains : ", concept_domains)):
    print(_caption + str(len(_rows)))
print(str(len(lexemes)) + " lexemes")

# save to pickle : one file per structure, named after the concept id
for _suffix, _rows in (('_concept.pickle', concept),
                       ('_concept_relations.pickle', concept_rels),
                       ('_concept_domains.pickle', concept_domains),
                       ('_lexemes.pickle', lexemes)):
    save_to_pickle(_rows, save_dir + id_concept + _suffix)

# now populate database/tables (the single concept is wrapped in a list of one row)
conn=sqlite3.connect('db/iate-covid19.db')
for _table, _rows in (('concepts', [concept]),
                      ('concepts_relations', concept_rels),
                      ('concepts_domains', concept_domains),
                      ('lexemes', lexemes)):
    save_iate_to_db(_rows, _table, conn)
conn.close()




1 entries
concept fields : 14
concept relations : 5
concept domains : 2
69 lexemes
14 fields
INSERT INTO concepts VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
Error : UNIQUE constraint failed: concepts.id
3 fields
INSERT INTO concepts_relations VALUES (?,?,?)
Error : UNIQUE constraint failed: concepts_relations.id_concept1, concepts_relations.id_concept2, concepts_relations.relation
2 fields
INSERT INTO concepts_domains VALUES (?,?)
Error : UNIQUE constraint failed: concepts_domains.id_concept, concepts_domains.id_domain
7 fields
INSERT INTO lexemes VALUES (?,?,?,?,?,?,?)
Error : UNIQUE constraint failed: lexemes.id


# Launch process for the list of covid-19 related concepts (as of July 2020 from IATE website)

In [None]:
# load reference data
import pandas as pd
import sys
import datetime


df = pd.read_csv('./resources/OP_Covid19_IATE_2872020.csv')
df.drop(['IATE entry URL'], inplace=True, axis=1) # ,'IATE entry URL.1'
print(df.info())
ids = set(df['IATE ID'].unique())
print("Unique concepts : ", len(ids))
# launch search into IATE db and save data to db
# required languages
langs = ['cs','da','de','en','es','fi','fr','it','nl','is','no','pt']
save_dir = './save/'

# the loop variable is not called `id`, so the builtin id() stays usable in later cells
for iate_id in ids :
    id_concept = str(iate_id)
    print("Processing concept id : " + id_concept)
    res, concept,concept_rels,concept_domains, lexemes = query_id(id_concept, langs)
    print("concept fields : " + str(len(concept))) # ok
    print("concept relations : " + str(len(concept_rels))) # ok
    print("concept domains : " + str(len(concept_domains))) # ok
    print(str(len(lexemes)) + " lexemes")
    # save to pickle
    save_to_pickle(concept, save_dir + id_concept + '_concept.pickle')
    save_to_pickle(concept_rels, save_dir + id_concept + '_concept_relations.pickle')
    save_to_pickle(concept_domains, save_dir + id_concept + '_concept_domains.pickle')
    save_to_pickle(lexemes, save_dir + id_concept + '_lexemes.pickle')
            
    # now populate database/tables
    conn=sqlite3.connect('db/iate-covid19.db')
    try:
        save_iate_to_db([concept], 'concepts', conn)
        save_iate_to_db(concept_rels, 'concepts_relations', conn)
        save_iate_to_db(concept_domains, 'concepts_domains', conn)
        save_iate_to_db(lexemes, 'lexemes', conn)
    finally:
        # release the database file even if one of the saves raises
        conn.close()

That's it! Now all the data is saved into the sqlite db. Check the db subdirectory to find an additional script to manage the db (especially to convert it to MySQL).

# Requests to db

In [164]:
import pandas,re

# 1 Request id concept, definition, list of domains, and for all concerned languages the list of lexicalizations

conn = sqlite3.connect('db/iate-covid19.db')
# rows behave like mappings, so columns can be read by name
conn.row_factory = sqlite3.Row
c = conn.cursor()

# all data in array
alldata = []
# data query
q = 'select id,en_lexemes, en_def from concepts'
c.execute(q)
res = [dict(row) for row in c.fetchall()]

for elt in res:
    # create dict structure for this concept with id_concept,en_lex, all languages lexicalizations, domains
    eltdata={}
    eltdata['id_concept']= elt['id']
    eltdata['iate_link']= 'https://iate.europa.eu/entry/result/' + str(elt['id'])
    # strip the markup tags from the definition; a NULL definition becomes ''
    eltdata['en_def']= re.sub(r"<[^>+?]*?>","", elt['en_def'] or '')
    eltdata['en']= elt['en_lexemes']
    
    # lexemes : group the values by language, one comma-separated string per language
    # (the concept id is bound as a parameter instead of being pasted into the SQL text)
    lexs = {}
    c.execute('select * from lexemes where id_concept=?', (elt['id'],))
    for row in c.fetchall():
        lexs.setdefault(row['lang'], []).append(row['value'])
    for lang in lexs:
        eltdata[lang]= ", ".join(lexs[lang])

    # domains : the left join yields a NULL label for a domain code missing from the
    # domains table; such labels are skipped
    c.execute('select d.label as domain from concepts_domains as cd left join domains as d on cd.id_domain = d.id  where cd.id_concept=?', (elt['id'],))
    domain_labels = [row['domain'] for row in c.fetchall() if row['domain'] is not None]
    # A concept without any domain gets '' (this used to raise KeyError). A local name is
    # used so that the module-level `domains` dict read by query_id() is not overwritten.
    eltdata['domains']=", ".join(domain_labels)
    alldata.append(eltdata)
    
conn.close()

# save to json file
import json
with open('iate_data.json', 'w') as f:
    json.dump(alldata, f)

# then convert to excel and csv
pandas.read_json("iate_data.json").to_excel("iate_data.json.xlsx", index=False)
pandas.read_json("iate_data.json").to_csv("iate_data.json.csv", index=False)

In [167]:
# get the most diffused anglicisms (i.e. words in English that also appear in other languages)
import re
conn = sqlite3.connect('db/iate-covid19.db')
conn.row_factory = sqlite3.Row
c = conn.cursor()

# main query : one row per (concept, non-English lexeme) pair
q = 'select c.id as id_concept, c.en_lexemes as en_lexemes, l.value as lexeme, l.lang as lang from concepts as c join lexemes as l on c.id = l.id_concept where l.lang !="en";'
c.execute(q)
res = [dict(row) for row in c.fetchall()]
# everything needed is now in memory (the connection used to be left open)
conn.close()

# english keywords
stoplist=['for','the','and','2019','joint','open','time','type','error','Equity','Growth','acquired','unit','audio','order','value','partial','app','Loan','extra','asset','assets','towards','single','Next','Generation','NextGenerationEU','sector','level','security','negative','frontier','operation','context','operations','network','communicable','agent','lay','period','body','means','Board','specific','against']
kw_en ={}
for elt in res:
    for lex in elt['en_lexemes'].split(', '):
        for kw in lex.split(' '):
            # Words of 1-2 characters and stop words are left out altogether
            # (they used to be written with a count forced to 1).
            if len(kw) <= 2 or kw in stoplist:
                continue
            kw_en[kw] = kw_en.get(kw, 0) + 1
with open('iate_data.covid19.english_keywords.csv', mode="w") as fout:
    for elt in sorted(kw_en, key=lambda x:kw_en[x],reverse=True):
        fout.write(elt +','+str(kw_en[elt]) + "\n")
    

# structure : anglicisms['covid19']=['cs','fr']
anglicisms={}
anglicisms_part={}
for elt in res:
    en_lexs = elt['en_lexemes'].split(', ')
    lexeme = elt['lexeme']
    # The lexeme is found (exactly or as a part) in one or more of the en lexemes.
    # Plain substring test: the lexeme used to be compiled as a regular expression, so
    # characters such as '(' or '+' changed the match or raised re.error.
    matches = [en_lex for en_lex in en_lexs if lexeme in en_lex]
    if matches:
        for match in matches:
            anglicisms.setdefault(match, []).append(elt['lang'])
    else:
        # Conversely : one or more of the en lexemes are found as a part of the lexeme.
        # (This branch used to test the empty `matches` list and therefore never ran.)
        for match in en_lexs:
            if match and match in lexeme:
                anglicisms_part.setdefault(match, []).append(elt['lang'])

print(anglicisms) 
print("*"*10)
print(anglicisms_part)

# write results to csv

with open('iate_data.covid19.en_diffused.csv', mode="w") as fout:
    for elt in sorted(anglicisms, key=lambda x:len(anglicisms[x]),reverse=True):
        fout.write(elt + "," + str(anglicisms[elt]) + "\n")


{'SARS-CoV-2': ['cs', 'de', 'es', 'fi', 'fr', 'it', 'nl', 'pt'], '2019-nCoV': ['cs', 'da', 'de', 'es', 'fi', 'fr', 'it', 'nl', 'pt'], 'severe acute respiratory syndrome coronavirus 2': ['da', 'nl'], 'CSPP': ['da', 'de', 'es', 'fr', 'it', 'pt'], 'EWRS': ['cs', 'de'], 'CoV': ['cs', 'da', 'de', 'es', 'fi', 'fr', 'it', 'nl'], 'coronavirus': ['da', 'es', 'fr', 'it', 'nl'], 'interferon': ['da', 'it', 'nl'], 'IFN': ['de', 'fr', 'it'], 'LVP': ['fr'], 'LVP solution': ['fr'], 'web conference': ['it'], 'CEAOB': ['cs', 'fi', 'fr', 'it', 'nl'], 'MSC': ['it'], 'GPMB': ['de', 'es', 'fr', 'it', 'nl'], 'Global Preparedness Monitoring Board': ['it'], 'ritonavir': ['cs', 'da', 'es', 'fr', 'it', 'nl', 'pt'], 'contact': ['nl'], 'conference call': ['it'], 'RMS': ['de'], 'triage': ['cs', 'da', 'fi', 'fr', 'it', 'nl'], 'immunoglobulin': ['da'], 'capsid': ['da'], 'EU Integrated Political Crisis Response (IPCR) arrangements': ['da', 'de', 'fi', 'fr', 'fr', 'pt'], 'EU IPCR arrangements': ['da', 'de', 'fi', 'fr',