## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import urllib.parse
import urllib.request

from ftfy import fix_text

## Load Entity Data

In [2]:
# Two-letter language code -> English language name.
# Insertion order matters: lang_codes is built from the key order.
code_to_lang_dict = dict(
    bg="Bulgarian",
    ca="Catalan",
    cs="Czech",
    da="Danish",
    de="German",
    en="English",
    es="Spanish",
    fr="French",
    hr="Croatian",
    hu="Hungarian",
    it="Italian",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ro="Romanian",
    ru="Russian",
    sl="Slovenian",
    sr="Serbian",
    sv="Swedish",
    uk="Ukrainian",
)

In [3]:
# Language code -> name of the large spaCy pipeline package to load for it.
# English is the odd one out: its pipelines are published as "en_core_web_*"
# (there is no "en_core_news_*" package), so spacy.load("en_core_news_lg")
# would fail.
# Note: bg, cs, hu, sl and sr appear in code_to_lang_dict but have no entry here.
code_to_spacy_model_dict = {
    "ca": "ca_core_news_lg",
    "da": "da_core_news_lg",
    "de": "de_core_news_lg",
    "en": "en_core_web_lg",
    "es": "es_core_news_lg",
    "fr": "fr_core_news_lg",
    "hr": "hr_core_news_lg",
    "it": "it_core_news_lg",
    "nl": "nl_core_news_lg",
    "pl": "pl_core_news_lg",
    "pt": "pt_core_news_lg",
    "ro": "ro_core_news_lg",
    "ru": "ru_core_news_lg",
    "sv": "sv_core_news_lg",
    "uk": "uk_core_news_lg",
}

In [4]:
# All supported language codes, in the insertion order of code_to_lang_dict.
lang_codes = [*code_to_lang_dict]

In [5]:
# Load the per-entity accuracy / language table produced by the error analysis.
entity_analysis_df = pd.read_csv(
    filepath_or_buffer="../../data/error_analysis/entity_analysis_language_and_accuracy_by_entity.csv",
)

In [6]:
# Preview the first five rows.
entity_analysis_df.head(n=5)

Unnamed: 0,entity,num_correct,num_incorrect,total_usages,percent_accuracy,languages,num_languages,alternate_forms,dataset_ids
0,Prius,16,0,16,1.0,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'ca': 1, ...",16,"{'sr': 'Приус', 'uk': 'Prius', 'nl': 'Prius', ...",['calinet_8922']
1,Sundar Pichai,18,1,19,0.947368,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'ca': 1, ...",19,"{'sr': 'Сундар Пицхаи', 'uk': 'Сундар Пічаї', ...",['rome_5025']
2,People's Republic of China,17,0,17,1.0,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'hu': 1, ...",17,"{'sr': 'Народна Република Кина', 'uk': 'Народн...",['rome_21333']
3,Sint Maarten,11,10,21,0.52381,"{'sr': 1, 'nl': 2, 'sv': 1, 'ca': 1, 'pl': 1, ...",14,"{'sr': 'Синт Маартен', 'nl': 'Sint Maarten', '...","['rome_8738', 'rome_20596']"
4,Haas House,9,5,14,0.642857,"{'sr': 1, 'nl': 1, 'sv': 1, 'hu': 1, 'ca': 1, ...",14,"{'sr': 'Хаас Хоусе', 'nl': 'Haas House', 'sv':...",['rome_8783']


So we have roughly 23k entities to work with. We're interested in how many times each one is mentioned on Wikipedia.

In [7]:
# Number of rows in the 'entity' column.
entity_analysis_df['entity'].shape[0]

23257

In [8]:
def get_wikipedia_pages(lang, debug=False, n=50):
    """Randomly sample articles from one language's Wikipedia.

    Issues a ``list=random`` query against ``https://{lang}.wikipedia.org``
    restricted to namespace 0 and collects the id and title of every page
    in the response.

    Parameters
    ----------
    lang : str
        Language code used as the Wikipedia subdomain (e.g. ``"bg"``).
    debug : bool, default False
        When true, print a summary line (or a failure line) to stdout.
    n : int, default 50
        Number of random pages to request (sent as ``rnlimit``).
        NOTE(review): the original note mentions a maximum of 500 -- confirm
        the limit that applies to your account against the API docs.

    Returns
    -------
    dict
        Maps page id -> page title for each page in the response.

    Raises
    ------
    RuntimeError
        If the decoded response has no ``query.random`` list.
    """
    # construct URL for API call
    articles_url = (
        f"https://{lang}.wikipedia.org/w/api.php"
        f"?action=query&list=random&format=json&rnnamespace=0&rnlimit={n}"
    )

    # fetch the payload; the context manager closes the connection for us
    with urllib.request.urlopen(articles_url) as response:
        data = response.read()
        encoding = response.info().get_content_charset('utf-8')

    obj = json.loads(data.decode(encoding))

    # fall back to the raw code so an unknown language cannot break debug output
    lang_name = code_to_lang_dict.get(lang, lang) if debug else lang

    if 'query' not in obj or 'random' not in obj['query']:
        if debug:
            # report the URL that was requested, not the response object
            print(f"Unable to grab articles from {lang_name} using URL {articles_url}.")
        raise RuntimeError(
            f"unexpected response from {articles_url}: no 'query.random' list"
        )

    ids = {page['id']: page['title'] for page in obj['query']['random']}

    if debug:
        print(f"Fetched {len(ids)} articles from {lang_name} wikipedia")
    return ids

In [9]:
# Sample random articles from the first language in lang_codes, with debug output on.
first_lang_code = lang_codes[0]
info_to_check = get_wikipedia_pages(first_lang_code, debug=True)
del first_lang_code  # keep the notebook namespace unchanged

Fetched 50 articles from Bulgarian wikipedia


In [10]:
# Display the sampled {page id: title} mapping returned by get_wikipedia_pages.
info_to_check

{515511: 'Hippocampus patagonicus',
 782753: 'Ден на енергетика',
 155589: 'Малки Зондски острови',
 278662: 'Градац',
 553496: 'Acropora bushyensis',
 658845: 'God Hates Us All',
 298323: 'Марк Юний Метий Руф',
 155968: 'Геополитика',
 438116: 'Даяна Апълярд',
 514282: 'Rhipidomys venustus',
 403001: 'Добромир Добрев',
 656236: 'Генчо Скордев',
 509713: 'Ficedula hyperythra',
 89001: 'Лази',
 122222: 'Териер',
 219032: 'Пафта',
 47846: 'Траурни потапници',
 207079: 'Инвърклайд',
 297410: 'Публий Стертиний Кварт',
 567237: 'Holothuria impatiens',
 82135: 'Кюдо',
 244488: 'Иван Георгиев (агроном)',
 782117: 'Джилиан Филип',
 467918: 'Извън обхват',
 517836: 'Cookeconcha contorta',
 675501: 'Август Лудвиг фон Барби-Мюлинген',
 633750: 'Георги Паспалев',
 430188: 'Вилейка',
 826479: 'Москитос',
 122020: 'Лейкуд (Калифорния)',
 118986: 'Венцислав Върбанов',
 649225: 'Филип Аврамов (композитор)',
 123828: 'Евмел',
 643241: 'Heinkel He 176',
 663471: 'Митрофан III Константинополски',
 35272:

In [11]:
def get_article_info(article_title, pageid, lang, debug=False):
    """Fetch the plain-text extract of one Wikipedia article.

    For an inputted article_id:title combination we hit, for example:
    https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Kerala&prop=extracts&explaintext
    good response - {"batchcomplete":"","query":{"pages":{"14958":{"pageid":14958,"ns":0,"title":"Kerala"
    bad response  - {"batchcomplete":"","query":{"pages":{"-1":{"ns":0,"title":"Kerala","missing":""}}}}

    Parameters
    ----------
    article_title : str
        Title of the article to look up.
    pageid : int or str
        Page id the caller expects for that title. Compared as a string,
        because the ids are the (string) keys of the JSON ``pages`` object.
    lang : str
        Language code used as the Wikipedia subdomain; lower-cased before use.
    debug : bool, default False
        When true, print the request URL and the reason for any empty result.

    Returns
    -------
    dict
        ``{article_title: text}`` on success, where ``text`` is the extract
        passed through ``fix_text``. An empty dict when the title or language
        is empty, the request raised ``UnicodeDecodeError``, the page is
        missing, the returned page id differs from ``pageid``, or the page
        has no ``extract``.

    Raises
    ------
    RuntimeError
        If the decoded response has no ``query.pages`` object.
    """
    # validate inputs
    if article_title == '' or article_title is None:
        if debug:
            print("Can't parse empty title.")
        return {}

    if lang == '' or lang is None:
        if debug:
            print("Input a language.")
        return {}

    lang = lang.lower()

    # format title via quote escapes to ensure non-ascii chars can get handed off properly
    quoted_title = urllib.parse.quote(article_title)

    # construct url
    # where lang is the language we are requesting
    # and quoted title refers to our article
    info_url = f"https://{lang}.wikipedia.org/w/api.php?action=query&format=json&titles={quoted_title}&prop=extracts&explaintext&format=json"

    if debug:
        print(f"calling {info_url} to retrieve info about {article_title} from {lang} wiki.")

    # grab data; the context manager closes the connection once it is read
    try:
        with urllib.request.urlopen(info_url) as response:
            data = response.read()
            encoding = response.info().get_content_charset('utf-8')
    except UnicodeDecodeError:
        print(f"could not decode API call for {article_title} on {lang} wiki; url is {info_url}.")
        return {}

    obj = json.loads(data.decode(encoding))

    if 'query' not in obj or 'pages' not in obj['query']:
        if debug:
            print(f"Error parsing response for {article_title} from {lang} wiki.")
        raise RuntimeError(
            f"unexpected response from {info_url}: no 'query.pages' object"
        )

    pages = obj['query']['pages']

    # check for a 'missing'/bad response -- JSON object keys are strings,
    # so the marker is the string "-1", never the integer -1
    if not pages or '-1' in pages:
        if debug:
            print(f"No wiki data found for {article_title} on {lang} wiki.")
        return {}

    # get pageid of the returned article
    data_pageid = next(iter(pages))

    # double check pageid matches the one returned by API; normalise both to
    # str so the integer ids produced by get_wikipedia_pages compare equal
    if str(data_pageid) != str(pageid):
        if debug:
            print(f"id mismatch -- excpected {pageid} but retrieved {data_pageid} for {article_title} on {lang} wiki.")
        return {}

    page = pages[data_pageid]

    # check if text is properly returned
    if 'extract' not in page:
        if debug:
            print(f"could not retrieve text from {pageid} {article_title} on {lang} wiki.")
        return {}

    # get text and run it through ftfy's fix_text
    content = fix_text(page['extract'])

    return {article_title: content}

In [12]:
# Smoke-test the fetcher on a known English article.
# Other title / page id / language combinations tried during development:
# get_article_info('Кун Теменужков','645695', 'bg', debug=True)
# get_article_info('Penguin Books', '925064', 'fr', debug=True)
# get_article_info('Barack Obama','430434', 'es', debug=True)
obama_info = get_article_info(
    article_title='Barack Obama',
    pageid='534366',
    lang='en',
    debug=False,
)

In [15]:
def load_spacy_models(langs_to_use, code_to_spacy_model_dict):
    """Load one spaCy pipeline per requested language.

    Parameters
    ----------
    langs_to_use : iterable of str
        Language codes to load pipelines for.
    code_to_spacy_model_dict : dict
        Maps a language code to the spaCy package name passed to ``spacy.load``.

    Returns
    -------
    list of dict
        One single-entry dict ``{lang: loaded_pipeline}`` per requested
        language, in the order given.

    Raises
    ------
    KeyError
        If a requested language has no entry in ``code_to_spacy_model_dict``.
    """
    # spaCy is not imported in the notebook's import cell, so bring it into
    # scope here; otherwise this function fails with NameError on a fresh kernel.
    import spacy

    # Key each entry by the language code itself (the previous list-literal
    # key was unhashable and raised TypeError on the first language).
    return [
        {lang: spacy.load(code_to_spacy_model_dict[lang])}
        for lang in langs_to_use
    ]

In [16]:
def count_entities_in_article(target_entity, article_content, language, debug=False):
    """Placeholder for counting target-entity mentions in an article.

    Not implemented yet: every argument is ignored and ``None`` is returned.

    Planned behaviour, per the original notes: for the inputted article
    content, count how many of our target entities appear in the text,
    always searching both English and the native language in case of
    translation inconsistencies.
    """
    # TODO: implement the actual counting
    return

In [17]:
# Reference snippet only -- this cell is a bare string literal, so none of the
# code inside it runs. It sketches counting named entities with a loaded spaCy
# pipeline and a Counter keyed by "<label>:<text>".
'''

nlp = spacy.load(...) # load your model

from collections import Counter

ents = Counter()

text = ... # your text
for ent in nlp(text).ents:
    ents[f"{ent.label_}:{ent.text}"] += 1

for key, val in ents.items():
    print(val, key, sep="\t")
'''

'\n\nnlp = spacy.load(...) # load your model\n\nfrom collections import Counter\n\nents = Counter()\n\ntext = ... # your text\nfor ent in nlp(text).ents:\n    ents[f"{ent.label_}:{ent.text}"] += 1\n\nfor key, val in ents.items():\n    print(val, key, sep="\t")\n'