## Imports

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import urllib.parse
import urllib.request

from ftfy import fix_text

## Load Entity Data

In [3]:
# Wikipedia language-edition codes (used as the subdomain in API URLs)
# mapped to the English name of each language.
code_to_lang_dict = dict(
    bg="Bulgarian",
    ca="Catalan",
    cs="Czech",
    da="Danish",
    de="German",
    en="English",
    es="Spanish",
    fr="French",
    hr="Croatian",
    hu="Hungarian",
    it="Italian",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ro="Romanian",
    ru="Russian",
    sl="Slovenian",
    sr="Serbian",
    sv="Swedish",
    uk="Ukrainian",
)

In [33]:
# Language codes in the same order as they are declared in code_to_lang_dict.
lang_codes = [*code_to_lang_dict]

In [4]:
# Load the per-entity table (accuracy counts, languages, alternate forms,
# dataset ids) produced by the error-analysis step.
entity_analysis_df = pd.read_csv(
    filepath_or_buffer="../../data/error_analysis/entity_analysis_language_and_accuracy_by_entity.csv",
)

In [5]:
# Preview the first five rows of the entity table.
entity_analysis_df.head(n=5)

Unnamed: 0,entity,num_correct,num_incorrect,total_usages,percent_accuracy,languages,num_languages,alternate_forms,dataset_ids
0,Prius,16,0,16,1.0,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'ca': 1, ...",16,"{'sr': 'Приус', 'uk': 'Prius', 'nl': 'Prius', ...",['calinet_8922']
1,Sundar Pichai,18,1,19,0.947368,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'ca': 1, ...",19,"{'sr': 'Сундар Пицхаи', 'uk': 'Сундар Пічаї', ...",['rome_5025']
2,People's Republic of China,17,0,17,1.0,"{'sr': 1, 'uk': 1, 'nl': 1, 'sv': 1, 'hu': 1, ...",17,"{'sr': 'Народна Република Кина', 'uk': 'Народн...",['rome_21333']
3,Sint Maarten,11,10,21,0.52381,"{'sr': 1, 'nl': 2, 'sv': 1, 'ca': 1, 'pl': 1, ...",14,"{'sr': 'Синт Маартен', 'nl': 'Sint Maarten', '...","['rome_8738', 'rome_20596']"
4,Haas House,9,5,14,0.642857,"{'sr': 1, 'nl': 1, 'sv': 1, 'hu': 1, 'ca': 1, ...",14,"{'sr': 'Хаас Хоусе', 'nl': 'Haas House', 'sv':...",['rome_8783']


So we have roughly 23k entities to work with. We're interested in how many times they are mentioned on Wikipedia.

In [6]:
# Number of rows in the entity column.
entity_analysis_df['entity'].size

23257

In [47]:
# For a given language, randomly sample <n> main-namespace articles
# (at most 500 per request) and return a dict of their id and title.
def get_wikipedia_pages(lang, debug=False, n=50):
    """Fetch a random sample of article ids and titles from one Wikipedia.

    Parameters
    ----------
    lang : str
        Wikipedia language-edition code, used as the subdomain (e.g. "bg").
    debug : bool, optional
        When True, print a short success / failure message.
    n : int, optional
        Number of random articles to request. Clamped to 1..500, 500 being
        the per-request maximum noted in the original comment. Defaults to
        50, the value that used to be hard-coded in the URL.

    Returns
    -------
    dict
        ``{page_id: title}`` for every article in the API response.

    Raises
    ------
    RuntimeError
        If the decoded response has no ``query.random`` list.
    """
    rnlimit = max(1, min(int(n), 500))

    # Build the query string with urlencode so every parameter appears once
    # and is properly escaped.
    query = urllib.parse.urlencode({
        "action": "query",
        "list": "random",
        "format": "json",
        "rnnamespace": 0,  # main/article namespace only
        "rnlimit": rnlimit,
    })
    articles_url = f"https://{lang}.wikipedia.org/w/api.php?{query}"

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(articles_url) as response:
        data = response.read()
        encoding = response.info().get_content_charset('utf-8')
    obj = json.loads(data.decode(encoding))

    if 'query' not in obj or 'random' not in obj['query']:
        if debug:
            # .get() keeps the debug path from raising KeyError on a code
            # that is not listed in code_to_lang_dict.
            print(f"Unable to grab articles from {code_to_lang_dict.get(lang, lang)} using URL {articles_url}.")
        raise RuntimeError(
            f"No random articles in the response for language code {lang!r} ({articles_url})"
        )

    ids = {m['id']: m['title'] for m in obj['query']['random']}

    if debug:
        print(f"Fetched {len(ids)} articles from {code_to_lang_dict.get(lang, lang)} wikipedia")
    return ids

In [48]:
# Smoke-test the fetcher against the first configured language.
first_lang_code = lang_codes[0]
info_to_check = get_wikipedia_pages(first_lang_code, debug=True)

Fetched 50 articles from Bulgarian wikipedia


In [49]:
# Inspect the sampled {page_id: title} mapping.
info_to_check

{811139: 'Вентрология',
 212800: 'Заза',
 215038: 'Телевизионен продуцент',
 106193: 'Георги Димитров (дипломат)',
 221756: 'Грузинци',
 701675: 'Ненчо Илиев (писател)',
 648744: 'Покапаля',
 110978: 'Нанси Пелоси',
 327331: 'Мамерк Емилий Скавър',
 23929: 'Величково (област Варна)',
 813443: 'Шоуто на Патрик Звездата',
 49919: 'Пясъчна усойница',
 508699: 'Asthenes maculicauda',
 202356: 'Требине',
 486733: '763 (пояснение)',
 49000: 'Сухуми',
 581534: 'Хайнрих II (Насау-Байлщайн)',
 481296: 'Лудвиг фон Майсен',
 735399: 'Прекинато Градище',
 558041: 'Blennodesmus scapularis',
 421748: 'Because We Can - The Tour',
 198060: 'Евангелие от Лука',
 345344: 'Алберто Куадри',
 513685: 'Sylvilagus mansuetus',
 99367: 'Лъчезар Балтанов',
 786102: 'Станиш Дельохаджиев',
 168280: 'Димитър Савов',
 442360: 'Румен Стоянов (футболист, р. 1976)',
 487490: 'Санта Тереза Галура',
 404490: 'Лошице',
 491294: 'Молизе (село)',
 47053: 'Адриан (император)',
 638153: 'Али Хаменеи',
 166565: 'Олга Николаев

In [None]:
# For an inputted article title, get the article's plain-text content.
# e.g. for Kerala:
#
# https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Kerala&prop=extracts&explaintext
#
# good response (followed by the extract and other fields):
#   {"batchcomplete":"","query":{"pages":{"14958":{"pageid":14958,"ns":0,"title":"Kerala", ...
#
# bad response - implies there is no such article or the API call went rogue:
#   {"batchcomplete":"","query":{"pages":{"-1":{"ns":0,"title":"Kerala","missing":""}}}}
def get_article_info(title, lang="en"):
    """Fetch the plain-text extract of one Wikipedia article.

    Parameters
    ----------
    title : str
        Article title exactly as Wikipedia reports it; it is URL-encoded here.
    lang : str, optional
        Wikipedia language-edition code used as the subdomain. Defaults to
        "en", matching the example URL in the comment above.

    Returns
    -------
    str or None
        The article text passed through ``fix_text``, or ``None`` when the
        response contains no page with an ``extract`` (e.g. a "missing" page).
    """
    # urlencode escapes the title (spaces, '&', non-ASCII) so it cannot
    # corrupt the query string.
    query = urllib.parse.urlencode({
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": 1,  # boolean flag: presence requests plain text, not HTML
    })
    info_url = f"https://{lang}.wikipedia.org/w/api.php?{query}"

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(info_url) as response:
        data = response.read()
        encoding = response.info().get_content_charset('utf-8')
    obj = json.loads(data.decode(encoding))

    # "pages" is keyed by page id; a missing article shows up under "-1"
    # with no "extract" field.
    pages = obj.get('query', {}).get('pages', {})
    for page in pages.values():
        if 'extract' in page:
            return fix_text(page['extract'])
    return None

In [None]:
def count_entities_in_article():
    """Placeholder for counting target entities in an article's text.

    Planned behaviour (not implemented yet): given article content, count
    how many of our target entities appear in it, always searching both
    English and the native language in case of translation inconsistencies.

    Returns
    -------
    None
        Always, until the counting logic is written.
    """
    # TODO: implement; this is still a stub.
    return