In [1]:
import json
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pywikibot
import requests
import seaborn as sns
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json

In [17]:
# Build the lookup table used to map a URL ending to a country.
# Each line of country_url_end.txt is tab-separated: the first column becomes the
# value (the country) and the second column, minus its first character, becomes the
# key (presumably that character is the leading '.' of e.g. '.uk' -- verify against
# the file).
url_end_dic = dict()
with open("country_url_end.txt", "r") as file:  # 'with' guarantees the handle is closed
    for line in file:
        # Strip only the newline: slicing off the last character unconditionally
        # would truncate the final line when the file does not end with '\n'.
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            continue  # blank or malformed line: skip it instead of raising IndexError
        url_end_dic[fields[1][1:]] = fields[0]

In [4]:
def get_country_website(url, url_end_dic) -> Optional[str]:
    """This function finds the country in which the company of the url is based,
    e.g. for www.guardian.co.uk it returns the country stored under 'uk' in url_end_dic.

    The lookup stops at the first step that yields a country:
        1. the text after the last '.' of the url is looked up in url_end_dic;
        2. the whole url is searched on Wikidata and the country of the first hit is read;
        3. the longest dot-separated part of the url is searched on Wikidata instead.
    input:
        url: str, url of which the country needs to be found
        url_end_dic: dict, maps a url ending (without the dot) to a country
    output:
        country: str, found country, or None when no step produced one"""

    url_parts = url.split('.')
    url_ending = url_parts[-1]
    if url_ending in url_end_dic:
        return url_end_dic[url_ending]

    country = None
    q_website = get_identifier(url)
    if q_website:
        country = get_country_from_identifier(q_website)

    if country is None:
        # Fall back to the longest part of the url (e.g. 'guardian' for
        # 'www.guardian.co.uk'). Skip it when it equals the url itself: the same
        # search was just performed above.
        url_try = max(url_parts, key=len)
        if url_try != url:
            q_website = get_identifier(url_try)
            if q_website:
                country = get_country_from_identifier(q_website)

    return country

def get_country_from_identifier(q_website) -> Optional[str]:
    """This function finds the country (Wikidata property P17) of a wikidata item
    input:
        -q_website: str, wikidata identifier (Q) of the item
    output:
        : str, English label of the first P17 target that has one; None when the
          item is a redirect, has no P17 claim, or no target has an English label"""
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, q_website)

    if item.isRedirectPage():
        return None

    claims = item.get()["claims"]
    if "P17" not in claims:
        return None

    for clm in claims["P17"]:
        clm_trgt = clm.getTarget()
        if clm_trgt is None:
            continue  # claim without a concrete target value: nothing to read
        # .text is indexed like the dict returned by item.get(), as in the
        # original code -- NOTE(review): confirm against the pywikibot version in use.
        labels = clm_trgt.text["labels"]
        if "en" in labels:
            return labels["en"]
        # no English label on this target: try the next claim instead of raising KeyError

    return None

def get_identifier(item, timeout=10.0) -> Optional[str]:
    """This function finds the wikidata identifier for a given string input (item)
    using the 'wbsearchentities' action of the Wikidata API.
    input:
        -item: str, item of which you want the wikidata identifier
        -timeout: float, seconds to wait for the API before requests raises a
          timeout error (without it a stalled connection blocks forever)
    output:
        : str, wikidata identifier of the first search hit, or None when item is
          empty or the search returns no hits"""

    if not item:
        return None  # nothing to search for: skip the network round trip

    params = dict(
        action='wbsearchentities',
        format='json',
        language='en',
        uselang='en',
        type='item',
        search=item,
    )

    response = requests.get(
        'https://www.wikidata.org/w/api.php?', params=params, timeout=timeout
    ).json()
    hits = response.get('search')
    if hits:
        return hits[0]['id']
    return None

In [2]:
def get_country_speaker(q_person) -> Optional[list]:
    """This function finds the nationalities of a person based on
    a wikidata identifier (Q), using Wikidata property P27 (country of citizenship)
    input:
        -q_person: str, wikidata identifier of which you want to find the country
    output:
        : list of str, English labels of the P27 targets (targets without a value
          or without an English label are skipped); None when the item is a
          redirect or has no P27 claim"""
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, q_person)

    # Same convention as get_country_from_identifier: a redirect yields no result.
    if item.isRedirectPage():
        return None

    claims = item.get()["claims"]
    if "P27" not in claims:
        return None

    nationality = []
    for clm in claims["P27"]:
        clm_trgt = clm.getTarget()
        if clm_trgt is None:
            continue  # claim without a concrete target value
        # .text is indexed like the dict returned by item.get(), as in the
        # original code -- NOTE(review): confirm against the pywikibot version in use.
        labels = clm_trgt.text["labels"]
        if "en" in labels:
            nationality.append(labels["en"])

    return nationality

In [16]:
def assign_country_to_speaker(speaker_id, speaker_country_lib):
    """Return the country of a speaker, consulting a cache before looking it up.
    input:
        -speaker_id: wikidata identifier of the speaker, used as the cache key
        -speaker_country_lib: dict, cache of speaker_id -> country; updated in
          place when the speaker was not present yet
    output:
        : tuple (country of the speaker, the same speaker_country_lib object)"""
    # Cache hit: hand back the stored value without any lookup.
    if speaker_id in speaker_country_lib:
        return speaker_country_lib[speaker_id], speaker_country_lib

    # Cache miss: look the speaker up once and remember the outcome for future use.
    found = get_country_speaker(speaker_id)
    speaker_country_lib[speaker_id] = found
    return found, speaker_country_lib
        
    
def assign_country_to_url(url, url_country_lib, tld_country_lib=None):
    """Return the country of a url, consulting a cache before looking it up.
    input:
        -url: str, url of which the country needs to be found; used as the cache key
        -url_country_lib: dict, cache of url -> country; updated in place when
          the url was not present yet
        -tld_country_lib: dict, url ending -> country table handed to
          get_country_website; defaults to the notebook-level url_end_dic
    output:
        : tuple (country of the url, the same url_country_lib object)"""
    if url in url_country_lib:
        return url_country_lib[url], url_country_lib

    if tld_country_lib is None:
        # get_country_website requires the url-ending table; use the one built
        # at notebook level when the caller does not supply its own.
        tld_country_lib = url_end_dic

    url_country = get_country_website(url, tld_country_lib)
    url_country_lib[url] = url_country  # assign found country for future use (keyed by url)
    return url_country, url_country_lib
        