In [10]:
# In this notebook, we build a list of country names in all EU languages.
# The list is used to find when and how often each country was mentioned in the parliament.

In [11]:
from pathlib import Path
from urllib.parse import unquote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [12]:
# Let pandas render up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

In [13]:
# Collect the links to the Wikipedia pages of all UN member states, plus the
# two observers, Palestine and Vatican City
def get_links(url):
    """Collect the article URLs listed in the first ``wikitable`` of a page.

    The page at ``url`` is downloaded and its first table with the CSS class
    ``wikitable`` is scanned. For every row after the first one, the link
    inside the row-header cell (``<th scope="row">``) is turned into an
    absolute ``https://en.wikipedia.org`` URL. The pages for Palestine and
    Vatican City are appended at the end.

    Parameters
    ----------
    url : str
        Address of the Wikipedia page that holds the table.

    Returns
    -------
    list of str
        Absolute article URLs, in table order, followed by the two
        hard-coded observer pages.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``wikitable`` table.
    """
    # A timeout makes a stalled connection fail instead of hanging the notebook.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error rather than parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess.
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', class_='wikitable')
    if table is None:
        raise ValueError(f"No 'wikitable' table found at {url}")

    links = []
    # The first row is skipped, exactly as before (it holds the column headings).
    for row in table.find_all('tr')[1:]:
        header = row.find('th', scope='row')
        anchor = header.find('a') if header is not None else None

        # Rows without a linked row header carry no article to follow.
        if anchor is None or not anchor.get('href'):
            continue

        links.append(f"https://en.wikipedia.org{anchor['href']}")

    palestine = 'https://en.wikipedia.org/wiki/State_of_Palestine'
    vatican = 'https://en.wikipedia.org/wiki/Vatican_City'
    links.extend([palestine, vatican])

    return links

In [14]:
def get_title(link):
    
    identifier = link.split("/wiki/")[-1]
    
    base_url = "https://en.wikipedia.org/w/api.php"
    
    params = {
        "action": "query",
        "format": "json",
        "titles": identifier,
        "redirects": 1  # This ensures that redirects are resolved
    }
    
    response = requests.get(base_url, params=params)
    data = response.json()
    pages = data.get('query').get('pages')
    
    # There should be only one page, so we return the first title
    for page in pages.values():
        
        if page['title'] == 'Realm of New Zealand':
            return 'New Zealand'
        elif page['title'] == 'Kingdom of the Netherlands':
            return 'Netherlands'
        elif page['title'] == 'C%C3%B4te_d%27Ivoire':
            return "Ivory Coast"
        elif page['title'] == 'Democratic_People%27s_Republic_of_Korea':
            return 'North Korea'
        elif page['title'] == 'Lao_People%27s_Democratic_Republic':
            return 'Laos'
        elif page['title'] == 'S%C3%A3o_Tom%C3%A9_and_Pr%C3%ADncipe':
            return 'São Tomé and Príncipe'
        
        else:
            return page['title']
    

In [15]:
# Build a table of country names translated into the given list of languages
def get_translation(languages):
    """Build a table of country names in the requested languages.

    The country pages are taken from the Wikipedia list of UN member states
    (via ``get_links``), resolved to titles (via ``get_title``), and for each
    title the language links of the English article are fetched from the
    Wikipedia API.

    Parameters
    ----------
    languages : iterable of str
        Wikipedia language codes to collect (e.g. ``'fr'``, ``'de'``).

    Returns
    -------
    pandas.DataFrame
        One row per country. The columns are the language codes in sorted
        order followed by ``'en'``, which holds the resolved English title.
        A cell is empty (``None``) when the article has no link for that
        language.
    """
    languages = sorted(languages)
    wanted = set(languages)

    links = get_links("https://en.wikipedia.org/wiki/Member_states_of_the_United_Nations")

    # Sanity check on the scraped list, done before spending one API request
    # per country on resolving titles.
    assert len(links) == 195, f"expected 195 country links, got {len(links)}"

    titles = [get_title(link) for link in links]

    api_url = "https://en.wikipedia.org/w/api.php"

    rows = []

    # For each country title, do the following...
    for title in tqdm(titles, total=len(titles)):

        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "langlinks",
            "llprop": "autonym|langname|url",
            "lllimit": "max",
        }

        response = requests.get(api_url, params=params, timeout=30)
        response.raise_for_status()

        # Nested lookup that falls back to an empty dict if a key is missing
        pages = response.json().get('query', {}).get('pages', {})

        # Key the translations by language code. Appending them positionally
        # would shift every later value (including the English title) into
        # the wrong column whenever an article lacks one of the languages.
        translations = {}

        # Iterate over the pages (although typically there should be only one)
        for page_data in pages.values():
            for langlink in page_data.get('langlinks', []):
                if langlink['lang'] in wanted:
                    # The translated title is stored under the '*' key
                    translations[langlink['lang']] = langlink['*']

        # One cell per requested language, in column order; None if missing
        row = [translations.get(lang) for lang in languages]

        # Save the title as it will be the english version
        row.append(title)

        rows.append(row)

    # Add a column accounting for the english title
    columns = languages + ['en']

    return pd.DataFrame(rows, columns=columns)

In [16]:
def main():
    """Build the country-name translation table, display it and save it as CSV.

    The table is written to ``../output/country-names/first-try.csv``
    (relative to the current working directory). The target folder is
    created if it does not exist yet.
    """
    # The 23 EU languages (without English, which we will infer from the title)
    languages = [
        'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga',
        'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv'
    ]

    df = get_translation(languages)

    display(df)

    output_path = Path("../output/country-names/first-try.csv")
    # Make sure the folder exists: otherwise to_csv fails only after the
    # whole scrape has finished and the result is lost.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path)

In [17]:
# Run the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 