In [492]:
# Import standard python modules
import json, time, urllib.parse

# Impoert non-standard Python modules. You will need to install these with pip/pip3 if you do not already have it
import requests
import pandas as pd
import numpy as np

In [3]:
# Read in given csv files
politician_df = pd.read_csv('politicians_by_country_AUG.2024.csv')
population_df = pd.read_csv('population_by_country_AUG.2024.csv')

### Page info request to get current revision id

Starter code:
The only change made was that I added my email

In [7]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<ehh4@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = [ 'Bison', 'Northern flicker', 'Red squirrel', 'Chinook salmon', 'Horseshoe bat' ]

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [189]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

End starter code

Get revision id's and add them to a new column, 'lastrevid', in a copy of the politiican dataframe

In [125]:
# Create copy of politician DataFrame
revid_df = politician_df.copy()
# Create a new column in the DataFrame 'lastrevid' for the last revision id's to be stored
revid_df['lastrevid'] = None

In [493]:
# Create list to store names of article titles that have to available revision id
no_rev_id_list = []


def get_lastrevid(name):
    """
    Collects last revision ID for the given article title using the Wikipedia pageinfo API function
    If the last revision ID is not available, the article title is added to the no_rev_id_list which
    defined outside the funtion.

    Args:
        name: The name of the article for the pageinfo API request, type str

    Returns:
        last revision id (type int) value if one exists for article or None if one does not exist
    """
    # Try except block to catch exception if API return is not what is expected
    try:
        # Call pageinfo API
        info = request_pageinfo_per_article(name)
        # Parse through API result to get the last revision id
        headers = info['query']['pages']
        number_key = list(headers.keys())[0]

        return headers[number_key]['lastrevid']

    except Exception as e:
        # If API result did not contain expected values / format, add article name to no_rev_id_list and return none
        no_rev_id_list.append(name)
        return None

In [None]:
# Apply the get_lastrevid function to each article title, name, in the DataFrame and store the value in 'lastrevid' column
revid_df['lastrevid'] = revid_df['name'].apply(get_lastrevid)

### ORES Request

starter code:

Changes made: added my email and my own username and access token

In [28]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED  # The key authorizes 5000 requests per hour

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "ehh4@uw.edu",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = { 'Bison':1085687913 , 'Northern flicker':1086582504 , 'Red squirrel':1083787665 , 'Chinook salmon':1085406228 , 'Horseshoe bat':1060601936 }

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

In [29]:
# Fill in own username and access token for Wikipedia API account
my_username = " "
my_access_token = " "

In [30]:
#   Once you've done the right set up with your Wikimedia account, it should provide you with three different keys, a Client ID,
#   a Client secret, and a Access token.
#
#   In this case I don't want to distribute my keys with the source of the notebook, so I wrote a key manager object that helps
#   track all of my API keys - a username and domain name retrieves the key. The key manager hides the keys on disk separate
#   from the code. A common code idiom to hide API keys will use code to extract the key from an OS environment variable.
#
#   In the Homework 2 folder you should be able to find a zip file containing the apikeys user module. Install this module
#   into the folder where you keep all of your user modules. This is also the folder that your PYTHONPATH variable points to.
#
from apikeys.KeyManager import KeyManager
keyman = KeyManager()

#
#   This is my Wikipedia/Wikimedia username. They suggest you request your keys using your Wikipedia username, so I
#   also stored the API key using my Wikipedia username.
#
#   You should probably use your own username here.
####print(key_info[0]['description'])
#print(ACCESS_TOKEN)
#
#   Note: if you don't want to use the key manager to help manage your API keys, you can specify the values as constants
#   below. Just don't distribute the notebook without removing the constants or you'll be distributing your key too.
#
USERNAME = my_username
ACCESS_TOKEN = my_access_token

In [31]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT,
                                   model_name = API_ORES_EN_QUALITY_MODEL,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE,
                                   header_format = REQUEST_HEADER_TEMPLATE,
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


End starter code

In [110]:
# Create list to store article revision ids that do not have available ORES scores
no_score_list = []

def get_ores_score(revid):
    """
    Collects the ORES article quality score for a given revision id using the Wikipedia ORES API.
    If the ORES score is not available, the revision id is added to the no_score_list, which is defined
    outside the function.

    Args:
        revid: The revision id of the article to use for ORES API request, type int

    Returns:
       ORES score value (type str) if one exists for article or None if one does not exist
    """
    # Request the ORES score for the given article revision id
    score = request_ores_score_per_article(article_revid=revid,
                                           email_address="ehh4@uw.edu",
                                           access_token=ACCESS_TOKEN)

    # Check if the API response contains the expected values
    if 'enwiki' in score:
        # Parse through API return
        headers = score["enwiki"]["scores"]
        number_key = list(headers.keys())[0]

        # Return the article quality score
        return headers[number_key]['articlequality']['score']['prediction']
    else:
        # If the expected ORES data is not present, add revision id to the no_score_list and return None
        no_score_list.append(revid)
        return None

In [None]:
# Create a subset of revid_df where 'name' is not in no_rev_id_list because these have None for 'lastrevid' value
ores_df = revid_df.loc[~revid_df['name'].isin(no_rev_id_list)].copy()

In [120]:
# Create a new column in the DataFrame 'ores' for the ores score to be stored
ores_df['ores'] = None

In [121]:
# Apply the get_ores_score function to each last revision id, lastrevid, in the DataFrame and store the value in 'ores' column
ores_df['ores'] = ores_df['lastrevid'].apply(get_ores_score)

In [505]:
# Get names of the last revision id's that had no Ores score (no_score_list)
no_ores_list = revid_df[revid_df['lastrevid'].isin(no_score_list)]['name'].tolist()

# Combine the list of names that did not have last revision ids and the list of names that did not have ores scores
export_list = no_ores_list + no_rev_id_list

# Create DataFrame with this list and export it to csv
no_score_df = pd.DataFrame({'name' : export_list})
no_score_df.to_csv('no_score.csv')

In [506]:
# Compute score error rate (num articles with no score / total num articles)
error_rate = len(export_list)/ len(politician_df)
print(f'The score error rate is {error_rate}')

The score error rate is 0.0030747728860936407


### Process population_df to get the countries and their corresponding region 

In [425]:
country_list = [] # List of countries for country_region_df
region_list = [] # List of regions for country_region_df
region_only = [] # List of regions for region_df
region_pop = [] # List of region populations for region_df
region = '' # initialize temp variable

# Loop through 'Geography' column
for g in population_df['Geography']:


    if g.isupper():
        # If g is uppercase assign to temp variable
        region = g
        # Get the population for the region
        pop = population_df[population_df['Geography'] == region]['Population'].values[0]
        # Append population to region_pop list
        region_pop.append(pop)
        # Append region to region_only list
        region_only.append(region)
        # Continue to next value in 'Geography' column, this allows for only the last of back to back uppercase values to be stores for a countries corresponding region
        continue

    # If g is not uppercase then append g to country_list and append stored region to region_list
    country_list.append(g)
    region_list.append(region)




In [420]:
# Create a DataFrame for countries and thier regions
county_region_df = pd.DataFrame({ 'country' : country_list,
                                 'region': region_list
                                 })

In [170]:
# Merge the county_region_df with the 'Geography' and 'Population' columns of population_df to get the population for each country
country_region_pop_df = county_region_df.merge(population_df[['Geography','Population']],
                              left_on='country',
                              right_on='Geography',
                              how='inner')


In [171]:
# Drop 'Gerography' column
country_region_pop_df.drop(columns='Geography', inplace=True)
# Rename column
country_region_pop_df.rename(columns={'Population' :
                              'population'}, inplace=True)

In [437]:
# Create a DataFrame for regions and their populations
region_df = pd.DataFrame({ 'region' : region_only,
                            'population': region_pop
                                 })

In [229]:
# Merge the ores_df with country_region_pop_df
merged_df = country_region_pop_df.merge(ores_df[['name', 'country', 'lastrevid', 'ores']],
                                        left_on='country',
                                        right_on='country',
                                        how='right')

In [269]:
# Rename columns
merged_df.rename(columns= {'name' : 'article_title',
                            'lastrevid' : 'revision_id',
                            'ores' : 'article_quality'}, inplace=True)


Note: The following are politicians that are listed multiple times for different countries, I am chosing to leave these in because after researching some, I found that they may have been a politician in a country that was made up of multiple of the current countries at the time they were serving thier term.

In [270]:
merged_df[merged_df.duplicated(subset=['article_title'])]

Unnamed: 0,country,region,population,article_title,revision_id,article_quality
1844,Czechia,EASTERN EUROPE,10.9,Count Václav Antonín Chotek of Chotkov and Vojnín,1214682376,Start
1855,Czechia,EASTERN EUROPE,10.9,Eduard Hedvicek,1215043825,Stub
1895,Czechia,EASTERN EUROPE,10.9,"Leopold, Count von Thun und Hohenstein",1215186791,C
2104,Ethiopia,EASTERN AFRICA,126.5,Ibrahim Harun,1131754356,Stub
2586,Honduras,CENTRAL AMERICA,9.7,José Francisco Barrundia,1227213476,Start
2859,Indonesia,SOUTHEAST ASIA,278.7,Manuel Carrascalão,1209003568,Start
3589,Korean,,,Bak Jungyang,1215565447,C
3667,Kosovo,SOUTHERN EUROPE,1.7,Visar Ymeri,1248230838,Stub
3691,Kyrgyzstan,CENTRAL ASIA,6.8,Torokul Dzhanuzakov,1242155530,C
3851,Lithuania,NORTHERN EUROPE,2.9,Tadeusz Kościuszko,1249440331,FA


### Analysis

Top 10 countries by coverage: The 10 countries with the highest total articles per capita (in descending order) .

In [331]:
# Get count of articles per country by counting article_title
article_count_by_country = merged_df.groupby('country').count()['article_title']

In [294]:
# Get DataFrame of unique countries and thier populations
country_pop = merged_df[~merged_df.duplicated(subset = 'country')][['country', 'population']]

In [464]:
# Merge countries and their populations with the count of article titles
article_per_capita = country_pop.merge(article_count_by_country,
                                       left_on = 'country',
                                        right_on= 'country',
                                         how='inner' )

In [465]:
# Replace countires with 0.0 population for calculation of articles per capita
article_per_capita.replace(0.0, np.nan, inplace=True)

In [466]:
# Create calculated column for articles per capita
article_per_capita['articles_per_capita'] = article_per_capita['article_title'] / article_per_capita['population']

In [470]:
# Sort by articles_per_capita in descending order and print table
print('10 countries with the highest total articles per capita (articles per million people)')
article_per_capita[['country', 'articles_per_capita']].sort_values(by='articles_per_capita', ascending=False).head(10)

10 countries with the highest total articles per capita (articles per million people)


Unnamed: 0,country,articles_per_capita
4,Antigua and Barbuda,330.0
99,Federated States of Micronesia,140.0
97,Marshall Islands,130.0
153,Tonga,90.0
12,Barbados,83.333333
132,Seychelles,60.0
103,Montenegro,58.333333
94,Maldives,55.0
17,Bhutan,55.0
126,St. Vincent and the Grenadines,40.0


Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita (in ascending order) .

In [471]:
# Sort by articles_per_capita in ascending order and print table
print('10 countries with the lowest total articles per capita (articles per million people)')
article_per_capita[['country', 'articles_per_capita']].sort_values(by='articles_per_capita', ascending=True).head(10)

10 countries with the lowest total articles per capita (articles per million people)


Unnamed: 0,country,articles_per_capita
33,China,0.011337
58,Ghana,0.087977
67,India,0.105698
129,Saudi Arabia,0.135501
167,Zambia,0.148515
111,Norway,0.181818
71,Israel,0.204082
47,Egypt,0.304183
73,Cote d'Ivoire,0.323625
52,Ethiopia,0.347826


Top 10 countries by high quality: The 10 countries with the highest high quality articles per capita (in descending order) .

In [473]:
# Filter merged_df for article qualities in GA or FA to get high quality only
quality_df = merged_df[merged_df['article_quality'].isin(['GA', 'FA'])]

In [474]:
# Get counts of GA or FA articles for each country by counting article_title from filtered df, quality_df
article_counts = quality_df.groupby('country').count()['article_title']

In [475]:
# Get DataFrame of unique countries and thier populations
country_pop = merged_df[~merged_df.duplicated(subset = 'country')][['country', 'population']]

In [476]:
# Merge quality counts and county/population together
high_quality_per_capita = country_pop.merge(article_counts,
                                       left_on = 'country',
                                        right_on= 'country',
                                         how='inner' )

In [477]:
# Create calculated column for high quality articles per capita
high_quality_per_capita['high_quality_articles_per_capita'] = high_quality_per_capita['article_title'] / high_quality_per_capita['population']

In [478]:
print('10 countries with the highest high quality articles per capita (articles per million people)')
high_quality_per_capita[['country', 'high_quality_articles_per_capita']].sort_values(by='high_quality_articles_per_capita', ascending=False).head(10)

10 countries with the highest high quality articles per capita (articles per million people)


Unnamed: 0,country,high_quality_articles_per_capita
66,Montenegro,3.333333
58,Luxembourg,2.857143
1,Albania,2.592593
52,Kosovo,2.352941
61,Maldives,1.666667
57,Lithuania,1.37931
26,Croatia,1.315789
41,Guyana,1.25
71,Palestinian Territory,1.090909
83,Slovenia,0.952381


Bottom 10 countries by high quality: The 10 countries with the lowest high quality articles per capita (in ascending order).

In [327]:
# The countries with no high quality articles were filtered out above so we can find the countries that are missing from the merged DataFrame, high_quality_per_capita
zero_high_quality = country_pop[~country_pop['country'].isin(high_quality_per_capita['country'])]

In [480]:
# Create column for high quality articles per capita
zero_high_quality['high_quality_articles_per_capita'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zero_high_quality['high_quality_articles_per_capita'] = 0


In [484]:
# Set row limit to show all countries with 0 high quality articles per capita
pd.set_option('display.max_rows', 67)
print('Countries with the lowest high quality articles per capita (articles per million people)')
zero_high_quality[['country', 'high_quality_articles_per_capita']]

Countries with the lowest high quality articles per capita (articles per million people)


Unnamed: 0,country,high_quality_articles_per_capita
284,Antigua and Barbuda,0
570,Bahamas,0
694,Barbados,0
835,Belize,0
844,Benin,0
851,Bhutan,0
895,Guinea-Bissau,0
973,Botswana,0
1343,Cape Verde,0
1367,Chad,0


Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [485]:
# Get count of articles per region by counting article_title
article_count_by_region = merged_df.groupby('region')['article_title'].count()

In [486]:
# Merge article counts and region/population together
region_per_capita = region_df.merge(article_count_by_region,
                                       left_on = 'region',
                                        right_on= 'region',
                                         how='inner' )

In [487]:
# Create calculated column for articles per capita
region_per_capita['articles_per_capita'] = region_per_capita['article_title'] / region_per_capita['population']

In [488]:
# Sort values by articles_per_capita in descending order and reset the index
rank_ordered_region = region_per_capita[['region', 'articles_per_capita']].sort_values(by='articles_per_capita', ascending=False).reset_index(drop=True)

In [489]:
print('Rank ordered list of geographic regions (in descending order) by total articles per capita (articles per million people)')
rank_ordered_region

Rank ordered list of geographic regions (in descending order) by total articles per capita (articles per million people)


Unnamed: 0,region,articles_per_capita
0,SOUTHERN EUROPE,5.236842
1,CARIBBEAN,4.954545
2,WESTERN EUROPE,2.497487
3,EASTERN EUROPE,2.480702
4,WESTERN ASIA,2.0301
5,NORTHERN EUROPE,1.768519
6,SOUTHERN AFRICA,1.757143
7,OCEANIA,1.555556
8,EASTERN AFRICA,1.374741
9,SOUTH AMERICA,1.333333


Geographic regions by high quality coverage: Rank ordered list of geographic regions (in descending order) by high quality articles per capita.

In [445]:
# Filter merged_df for article qualities in GA or FA
quality_df = merged_df[merged_df['article_quality'].isin(['GA', 'FA'])]

In [446]:
# Get counts of GA or FA articles for each region by counting article_title from filtered df, quality_df
high_quality_region_article_counts = quality_df.groupby('region').count()['article_title']

In [447]:
# Merge article counts and region/population together
high_quality_region_per_capita = region_df.merge(high_quality_region_article_counts,
                                       left_on = 'region',
                                        right_on= 'region',
                                         how='inner' )

In [448]:
# Create calculated column for articles per capita
high_quality_region_per_capita['high_quality_articles_per_capita'] = high_quality_region_per_capita['article_title'] / high_quality_region_per_capita['population']

In [490]:
# Sort values by high_quality_articles_per_capita in descending order and reset the index
high_quality_rank_ordered_region = high_quality_region_per_capita[['region', 'high_quality_articles_per_capita']].sort_values(by='high_quality_articles_per_capita', ascending=False).reset_index(drop=True)

In [491]:
print('Rank ordered list of geographic regions (in descending order) by high quality articles per capita (articles per million people)')
high_quality_rank_ordered_region

Rank ordered list of geographic regions (in descending order) by high quality articles per capita (articles per million people)


Unnamed: 0,region,high_quality_articles_per_capita
0,SOUTHERN EUROPE,0.342105
1,CARIBBEAN,0.204545
2,EASTERN EUROPE,0.133333
3,SOUTHERN AFRICA,0.114286
4,WESTERN EUROPE,0.105528
5,WESTERN ASIA,0.090301
6,NORTHERN EUROPE,0.083333
7,NORTHERN AFRICA,0.066406
8,CENTRAL ASIA,0.0625
9,CENTRAL AMERICA,0.054945
