In [32]:
"""
TODO: Fill this in.
"""

import csv
import os.path
import requests

# Path of the augmented page data CSV.  It is written by
# create_augmented_page_data() and read back by read_augmented_csv().
AUGMENTED_PAGE_DATA_PATH = 'data-512-a2.csv'

# Country names as they appear in the page data (keys), mapped to the names
# that get_country() substitutes for them before the population lookup
# (values).  Names that are not listed here are used unchanged.
COUNTRY_MAP = {
    "East Timorese" : "Timor-Leste",
    "Hondura" : "Honduras",
    "Rhodesian" : "Zimbabwe",
    "Salvadoran" : "El Salvador",
    "Samoan" : "Samoa",
    "São Tomé and Príncipe" : "Sao Tome and Principe",
    "Somaliland" : "Somalia",
    "South African Republic" : "South Africa",
    "South Korean" : "Korea, South"
}

# Default 'last_index' for get_quality_all().  Any value at or below this
# makes get_quality_all() use the full length of the page data.
DEFAULT_ROW_COUNT = 2

# Name of the ORES model that is requested, and under which each score is
# looked up in the response.
MODEL = 'wp10'

# Path of the input page data CSV (country, article name, revision ID).
PAGE_DATA_PATH = 'page_data.csv'

# Path of the input population CSV.
POPULATION_DATA_PATH = 'Population Mid-2015.csv'

# Number of page data rows (revision IDs) sent in one full ORES request.
PER_CALL = 140

# Name of the ORES project (wiki) that is requested, and under which the
# scores are looked up in the response.
PROJECT = 'enwiki'


def augment_page_data(page_data, quality_dictionary, population_dictionary):
    """
    Joins every page data row with its article quality and with the
    population of its country.

    The first row of the page data is treated as a header and is skipped.  The
    country of each remaining row is passed through get_country() before it is
    looked up.  A row whose revision ID has no quality entry is reported and
    left out.  A row whose country has no population entry is left out and
    tallied; the tally is printed once every row has been examined.

    @param page_data: The original page data; each row holds the country, the
    article name and the revision ID, in that order
    @type page_data: list
    @param quality_dictionary: Article quality, indexed by revision ID
    @type quality_dictionary: dict
    @param population_dictionary: Population, indexed by country
    @type population_dictionary: dict
    @return: A header row followed by one augmented row for every page data
    row that has both a quality and a population
    @rtype: list
    """

    # The output starts with its own header row.
    augmented_rows = [['country',
                       'article_name',
                       'revision_id',
                       'article_quality',
                       'population']]

    # Article counts for the countries that have no population entry.
    unmatched_countries = {}

    # Everything after the header row is data.
    for record in page_data[1:]:

        revision = record[2]
        country = get_country(record[0])

        # No quality for this revision: report it and move on.
        if revision not in quality_dictionary:
            print('Missing quality entry for revision ID \'%s\'.' %
                  revision)
            continue

        # No population for this country: count the article and move on.
        if country not in population_dictionary:
            unmatched_countries[country] = \
                unmatched_countries.get(country, 0) + 1
            continue

        # Both lookups will succeed, so emit the augmented row.
        augmented_rows.append([country,
                               record[1],
                               revision,
                               quality_dictionary[revision],
                               population_dictionary[country]])

    # Report the countries that could not be matched to a population.
    print('The following is the counts of articles about persons in countries '
          'that are missing a registered population: %s' % unmatched_countries)
    return augmented_rows


def build_country_to_population(country_data):
    """
    Maps each country in a population table to its population.

    The first three rows and the last row of the table are not treated as
    country rows and are ignored.  Thousands separators (commas) are removed
    from the population before it is converted to an integer.

    @param country_data: Rows with the country name as the first field, and
    the population as the fifth field
    @type country_data: list
    @return: A dictionary of country name to population
    @rtype: dict
    """

    # Only the rows between the three leading rows and the trailing row
    # describe countries.
    return {entry[0]: int(entry[4].replace(',', ''))
            for entry in country_data[3:-1]}


def create_augmented_page_data():
    """
    Builds the augmented page data and writes it to the augmented page data
    file.

    @return: None
    @rtype: None
    """

    # Load the original page data, and fetch a quality for its revisions.
    original_rows = read_from_csv(PAGE_DATA_PATH)
    quality_by_revision = get_quality_all(original_rows)

    # Load the population table, and index it by country.
    population_rows = read_from_csv(POPULATION_DATA_PATH)
    population_by_country = build_country_to_population(population_rows)

    # Combine the three sources, and store the result as CSV.
    augmented_rows = augment_page_data(original_rows,
                                       quality_by_revision,
                                       population_by_country)
    write_to_csv(AUGMENTED_PAGE_DATA_PATH, augmented_rows)


def get_article_quality(article_quality, revision_ids):
    """
    Gets predicted article quality for a series of revision IDs, and adds the
    predictions to an existing dictionary indexed by revision ID.  Possible
    values for each revision ID are:

    FA - Featured article
    GA - Good article
    B - B-class article
    C - C-class article
    Start - Start-class article
    Stub - Stub-class article

    A revision for which the service returns no score (for example an error
    entry) is left out of the dictionary rather than aborting the whole
    batch, so callers can detect it by its absence.

    @param article_quality: An existing dictionary of revision IDs to
    article quality; it is updated in place
    @type article_quality: dictionary
    @param revision_ids: A series of revision IDs
    @type revision_ids: list or tuple
    @return: article_quality, the same dictionary object that was passed in
    @rtype: dict
    @raise requests.RequestException: If the request fails, times out, or
    the service answers with an HTTP error status
    """

    # There is nothing to ask for, so skip the network round trip.
    if not revision_ids:
        return article_quality

    # Hardcoded endpoint for the ORES API
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # The parameters to be passed to the ORES API
    params = {'project': PROJECT,
              'model': MODEL,
              'revids': '|'.join(str(x) for x in revision_ids)
             }

    # Call the API.  The generous timeout only guards against a connection
    # that never answers.  Fail on an HTTP error status here instead of on
    # an unparseable or unexpected body later.
    api_call = requests.get(endpoint.format(**params), timeout=300)
    api_call.raise_for_status()
    response = api_call.json()

    # Record the prediction of every revision that actually received a
    # score.  Entries without a 'score' are skipped.
    for key, value in response[PROJECT]['scores'].items():
        model_result = value.get(MODEL, {})
        if 'score' not in model_result:
            continue
        article_quality[key] = model_result['score']['prediction']
    return article_quality


def get_country(country):
    """
    Translates a country name through the country map.

    @param country: A given country
    @type country: str
    @return: The name that the country map holds for the given country, or
    the given country itself if the map has no entry for it
    @rtype: str
    """

    # Fall back to the name as given when the map has no entry for it.
    return COUNTRY_MAP.get(country, country)


def get_quality_all(page_data, last_index=DEFAULT_ROW_COUNT):
    """
    Gets article quality for all revision IDs in a page data list, up
    to a given maximum.

    The first row of the page data is treated as a header and is never sent.
    The remaining rows are requested in batches of at most PER_CALL rows, and
    a progress message is printed before each batch.

    @param page_data: A page data list, formatted with revision ID as the
    third element in each row
    @type page_data: list or tuple
    @param last_index: The index at which to stop, exclusive.  A value at or
    below DEFAULT_ROW_COUNT, or beyond the end of the page data, selects the
    full length of the page data
    @type last_index: int
    @return: A dictionary of article quality indexed by revision ID
    @rtype: dict
    """

    # Use the full length of the page data if the last index is less than
    # a minimum number of rows, or if it points past the end of the data.
    row_total = len(page_data)
    if last_index <= DEFAULT_ROW_COUNT or last_index > row_total:
        last_index = row_total

    # Declare and initialize the quality dictionary.
    quality_dictionary = {}

    # Walk the data rows in batches, starting after the header row.  Deriving
    # each batch from its own start index, and capping its end at the last
    # index, keeps the final batch inside the data even when the last index
    # is an exact multiple of the batch size.
    for base in range(1, last_index, PER_CALL):

        # Calculate the exclusive end of this batch, and print a message.
        stop = min(base + PER_CALL, last_index)
        print('Retrieving quality rating for articles %d to %d...'
              % (base, stop - 1))

        # Update the quality dictionary.
        quality_dictionary = make_quality_call(quality_dictionary,
                                               page_data,
                                               base,
                                               stop)

    # Describe how long the dictionary is, and return it.
    print('Length of quality dictionary is %d' % len(quality_dictionary))
    return quality_dictionary


def make_quality_call(existing_dictionary, page_data, start, stop):
    """
    Requests article quality for one contiguous range of page data rows.

    @param existing_dictionary: An existing dictionary of quality entries
    indexed by revision ID
    @type existing_dictionary: dictionary
    @param page_data: A page data list, formatted with revision ID as the
    third element in each row
    @type page_data: list or tuple
    @param start: The first index to use, inclusive
    @type start: int
    @param stop: The last index, exclusive
    @type stop: int
    @return: The value returned by get_article_quality() for the existing
    dictionary and the selected revision IDs
    @rtype: dict
    """

    # Pick the revision ID (third field) out of every row in the range.
    selected_ids = [page_data[position][2]
                    for position in range(start, stop)]

    # Get article quality for the selected revision IDs.
    return get_article_quality(existing_dictionary, selected_ids)


def read_augmented_csv():
    """
    Reads the rows of the augmented page data file, creating the file first
    if it is not there yet.

    @return: The rows read from the file
    @rtype: list
    """

    # Build the augmented page data file only when it is missing.
    already_built = os.path.isfile(AUGMENTED_PAGE_DATA_PATH)
    if not already_built:
        create_augmented_page_data()

    # Hand back whatever the file now contains.
    augmented_rows = read_from_csv(AUGMENTED_PAGE_DATA_PATH)
    return augmented_rows


def read_from_csv(file_name):
    """
    Reads fields from a CSV file.

    @param file_name: A file path.
    @type file_name: str
    @return: The rows read from the file, each row being a list of strings
    @rtype: list
    """

    # Open the file with newline='' so that the CSV reader, and not the file
    # object, interprets line endings.  Without it, line breaks embedded in
    # quoted fields are altered while reading.
    with open(file_name, newline='') as csvfile:

        # Collect every row produced by the reader, and return the rows.
        return list(csv.reader(csvfile))


def write_to_csv(file_name, row_list):
    """
    Writes fields to a CSV file, replacing any existing content.

    @param file_name: A file path.
    @type file_name: str
    @param row_list: The rows to write to the file
    @type row_list: list
    """

    # Open the file with newline='' so that the line endings emitted by the
    # CSV writer are stored untouched.  Without it, platforms that translate
    # '\n' to '\r\n' end every row with '\r\r\n', which shows up as a blank
    # line after each row.
    with open(file_name, 'w', newline='') as csvfile:

        # Write the given rows.
        writer = csv.writer(csvfile)
        writer.writerows(row_list)


# Load the augmented page data (read_augmented_csv() creates the file first
# if it does not exist yet), and print every row, header included.
augmented_page_data = read_augmented_csv()
for row in augmented_page_data:
    print(row)


['country', 'article_name', 'revision_id', 'article_quality', 'population']
['Afghanistan', 'Laghman Province', '778690357', 'Start', '32247000']
['Afghanistan', 'Roqia Abubakr', '779839643', 'Stub', '32247000']
['Afghanistan', 'Sitara Achakzai', '803055503', 'GA', '32247000']
['Afghanistan', 'Khadija Ahrari', '805920528', 'GA', '32247000']
['Afghanistan', 'Rahila Bibi Kobra Alamshahi', '717743144', 'Stub', '32247000']
['Afghanistan', 'Shahla Ata', '787347770', 'C', '32247000']
['Afghanistan', 'Salamat Azimi', '759405630', 'Start', '32247000']
['Afghanistan', 'Shukria Barakzai', '787258680', 'Start', '32247000']
['Afghanistan', 'Delbar Nazari', '805967589', 'B', '32247000']
['Afghanistan', 'Maryam Durani', '795223533', 'C', '32247000']
['Afghanistan', 'Masuma Esmati-Wardak', '779389809', 'Stub', '32247000']
['Afghanistan', 'Frozan Fana', '789039267', 'Stub', '32247000']
['Afghanistan', 'Fauzia Gailani', '799980094', 'C', '32247000']
['Afghanistan', 'Husn Banu Ghazanfar', '779389407', '