In [1]:
"""
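Augments Wikipedia page data with predicted article quality (retrieved from
the ORES API) and country populations, writes the result to a CSV file, and
prints per-country statistics ranked by articles per population and by the
percentage of high-quality articles.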
"""

import csv
import os.path
import requests

# Path of the augmented page data CSV file, which this script creates when it
# does not exist and reads back to compute per-country statistics.
AUGMENTED_PAGE_DATA_PATH = 'data-512-a2.csv'

# Maps country names as they appear in the page data to the names that are
# looked up in the population dictionary (applied by get_country).
COUNTRY_MAP = {
    "East Timorese" : "Timor-Leste",
    "Hondura" : "Honduras",
    "Rhodesian" : "Zimbabwe",
    "Salvadoran" : "El Salvador",
    "Samoan" : "Samoa",
    "São Tomé and Príncipe" : "Sao Tome and Principe",
    "Somaliland" : "Somalia",
    "South African Republic" : "South Africa",
    "South Korean" : "Korea, South"
}

# Miscellaneous Constants
# DEFAULT_ROW_COUNT: get_quality_all treats a last index at or below this
#   value as "use the full length of the page data".
# MODEL / PROJECT: substituted into the ORES request URL, and used as keys
#   when reading the JSON response.
# PAGE_DATA_PATH / POPULATION_DATA_PATH: the input CSV files.
# PER_CALL: the number of page data rows sent to ORES in one full request.
DEFAULT_ROW_COUNT = 2
MODEL = 'wp10'
PAGE_DATA_PATH = 'page_data.csv'
POPULATION_DATA_PATH = 'Population Mid-2015.csv'
PER_CALL = 140
PROJECT = 'enwiki'

# Augmented Page Data Fields
# Column indices of a row in the augmented page data, in the order written by
# augment_page_data.
AUG_COUNTRY = 0
AUG_PAGE = 1
AUG_REVISION_ID = 2
AUG_QUALITY = 3
AUG_POPULATION = 4

# Count Fields
# Indices into the per-country statistics list built by
# create_country_dictionary.
CNT_COUNTRY = 0
CNT_POPULATION = 1
CNT_ARTICLES = 2
CNT_ARTICLE_PCT = 3
CNT_HQ_ARTICLES = 4
CNT_HQ_ARTICLE_PCT = 5

# Page Data Fields
# Column indices of a row in the original page data.
PDT_COUNTRY = 0
PDT_PAGE = 1
PDT_REVISION_ID = 2

# Population Fields
# Column indices of a row in the population data.  Only POP_COUNTRY and
# POP_DATA are read by the code in this file.
POP_COUNTRY = 0
POP_CNT_TYPE = 1
POP_TIMEFRAME = 2
POP_DATATYPE = 3
POP_DATA = 4
POP_FOOTNOTES = 5


def augment_page_data(page_data, quality_dictionary, population_dictionary):
    """
    Joins every page data row with the predicted quality of its article and
    the population of its (mapped) country.

    The first row of the page data is treated as a header and is not
    processed.  A row whose revision ID has no quality entry is reported and
    left out.  A row whose country has no population entry is left out and
    counted; the per-country counts are printed before returning.

    @param page_data: The original page data
    @type page_data: list
    @param quality_dictionary: An article quality dictionary, indexed by
    revision ID
    @type quality_dictionary: dict
    @param population_dictionary: A population dictionary, indexed by country
    @type population_dictionary: dict
    @return: A header row followed by the augmented page data rows
    @rtype: list
    """

    # The output starts with its header row; 'unmatched' counts the articles
    # of each country that has no population entry.
    augmented = [['country',
                  'article_name',
                  'revision_id',
                  'article_quality',
                  'population']]
    unmatched = {}

    # Walk the data rows, skipping the header row.
    for record in page_data[1:]:
        revision = record[PDT_REVISION_ID]
        country = get_country(record[PDT_COUNTRY])

        # No quality prediction for this revision: report it and move on.
        if revision not in quality_dictionary:
            print("Missing quality entry for revision ID '%s'." % revision)
            continue

        # Known country: emit the augmented row.  Unknown country: tally it.
        if country in population_dictionary:
            augmented.append([country,
                              record[PDT_PAGE],
                              revision,
                              quality_dictionary[revision],
                              population_dictionary[country]])
        else:
            unmatched[country] = unmatched.get(country, 0) + 1

    # Report the countries that could not be matched to a population.
    print('The following is the counts of articles about persons in countries '
          'that are missing a registered population: %s' % unmatched)
    return augmented


def build_country_to_population(country_data):
    """
    Creates a lookup from country name to population.

    The first three rows and the last row of the country data are not used.
    Thousands separators (commas) are removed from the population field
    before it is converted to an integer.

    @param country_data: A list of countries with name as the first field,
    and population as the fifth field
    @type country_data: list
    @return: A dictionary of countries to their population
    @rtype: dict
    """

    populations = {}

    # Only the rows between the three leading rows and the final row are
    # country records.
    for record in country_data[3:-1]:
        head_count = int(record[POP_DATA].replace(',', ''))
        populations[record[POP_COUNTRY]] = head_count

    return populations


def calculate_percentages(counts):
    """
    Fills in, in place, the articles-per-population percentage and the
    high-quality-article percentage of every entry in a country collection.

    A percentage is left unchanged when its denominator is not greater than
    zero.

    @param counts: A country dictionary, list or tuple
    @type counts: dict, list or tuple
    @return: None
    @rtype: None
    """

    # Factor that turns a ratio into a percentage.
    scale = 100.

    for key in counts:

        # Pull the statistics entry, and the two figures both percentages
        # depend on.
        stats = counts[key]
        articles = stats[CNT_ARTICLES]
        people = stats[CNT_POPULATION]

        # Articles as a percentage of population.
        if people > 0:
            stats[CNT_ARTICLE_PCT] = articles / people * scale

        # High-quality articles as a percentage of all articles.
        if articles > 0:
            stats[CNT_HQ_ARTICLE_PCT] = stats[CNT_HQ_ARTICLES] / articles * scale


def create_augmented_page_data():
    """
    Creates the augmented page data file.

    Reads the page data and the population data from CSV, retrieves the
    article quality for every row of the page data, and writes the combined
    rows to the augmented page data file.

    @return: None
    @rtype: None
    """

    # Read the page data from CSV.
    page_data = read_from_csv(PAGE_DATA_PATH)

    # Create the page quality map for ALL rows.  No row limit is passed: a
    # limit left over from debugging would truncate the generated file to
    # the first rows of the page data.
    quality_dictionary = get_quality_all(page_data)

    # Create the country-to-population map.
    population_dictionary = build_country_to_population(
        read_from_csv(POPULATION_DATA_PATH))

    # Using all of these, create the augmented page data and write it to CSV.
    write_to_csv(AUGMENTED_PAGE_DATA_PATH,
                 augment_page_data(page_data,
                                   quality_dictionary,
                                   population_dictionary))


def create_country_dictionary():
    """
    Creates a dictionary of countries, and statistics about them.
    Precondition: The augmented page data file exists, and is formatted
    correctly.

    Each value is a list with the following fields:

    CNT_COUNTRY
    CNT_POPULATION
    CNT_ARTICLES
    CNT_ARTICLE_PCT
    CNT_HQ_ARTICLES
    CNT_HQ_ARTICLE_PCT

    An article counts as high-quality when its quality is 'FA' or 'GA'.  An
    empty file yields an empty dictionary, and blank rows are ignored.

    @return: A dictionary of countries, and statistics about them
    @rtype: dict
    """

    country_dictionary = {}

    # Skip the header row.  Slicing (rather than deleting index 0) keeps an
    # empty file from raising an IndexError.
    for data_row in read_augmented_csv()[1:]:

        # The CSV reader yields an empty list for a blank line; there is
        # nothing to count in such a row.
        if not data_row:
            continue

        # Get the existing entry for the country, or create one with initial
        # values taken from the first row seen for that country.
        country = data_row[AUG_COUNTRY]
        country_row = country_dictionary.get(country)
        if country_row is None:
            country_row = [country, int(data_row[AUG_POPULATION]),
                           0, 0., 0, 0.]
            country_dictionary[country] = country_row

        # Increment the count of articles for the given country.
        country_row[CNT_ARTICLES] += 1

        # Increment the count of high-quality articles if the article has a
        # high-quality rating.
        if data_row[AUG_QUALITY] in ('FA', 'GA'):
            country_row[CNT_HQ_ARTICLES] += 1

    # Calculate the percentage of articles per population, and the percentage
    # of high-quality articles.
    calculate_percentages(country_dictionary)
    return country_dictionary


def get_article_percentage(country):
    """
    Sort key: reads the articles-per-population percentage out of a country
    statistics entry.

    @param country: A country attributes entry
    @type country: list
    @return: Percentage of articles to population
    @rtype: float
    """
    article_percentage = country[CNT_ARTICLE_PCT]
    return article_percentage


def get_article_quality(article_quality, revision_ids):
    """
    Gets predicted article quality for a series of revision IDs.  Updates and
    returns a dictionary indexed by revision ID.  Possible values for each
    revision ID are:

    FA - Featured article
    GA - Good article
    B - B-class article
    C - C-class article
    Start - Start-class article
    Stub - Stub-class article

    A revision for which the response carries no score is left out of the
    dictionary instead of aborting the whole batch.  No request is made when
    there are no revision IDs.

    @param article_quality: An existing dictionary of revision IDs to
    article quality; it is updated in place
    @type article_quality: dictionary
    @param revision_ids: A series of revision IDs
    @type revision_ids: list or tuple
    @return: article_quality
    @rtype: dict
    @raise requests.HTTPError: If the API answers with an error status
    """

    # Nothing to ask for: skip the request entirely.
    revids = [str(x) for x in revision_ids]
    if not revids:
        return article_quality

    # Hardcoded endpoint for the ORES API
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # The parameters to be passed to the ORES API
    params = {'project': PROJECT,
              'model': MODEL,
              'revids': '|'.join(revids)
             }

    # Call the API.  Fail with a clear HTTP error rather than with a KeyError
    # on an unexpected body, then decode the response as JSON.
    api_call = requests.get(endpoint.format(**params))
    api_call.raise_for_status()
    response = api_call.json()

    # Add the prediction of every scored revision to the dictionary.  Entries
    # without a 'score' are skipped, so those revision IDs simply remain
    # absent from the dictionary.
    for revision_id, models in response[PROJECT]['scores'].items():
        score = models.get(MODEL, {}).get('score')
        if score is not None:
            article_quality[revision_id] = score['prediction']
    return article_quality


def get_country(country):
    """
    Translates a country name through the country map.

    @param country: A given country
    @type country: str
    @return: The mapped country name if the given name is in the country
    map, the given name unchanged otherwise
    @rtype: str
    """

    # Fall back to the name itself when the map has no entry for it.
    return COUNTRY_MAP.get(country, country)


def get_hq_article_percentage(country):
    """
    Sort key: reads the high-quality-article percentage out of a country
    statistics entry.

    @param country: A country attributes entry
    @type country: list
    @return: Percentage of high-quality articles
    @rtype: float
    """
    hq_percentage = country[CNT_HQ_ARTICLE_PCT]
    return hq_percentage


def get_quality_all(page_data, last_index=DEFAULT_ROW_COUNT):
    """
    Gets article quality for all revision IDs in a page data list, up
    to a given maximum.

    Row 0 of the page data is a header and is never requested.  The rows
    1 .. last_index - 1 are retrieved in batches of at most PER_CALL rows.

    @param page_data: A page data list, formatted with revision ID as the
    third element in each row
    @type page_data: list or tuple
    @param last_index: The last index to consider, exclusive.  A value at or
    below DEFAULT_ROW_COUNT, or beyond the end of the page data, selects the
    full length of the page data
    @type last_index: int
    @return: article_quality
    @rtype: dict
    """

    # Use the full length of the page data if the last index is less than
    # a minimum number of rows, or if it points past the end of the data.
    row_count = len(page_data)
    if last_index <= DEFAULT_ROW_COUNT or last_index > row_count:
        last_index = row_count

    # Declare and initialize the quality dictionary.
    quality_dictionary = {}

    # Step through the rows one batch at a time.  Each batch stops at
    # whichever comes first: a full batch, or the last index.  Capping every
    # batch at the last index keeps a final full batch from reading one row
    # beyond it when the last index is a multiple of the batch size.
    for base in range(1, last_index, PER_CALL):

        # Calculate the exclusive end of the batch, and print a message.
        stop = min(base + PER_CALL, last_index)
        print('Retrieving quality rating for articles %d to %d...'
              % (base, stop - 1))

        # Update the quality dictionary.
        quality_dictionary = make_quality_call(quality_dictionary,
                                               page_data,
                                               base,
                                               stop)

    # Describe how long the dictionary is, and return it.
    print('Length of quality dictionary is %d' % len(quality_dictionary))
    return quality_dictionary


def make_quality_call(existing_dictionary, page_data, start, stop):
    """
    Requests article quality for the page data rows in a given index range,
    and adds the results to an existing quality dictionary.

    @param existing_dictionary: An existing dictionary of quality entries
    indexed by revision ID
    @type existing_dictionary: dictionary
    @param page_data: A page data list, formatted with revision ID as the
    third element in each row
    @type page_data: list or tuple
    @param start: The first index to use, inclusive
    @type start: int
    @param stop: The last index, exclusive
    @type stop: int
    @return: article_quality
    @rtype: dict
    """

    # Collect the revision ID of every row in the half-open range.
    revision_ids = [page_data[position][PDT_REVISION_ID]
                    for position in range(start, stop)]

    # Delegate the lookup for the collected revision IDs.
    return get_article_quality(existing_dictionary, revision_ids)


def read_augmented_csv():
    """
    Loads the rows of the augmented page data file, generating the file first
    when it is not present.

    @return: The rows read from the file
    @rtype: list
    """

    # Generate the file on demand.
    already_built = os.path.isfile(AUGMENTED_PAGE_DATA_PATH)
    if not already_built:
        create_augmented_page_data()

    # Hand back whatever the file now contains.
    rows = read_from_csv(AUGMENTED_PAGE_DATA_PATH)
    return rows


def read_from_csv(file_name):
    """
    Reads fields from a CSV file.

    @param file_name: A file path.
    @type file_name: str
    @return: The rows read from the file, each row a list of strings
    @rtype: list
    """

    # Open the file with newline='' as the csv module requires, so that the
    # reader does its own line-ending handling and newlines embedded in
    # quoted fields come back unaltered.
    with open(file_name, newline='') as csvfile:

        # Collect and return every row read by the reader.
        return list(csv.reader(csvfile))


def write_to_csv(file_name, row_list):
    """
    Writes fields to a CSV file, replacing any existing content.

    @param file_name: A file path.
    @type file_name: str
    @param row_list: The rows to write to the file
    @type row_list: list
    """

    # Open the file with newline='' as the csv module requires.  The writer
    # emits its own '\r\n' line endings; without newline='' a platform that
    # translates '\n' to '\r\n' turns them into '\r\r\n', which reads back
    # as a blank row after every record.
    with open(file_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(row_list)


# Build the country statistics once, and reuse them for both rankings, rather
# than re-reading and re-aggregating the augmented page data for each one.
_country_rows = list(create_country_dictionary().values())

# Countries ranked by articles per population, highest first.
print(sorted(_country_rows,
             key=get_article_percentage,
             reverse=True))

# Countries ranked by percentage of high-quality articles, highest first.
print(sorted(_country_rows,
             key=get_hq_article_percentage,
             reverse=True))


[['Tuvalu', 11800, 55, 0.46610169491525427, 0, 0.0], ['Nauru', 10860, 49, 0.45119705340699817, 0, 0.0], ['San Marino', 33000, 87, 0.2636363636363636, 0, 0.0], ['Monaco', 38088, 43, 0.11289645032556185, 0, 0.0], ['Liechtenstein', 37570, 29, 0.07718924673941975, 2, 6.896551724137931], ['Marshall Islands', 55000, 37, 0.06727272727272728, 1, 2.7027027027027026], ['Iceland', 330828, 206, 0.06226800633561851, 2, 0.9708737864077669], ['Tonga', 103300, 63, 0.06098741529525654, 1, 1.5873015873015872], ['Andorra', 78000, 34, 0.04358974358974359, 0, 0.0], ['Samoa', 194210, 78, 0.04016271046805005, 1, 1.282051282051282], ['Federated States of Micronesia', 103000, 38, 0.036893203883495145, 0, 0.0], ['Grenada', 111000, 36, 0.032432432432432434, 0, 0.0], ['Luxembourg', 569202, 180, 0.0316232198762478, 2, 1.1111111111111112], ['Kiribati', 113400, 33, 0.0291005291005291, 0, 0.0], ['Antigua and Barbuda', 90000, 25, 0.027777777777777776, 0, 0.0], ['Malta', 431486, 103, 0.023870994655678282, 0, 0.0], ['Se