# Step 1: Data Acquisition ##
## Obtaining data from API
We access page view data using the [Wikimedia REST API](https://www.mediawiki.org/wiki/Wikimedia_REST_API). We request monthly counts of page views for multiple articles. 


In [1]:
# These are standard python modules
import json, time, urllib.parse, pandas as pd
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

We declare all necessary constants for the request

In [2]:
#########
#
#    CONSTANTS
#

# Base URL shared by every 'pageviews' request made against the Wikimedia REST API
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# Path template for a 'per-article' pageviews request. It is a format string: every {placeholder}
# is filled in from a parameter dictionary right before the request is sent
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, so every request is preceded
# by a short pause: the per-request time budget minus the latency we assume the call itself takes
API_MAX_REQUESTS_PER_SECOND = 100.0
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = 1.0 / API_MAX_REQUESTS_PER_SECOND - API_LATENCY_ASSUMED

# The Wikimedia API asks callers to identify themselves with a "unique ID" so that they can be
# contacted if something goes wrong - for example when the code exceeds the request limits
REQUEST_HEADERS = {'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'}

# Values substituted into API_REQUEST_PER_ARTICLE_PARAMS when building a request. There is one
# key per placeholder in that format string. Only the article name (and, between runs, the access
# type) varies here; the remaining values stay constant but *could* be changed if necessary.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = dict(
    project="en.wikipedia.org",
    access="desktop",          # this should be changed for the different access types
    agent="user",
    article="",                # this value will be set/changed before each request
    granularity="monthly",
    start="2015070100",
    end="2022100100",          # this is likely the wrong end date
)


We define the function necessary to make the request to the API

In [3]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request the pageview counts of one article from the Pageviews API.

    Args:
        article_title: Title of the article; spaces are replaced with "_" and
            the result is URL encoded before it is placed in the request URL.
        endpoint_url: Base URL the request path is appended to.
        endpoint_params: Format string for the request path; its placeholders
            are filled from ``request_template``.
        request_template: Mapping with one value per placeholder. It is read
            at call time and is NOT modified; the encoded title is put into a
            copy under the 'article' key.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the response, or None when no title was
        given or when the request/decoding raised an exception (the exception
        is printed, not re-raised).
    """
    # Make sure we have an article title
    if not article_title:
        return None

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' is needed because quote() leaves "/" unescaped by default, which
    # would split a title containing a slash into extra URL path segments.
    article_title_encoded = urllib.parse.quote(article_title.replace(' ', '_'), safe='')

    # Fill in a copy so the caller's (by default module-level, shared) template
    # is not changed as a side effect of making a request
    request_params = dict(request_template)
    request_params['article'] = article_title_encoded

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**request_params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None
        print(e)
        json_response = None
    return json_response


We take the list of dinosaur names that we want to search for from the input CSV file, which must be located in the 'data_raw' folder. If a name contains typographic (curly) quotes, this procedure replaces them with straight quotes so that the name is read correctly.

In [4]:
dinosaur_names = pd.read_csv('../data_raw/dinosaur_genera.cleaned.SEPT.2022 - dinosaur_genera.cleaned.SEPT.2022.csv')
# Quotes coming from Excel are read as typographic (curly) double quotes, so we
# replace both the opening and the closing form with a straight double quote.
# Every name is translated, so a name that only contains the closing quote is
# corrected as well.
CURLY_TO_STRAIGHT_QUOTES = str.maketrans({'“': '"', '”': '"'})
dinosaur_names_list = [
    name.translate(CURLY_TO_STRAIGHT_QUOTES)
    for name in dinosaur_names['name'].tolist()
]

For the desktop, mobile, and monthly cumulative views, we retrieve the data from the API using the function declared previously. We then partially clean the data by removing the 'access' field, which is unnecessary for our analysis. Finally, we save the data into JSON files located in the 'data_clean' folder.

In [5]:
# Desktop data: one request per dinosaur with the access type set to "desktop"
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["access"] = "desktop"
desktop_dinosaur_dict = {}
for dinosaur_name in dinosaur_names_list:
    # NOTE(review): the request function returns None when the request fails,
    # in which case the ['items'] lookup below raises
    pageviews_response = request_pageviews_per_article(dinosaur_name)
    months = pageviews_response['items']
    for month in months:
        # The 'access' field is not needed for the analysis, so it is dropped
        month.pop('access')
    # Store the monthly records under the dinosaur's name
    desktop_dinosaur_dict[dinosaur_name] = months

# The file name carries the first six characters of the requested start and end values
desktop_start_month = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["start"][:6]
desktop_end_month = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["end"][:6]
desktop_json_path = '../data_clean/dino_monthly_desktop_{}-{}.json'.format(desktop_start_month, desktop_end_month)
with open(desktop_json_path, 'w', encoding='utf-8') as f:
    # Serialize the dictionary as JSON
    json.dump(desktop_dinosaur_dict, f, ensure_ascii=False, indent=4)
    

For mobile data, we need to retrieve both mobile-web and mobile-app data; we then sum the two so that there is a single count for all mobile activity.

In [30]:
# Mobile data: mobile-web and mobile-app are requested separately and their
# monthly views are added together into a single mobile count
mobile_dinosaur_dict={}
for dinosaur_name in dinosaur_names_list:
    ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["access"]="mobile-web"
    mobile_web_months = request_pageviews_per_article(dinosaur_name)['items']
    ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["access"]="mobile-app"
    mobile_app_months = request_pageviews_per_article(dinosaur_name)['items']
    # Both responses must describe the same months, otherwise the element-wise
    # sum below would add up unrelated months. These checks raise explicitly
    # (instead of using assert) so that they still run under "python -O".
    if len(mobile_web_months) != len(mobile_app_months):
        raise ValueError(
            'Different number of months for {!r}: mobile-web has {}, mobile-app has {}'.format(
                dinosaur_name, len(mobile_web_months), len(mobile_app_months)))
    for web_month, app_month in zip(mobile_web_months, mobile_app_months):
        # We make sure that we are summing over the same month
        if web_month['timestamp'] != app_month['timestamp']:
            raise ValueError(
                'Month mismatch for {!r}: mobile-web {} vs mobile-app {}'.format(
                    dinosaur_name, web_month['timestamp'], app_month['timestamp']))
        # We sum mobile-web and mobile-app views; the mobile-web record is reused for the total
        web_month['views'] = web_month['views'] + app_month['views']
        # We remove the 'access' field
        del web_month['access']
    mobile_months = mobile_web_months
    # We assign the monthly information to the corresponding dino key inside the dictionary
    mobile_dinosaur_dict[dinosaur_name] = mobile_months
with open('../data_clean/dino_monthly_mobile_{}-{}.json'.format(ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["start"][:6],ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["end"][:6]), 'w', encoding='utf-8') as f:
    # We save and transform the dictionary into JSON
    json.dump(mobile_dinosaur_dict, f, ensure_ascii=False, indent=4)

The API documentation mentions the possibility of extracting "all-access" data, which is already the sum of desktop and mobile views. To improve readability and simplicity, we use this instead of summing the two previous JSON files. We additionally calculate the requested cumulative sum and store this value in the 'views' field.

In [36]:
# Monthly cumulative data: "all-access" views, turned into a running total per dinosaur
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["access"] = "all-access"
all_access_dinosaur_dict = {}
for dinosaur_name in dinosaur_names_list:
    pageviews_response = request_pageviews_per_article(dinosaur_name)
    months = pageviews_response['items']
    # Running total of views, restarted for every dinosaur
    cum_sum = 0
    for month in months:
        # Add this month's views to the total, then overwrite 'views' with the total so far
        cum_sum = cum_sum + month['views']
        month['views'] = cum_sum
        # The 'access' field is not needed for the analysis, so it is dropped
        month.pop('access')
    # Store the cumulative monthly records under the dinosaur's name
    all_access_dinosaur_dict[dinosaur_name] = months

# The file name carries the first six characters of the requested start and end values
cumulative_start_month = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["start"][:6]
cumulative_end_month = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE["end"][:6]
cumulative_json_path = '../data_clean/dino_monthly_cumulative_{}-{}.json'.format(cumulative_start_month, cumulative_end_month)
with open(cumulative_json_path, 'w', encoding='utf-8') as f:
    json.dump(all_access_dinosaur_dict, f, ensure_ascii=False, indent=4)
