# Data Collection
The code in this file collects pageview information from the Wikimedia API for Wikipedia articles about Academy Award winners. The data is split into three files, one per access type: desktop, mobile, and the combined (all-access) data. Additionally, this file preprocesses the data before saving the files. At the time of this writing, the dates of interest range from July 2015 to the end of September 2023.

### Endpoints and Constants
 The code in the following cell is adapted from code found [here](https://drive.google.com/file/d/1XjFhd3eXx704tcdfQ4Q1OQn0LWKCRNJm/view). This cell lays out some constants and endpoints for collecting the data from the Wikimedia API. The article names are collected from the `thank_the_academy.AUG.2023.csv` file.

In [None]:
############################################################################################################## 
# Adapted from https://drive.google.com/file/d/1XjFhd3eXx704tcdfQ4Q1OQn0LWKCRNJm/view under creative Commons # 
# Creative Commons https://creativecommons.org/licenses/by/4.0/                                              #
##############################################################################################################

# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests
import pandas as pd

#########
#
#    CONSTANTS
#

# The REST API 'pageviews' URL - this is the common URL/endpoint for all 'pageviews' API requests
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# This is a parameterized string that specifies what kind of pageviews request we are going to make
# In this case it will be a 'per-article' based request. The string is a format string so that we can
# replace each parameter with an appropriate value before making the request
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, so we add a small delay before each request.
# The wait is the per-request budget (1/100 s) minus the latency we assume the request itself already costs.
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making a request to the Wikimedia API they ask that you include your email address which will allow them
# to contact you if something happens - such as - your code exceeding rate limits - or some other error
REQUEST_HEADERS = {
    'User-Agent': 'cjault@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# Load the table of target articles from the CSV file. The path is relative to the
# current working directory, and the read happens as soon as this cell runs.
targets_df = pd.read_csv('data-512-homework_1/thank_the_academy.AUG.2023.csv')

# The article titles to request pageviews for, taken from the 'name' column of the CSV
ARTICLE_TITLES = targets_df['name'].tolist()

# This template is used to map parameter values into the API_REQUEST_PER_ARTICLE_PARAMS portion of an API request. The dictionary has a
# field/key for each of the required parameters. Only the article name and the access type vary between requests, so the
# remaining fields stay constant. Of course, these values *could* be changed if necessary.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "desktop",      # default access type; overridden per request via access_type
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # YYYYMMDDHH - 1 July 2015
    "end":         "2023093000"    # YYYYMMDDHH - 30 September 2023
}

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None,
                                  access_type = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request pageview data for one article from the Wikimedia Pageviews API.

    Parameters
    ----------
    article_title : str, optional
        Title of the article. If omitted, the 'article' value already present
        in `request_template` is used.
    access_type : str, optional
        Access type to request (e.g. 'desktop', 'mobile-app', 'mobile-web',
        'all-access'). If omitted, the 'access' value in `request_template`
        is used.
    endpoint_url : str
        Base URL of the pageviews endpoint.
    endpoint_params : str
        Format string for the per-article portion of the URL.
    request_template : dict
        Values for every placeholder in `endpoint_params`. It is only read;
        the caller's dictionary is never modified.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    dict or None
        The decoded JSON body of the response, or None if the request or the
        JSON decoding raised an exception (the exception is printed).

    Raises
    ------
    Exception
        If no article title is supplied either as an argument or in
        `request_template`.
    """
    # Work on a copy: the default template is a module-level dict shared by
    # every call, so writing into it would leak the (already encoded) title
    # and the access type from one request into the next.
    request_params = dict(request_template)

    # article title and access type can be parameters to the call or come from the template
    if article_title:
        request_params['article'] = article_title
    if access_type:
        request_params['access'] = access_type # desktop, mobile-app, mobile-web, all-access

    if not request_params['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' makes quote() escape "/" as well; otherwise a title containing a
    # slash would be split into extra path segments of the request URL.
    request_params['article'] = urllib.parse.quote(
        request_params['article'].replace(' ', '_'), safe='')

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**request_params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response
#############################################################################################################


: 

### Data Collection and Storage
The following cell collects the API responses for the articles for both mobile and desktop views. The data is then preprocessed by dropping unnecessary columns and adjusting the timestamps for usability later. Results are stored in files called `academy_monthly_<access_type>_<startdate>-<enddate>.json`.

In [None]:
def _items_to_frame(items):
    """Turn a list of API item dicts into a DataFrame ready for saving.

    Drops the 'access' column and parses the 'timestamp' strings into
    datetimes.
    """
    frame = pd.DataFrame(items)
    frame = frame.drop(['access'], axis=1)
    # Timestamps are 10-digit YYYYMMDDHH strings (same layout as the
    # start/end request parameters), so the format has no minutes field.
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%Y%m%d%H')
    return frame


data_desktop = []
data_mobile = []
data_total = []

for article in ARTICLE_TITLES:
    print(article)
    # Reset per article so the failure message never shows a previous article's response
    view_desktop = None
    views_total = None
    try:
        view_desktop = request_pageviews_per_article(article, 'desktop')
        views_total = request_pageviews_per_article(article, 'all-access')
        desktop_items = view_desktop['items']
        total_items = views_total['items']

        # Mobile views are derived as all-access views minus desktop views.
        # Match the two series on timestamp rather than list position so a
        # month missing from the desktop series cannot shift the pairing;
        # such a month counts as zero desktop views.
        desktop_views_by_month = {item['timestamp']: item['views'] for item in desktop_items}
        # Each mobile record is a fresh dict, so adjusting 'views' does not
        # alter the record stored in data_total.
        mobile_items = [
            {**item, 'views': item['views'] - desktop_views_by_month.get(item['timestamp'], 0)}
            for item in total_items
        ]
    except Exception as err:
        # Best effort: report the article and carry on with the next one.
        # Nothing has been appended yet, so a failed article leaves no partial rows.
        print(f'failed for {article}')
        print(view_desktop)
        print(err)
        continue

    data_desktop.extend(desktop_items)
    data_total.extend(total_items)
    data_mobile.extend(mobile_items)

df_desktop = _items_to_frame(data_desktop)
df_mobile = _items_to_frame(data_mobile)
df_total = _items_to_frame(data_total)

# One JSON record per line for each access type
df_desktop.to_json('academy_monthly_desktop_201507-202309.json', orient='records', lines=True)
df_mobile.to_json('academy_monthly_mobile_201507-202309.json', orient='records', lines=True)
df_total.to_json('academy_monthly_cumulative_201507-202309.json', orient='records', lines=True)




: 