# English Wikipedia page views, 2008 - 2017

For this assignment, your job is to analyze traffic on English Wikipedia over time, and then document your process and the resulting dataset and visualization according to best practices for open research that were outlined for you in class.

### Example API request
You can use this example API request as a starting point for building your API queries. Note that the [Legacy Pagecounts API](https://wikitech.wikimedia.org/wiki/Analytics/AQS/Legacy_Pagecounts) has a slightly different schema than the [pageview API](https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews) shown here.

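This sample API request would get you all pageviews by web crawlers on the mobile website for English Wikipedia during the month of September, 2017. The code below adapts that request: it asks for monthly totals with the agent set to `user`, one month per request, for each access type of both the Legacy Pagecounts API and the Pageviews API.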

In [64]:
import copy
import csv
import datetime
from dateutil.relativedelta import relativedelta
import json
import requests

# URL template for the Legacy Pagecounts aggregate endpoint. The {placeholders}
# are filled in with str.format() by get_page_counts.
endpoint_pagecounts = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
# URL template for the Pageviews aggregate endpoint. The {placeholders} are
# filled in with str.format() by get_page_views.
endpoint_pageviews  = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Identification headers (User-Agent and contact address) intended for the
# API requests made by this notebook.
headers = {'User-Agent': 'https://github.com/garygr2002', 'From': 'garygr@uw.edu'}

def format_date_long(date):
    """Return *date* rendered as a ``YYYYMMDDHH`` string.

    This is the timestamp layout placed in the ``start`` and ``end`` segments
    of the API request URLs.
    """
    pattern = '%Y%m%d%H'
    return date.strftime(pattern)


def format_date_short(date):
    """Return *date* rendered as a ``YYYYMM`` string (year and month only).

    The result is used as the per-month key of the traffic dictionary and in
    the names of the raw JSON output files.
    """
    pattern = '%Y%m'
    return date.strftime(pattern)


def get_page_counts(access, start_date, end_date):
    """Request monthly English Wikipedia traffic from the Legacy Pagecounts API.

    Args:
        access: Value for the ``access-site`` URL segment; the request
            specifications in this file use 'desktop-site' and 'mobile-site'.
        start_date: datetime formatted into the ``start`` URL segment.
        end_date: datetime formatted into the ``end`` URL segment.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    params = {'project': 'en.wikipedia.org',
              'access-site': access,
              'granularity': 'monthly',
              'start': format_date_long(start_date),
              'end': format_date_long(end_date)}

    url = endpoint_pagecounts.format(**params)
    # Identify this client to the API with the module-level headers
    # (User-Agent / From); without this argument they are never transmitted.
    api_call = requests.get(url, headers=headers)
    return api_call.json()


def get_page_views(access, start_date, end_date):
    """Request monthly English Wikipedia traffic from the Pageviews API.

    The request is always made with the agent set to 'user'.

    Args:
        access: Value for the ``access`` URL segment; the request
            specifications in this file use 'desktop', 'mobile-app' and
            'mobile-web'.
        start_date: datetime formatted into the ``start`` URL segment.
        end_date: datetime formatted into the ``end`` URL segment.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    params = {'project': 'en.wikipedia.org',
              'access': access,
              'agent': 'user',
              'granularity': 'monthly',
              'start': format_date_long(start_date),
              'end': format_date_long(end_date)}

    url = endpoint_pageviews.format(**params)
    # Identify this client to the API with the module-level headers
    # (User-Agent / From); without this argument they are never transmitted.
    api_call = requests.get(url, headers=headers)
    return api_call.json()


# Column names of the output dataset; also the keys of each per-month record.
pagecount_all_views = 'pagecount_all_views'
pagecount_desktop_views = 'pagecount_desktop_views'
pagecount_mobile_views = 'pagecount_mobile_views'
pageview_all_views = 'pageview_all_views'
pageview_desktop_views = 'pageview_desktop_views'
pageview_mobile_views = 'pageview_mobile_views'

# Maps '<api name>/<access type>' to the column that receives its count.
# Both pageviews mobile access types feed the same mobile column.
key_lookup_dictionary = {
    'pagecounts/desktop-site': pagecount_desktop_views,
    'pagecounts/mobile-site': pagecount_mobile_views,
    'pageviews/desktop': pageview_desktop_views,
    'pageviews/mobile-app': pageview_mobile_views,
    'pageviews/mobile-web': pageview_mobile_views,
}

# Template for one month's record: every column starts at zero.
initial_traffic = dict.fromkeys(
    [pagecount_all_views,
     pagecount_desktop_views,
     pagecount_mobile_views,
     pageview_all_views,
     pageview_desktop_views,
     pageview_mobile_views],
    0,
)

# Per-month records keyed by 'YYYYMM'. Note that both names below refer to
# the very same dict object.
initial_traffic_dictionary = {}
traffic_dictionary = initial_traffic_dictionary

# API names, used in lookup keys and output file names.
api_name_current = 'pageviews'
api_name_legacy = 'pagecounts'

def access_count(api_name, api_data):
    """Return the traffic figure held by the first item of an API response.

    When *api_name* is 'pageviews' the figure is read from the item's 'views'
    field; for any other name it is read from the 'count' field.

    Args:
        api_name: Name of the API that produced *api_data*.
        api_data: Decoded JSON response containing an 'items' list.

    Returns:
        The value stored under the selected field of ``items[0]``.
    """
    field = 'views' if api_name == 'pageviews' else 'count'
    first_item = api_data.get('items', None)[0]
    return first_item[field]


def clear_traffic_dictionary():
    """Remove every per-month record from the module-level traffic dictionary.

    The dictionary is emptied in place. A plain assignment to
    ``traffic_dictionary`` inside this function would only bind a new local
    name and leave the module-level dictionary untouched.
    """
    traffic_dictionary.clear()

    
def update_traffic_dictionary(start_date, api_name, access, api_data):
    """Add one API response's count to the record for its month.

    Args:
        start_date: Start of the requested month; its 'YYYYMM' form is the
            key of the record in ``traffic_dictionary``.
        api_name: API that produced the response ('pageviews' or 'pagecounts').
        access: Access type that was requested; combined with *api_name* to
            pick the column to update via ``key_lookup_dictionary``.
        api_data: Decoded JSON response for that month.
    """
    month = format_date_short(start_date)

    # Reuse the month's existing record, or start from a zeroed copy.
    if month in traffic_dictionary:
        record = traffic_dictionary[month]
    else:
        record = copy.deepcopy(initial_traffic)

    column = key_lookup_dictionary['{}/{}'.format(api_name, access)]

    # Accumulate rather than overwrite: several access types can share a column.
    running_total = record.get(column, 0)
    running_total += access_count(api_name, api_data)
    record[column] = running_total

    traffic_dictionary[month] = record

    
def calculate_traffic_totals():
    """Fill in the two 'all views' columns of every month's record.

    Each total is the sum of that API's desktop and mobile columns, and the
    records in ``traffic_dictionary`` are updated in place.
    """
    for monthly in traffic_dictionary.values():
        legacy_total = monthly[pagecount_desktop_views] + monthly[pagecount_mobile_views]
        monthly[pagecount_all_views] = legacy_total
        current_total = monthly[pageview_desktop_views] + monthly[pageview_mobile_views]
        monthly[pageview_all_views] = current_total


def perform_api_call(api_name, function, access, initial_start_date, iterations):
    """Fetch consecutive months from one API and save the raw responses.

    One request is made per month, starting at *initial_start_date* and
    advancing one calendar month at a time. Each response is folded into the
    traffic dictionary through ``update_traffic_dictionary``. All raw
    responses are then written to
    ``<api_name>_<access>_<first YYYYMM>_<end YYYYMM>.json``, where the end
    month is the end date of the last request made.

    Args:
        api_name: API name, used for the column lookup and the file name.
        function: Callable ``(access, start_date, end_date)`` that returns the
            decoded JSON response for one month.
        access: Access type passed through to *function*.
        initial_start_date: datetime at which the first month starts.
        iterations: Number of monthly requests to make.
    """
    start_date = initial_start_date
    # Bind end_date before the loop so that the file name can still be built
    # when iterations is 0 and the loop body never runs.
    end_date = start_date
    traffic = {}
    for _ in range(iterations):
        end_date = start_date + relativedelta(months=+1)
        api_data = function(access, start_date, end_date)
        update_traffic_dictionary(start_date, api_name, access, api_data)
        # Keep the raw response, keyed by the first day of its month.
        traffic[start_date.strftime('%Y%m%d')] = api_data
        start_date = end_date

    filename = '{}_{}_{}_{}.json'.format(api_name,
                                         access,
                                         format_date_short(initial_start_date),
                                         format_date_short(end_date))
    with open(filename, 'w') as outfile:
        json.dump(traffic, outfile)
    
    
# Keys of the request-specification dictionaries defined below.
key_access = 'access'
key_api_name = 'apiname'
key_function = 'function'
key_months = 'months'
key_start_date = 'startdate'

# Legacy Pagecounts API, mobile site: 23 monthly requests starting 2014-10.
legacy_mobile = {
    key_api_name: api_name_legacy,
    key_function: get_page_counts,
    key_access: 'mobile-site',
    key_start_date: datetime.datetime(2014, 10, 1),
    key_months: 23,
}

# Legacy Pagecounts API, desktop site: 104 monthly requests starting 2008-01.
legacy_desktop = {
    key_api_name: api_name_legacy,
    key_function: get_page_counts,
    key_access: 'desktop-site',
    key_start_date: datetime.datetime(2008, 1, 1),
    key_months: 104,
}

# Pageviews API: the three access types share everything except the access
# value, namely 26 monthly requests starting 2015-07 (so the last month
# requested is 2017-08).
# NOTE(review): the CSV written later is named '...200801-201709'; confirm
# whether September 2017 was meant to be included.
current_desktop, current_mobile_app, current_mobile_web = (
    {
        key_api_name: api_name_current,
        key_function: get_page_views,
        key_access: access_value,
        key_start_date: datetime.datetime(2015, 7, 1),
        key_months: 26,
    }
    for access_value in ('desktop', 'mobile-app', 'mobile-web')
)

# Order in which the request specifications are processed.
access_types = [
    legacy_desktop,
    legacy_mobile,
    current_desktop,
    current_mobile_web,
    current_mobile_app,
]

# Collect traffic for every configured API / access-type combination.
clear_traffic_dictionary()
for access_type in access_types:
    print('Starting a new one...')
    perform_api_call(*[access_type[field] for field in (key_api_name,
                                                        key_function,
                                                        key_access,
                                                        key_start_date,
                                                        key_months)])

# Derive the per-API totals, then export one CSV row per month.
calculate_traffic_totals()

view_columns = [pagecount_all_views, pagecount_desktop_views, pagecount_mobile_views,
                pageview_all_views, pageview_desktop_views, pageview_mobile_views]

with open('en-wikipedia_traffic_200801-201709.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['year', 'month'] + view_columns)

    # Keys are 'YYYYMM' strings, so sorting them gives chronological order.
    for key in sorted(traffic_dictionary):
        item = traffic_dictionary[key]
        # Split the key into its year (first four characters) and month parts.
        csvwriter.writerow([key[:4], key[4:]] + [item[column] for column in view_columns])


Starting a new one...
Starting a new one...
Starting a new one...
Starting a new one...
Starting a new one...
