## Step 1: Gathering the Data

In [1]:
import os
import shutil
import pandas as pd

import json
import requests

In [2]:
# URL template for the legacy Pagecounts API; the {placeholders} are filled
# in with str.format() when a request is made.
endpoint_legacy = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access-site}/{granularity}/{start}/{end}'
)

# URL template for the Pageviews API; takes an additional {agent} field.
endpoint_pageviews = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

In [3]:
# Folders that receive the JSON and CSV output of this notebook.
json_output_folder = "data/json"
csv_output_folder = "data/csv"

# Start from a clean slate: any existing "data" tree is deleted and the
# folder structure is recreated from scratch on every run.
if os.path.exists("data"):
    shutil.rmtree("data")
for folder in ("data", json_output_folder, csv_output_folder):
    os.mkdir(folder)

In [4]:
# SAMPLE parameters for getting aggregated legacy view data
# see: https://wikimedia.org/api/rest_v1/#!/Legacy_data/get_metrics_legacy_pagecounts_aggregate_project_access_site_granularity_start_end
example_params_legacy = {
    "project": "en.wikipedia.org",
    "access-site": "desktop-site",
    "granularity": "monthly",
    "start": "2008010100",
    # for end use 1st day of month following final month of data
    "end": "2020100100",
}

# SAMPLE parameters for getting aggregated current standard pageview data
# see: https://wikimedia.org/api/rest_v1/#!/Pageviews_data/get_metrics_pageviews_aggregate_project_access_agent_granularity_start_end
example_params_pageviews = {
    "project": "en.wikipedia.org",
    "access": "desktop",
    "agent": "user",
    "granularity": "monthly",
    "start": "2008010100",
    # for end use 1st day of month following final month of data
    "end": "2020100100",
}

# Customize these with your own information; they are sent as HTTP headers
# with every API request.
headers = {
    "User-Agent": "https://github.com/chavi-g",
    "From": "chavig@uw.edu",
}


In [5]:
def get_pagecount_params(access_site, start, end):
    """Build the parameter dict for a monthly legacy pagecounts request.

    The project is fixed to English Wikipedia and the granularity to monthly.

    Args:
        access_site: value for the "access-site" field (e.g. "desktop-site").
        start: value for the "start" field (e.g. "2008010100").
        end: value for the "end" field.

    Returns:
        A dict whose keys match the placeholders of the legacy endpoint template.
    """
    return {
        "project": "en.wikipedia.org",
        "access-site": access_site,
        "granularity": "monthly",
        "start": start,
        "end": end,
    }

In [6]:
def get_pageview_params(access, start, end):
    """Build the parameter dict for a monthly pageviews request.

    The project is fixed to English Wikipedia, the agent to "user" and the
    granularity to monthly.

    Args:
        access: value for the "access" field (e.g. "desktop", "mobile-app").
        start: value for the "start" field (e.g. "2008010100").
        end: value for the "end" field.

    Returns:
        A dict whose keys match the placeholders of the pageviews endpoint template.
    """
    return {
        "project": "en.wikipedia.org",
        "access": access,
        "agent": "user",
        "granularity": "monthly",
        "start": start,
        "end": end,
    }

In [7]:
def api_call(endpoint, parameters, timeout=30):
    """Send a GET request to a formatted endpoint and return the decoded JSON body.

    Args:
        endpoint: URL template whose {placeholders} are filled from ``parameters``.
        parameters: dict mapping placeholder names to their values.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    # The module-level ``headers`` dict is sent with every request.
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error payload to
    # callers that expect an 'items' key.
    call.raise_for_status()
    return call.json()

In [8]:
def get_json_file_path(apiname, accesstype, firstmonth, lastmonth):
    """Return the path of a JSON output file inside ``json_output_folder``.

    The file name has the form ``<apiname>_<accesstype>_<firstmonth>-<lastmonth>.json``.
    """
    return os.path.join(
        json_output_folder,
        "{0}_{1}_{2}-{3}.json".format(apiname, accesstype, firstmonth, lastmonth),
    )

In [9]:
# Request window used for the API calls below.
start = "2008010100"
end = "2020100100"

# Direct calls to the Pageviews API, one per access type. Despite the
# "_file" suffix, these names hold the parsed JSON responses returned by
# api_call, not file paths.
pageview_desktop_file = api_call(
    endpoint_pageviews,
    get_pageview_params("desktop", start, end),
)
pageview_mobile_file = api_call(
    endpoint_pageviews,
    get_pageview_params("mobile-app", start, end),
)
pageview_web_file = api_call(
    endpoint_pageviews,
    get_pageview_params("mobile-web", start, end),
)

In [10]:
# Direct calls to the legacy Pagecounts API for the desktop and mobile sites.
# As above, these names hold parsed JSON responses rather than file paths.
pagecount_desktop_file = api_call(
    endpoint_legacy,
    get_pagecount_params("desktop-site", start, end),
)
pagecount_mobile_file = api_call(
    endpoint_legacy,
    get_pagecount_params("mobile-site", start, end),
)

In [13]:
def write_data_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by two spaces.

    The data is serialized before the file is opened. The file is opened in
    "w" mode, so any existing content is replaced.
    """
    serialized = json.dumps(data, indent=2)
    with open(filename, "w") as out_file:
        out_file.write(serialized)

In [14]:
# Preview the JSON file path for desktop pageviews. The first six characters
# of each boundary (YYYYMM) are used as the month labels in the file name.
firstmonth, endmonth = start[:6], end[:6]
get_json_file_path('pageviews', 'desktop', firstmonth, endmonth)

'data/json/pageviews_desktop_200801-202010.json'

In [15]:
def get_pageviews_data(access, start, end):
    """Fetch monthly pageviews for one access type and save the response as JSON.

    Args:
        access: access type passed to the Pageviews API (e.g. "desktop").
        start: start of the request window (e.g. "2008010100").
        end: end of the request window.

    Returns:
        Path of the JSON file that was written. The file name embeds the first
        six characters (YYYYMM) of ``start`` and ``end``.
    """
    request_params = get_pageview_params(access, start, end)
    response = api_call(endpoint_pageviews, request_params)

    out_path = get_json_file_path('pageviews', access, start[:6], end[:6])
    write_data_to_json(response, out_path)
    return out_path

In [16]:
def get_pagecounts_data(access_site, start, end):
    """Fetch monthly legacy pagecounts for one site and save the response as JSON.

    Args:
        access_site: access site passed to the legacy API (e.g. "desktop-site").
        start: start of the request window (e.g. "2008010100").
        end: end of the request window.

    Returns:
        Path of the JSON file that was written. The file name embeds the first
        six characters (YYYYMM) of ``start`` and ``end``.
    """
    request_params = get_pagecount_params(access_site, start, end)
    response = api_call(endpoint_legacy, request_params)

    out_path = get_json_file_path('pagecounts', access_site, start[:6], end[:6])
    write_data_to_json(response, out_path)
    return out_path

In [21]:
# Download the pageview data for each access type and keep the path of the
# JSON file written for each one.
start = "2008010100"
end = "2020100100"

pageview_desktop_file = get_pageviews_data("desktop", start, end)
pageview_mobile_app_file = get_pageviews_data("mobile-app", start, end)
pageview_mobile_web_file = get_pageviews_data("mobile-web", start, end)

In [22]:
# Download the legacy pagecount data for both sites; each name holds the
# path of the JSON file that was written.
pagecount_desktop_file = get_pagecounts_data(
    "desktop-site", start, end
)
pagecount_mobile_file = get_pagecounts_data(
    "mobile-site", start, end
)

## Step 2: Processing the Data

In [24]:
def read_data_from_json(filename):
    """Open ``filename`` for reading and return its parsed JSON content."""
    with open(filename, "r") as in_file:
        return json.load(in_file)

In [25]:
# Load every saved API response back from disk so that processing works
# from the JSON files rather than from in-memory responses.
# Pageviews API: desktop, mobile app and mobile web.
pageview_desktop_data = read_data_from_json(pageview_desktop_file)
pageview_mobile_app_data = read_data_from_json(pageview_mobile_app_file)
pageview_mobile_web_data = read_data_from_json(pageview_mobile_web_file)
# Legacy Pagecounts API: desktop site and mobile site.
pagecount_desktop_data = read_data_from_json(pagecount_desktop_file)
pagecount_mobile_data = read_data_from_json(pagecount_mobile_file)

In [59]:
# Tabulate the desktop pageview records and split each timestamp into its
# year (characters 0-3) and month (characters 4-5).
# The .str accessor slices the whole column at once instead of invoking a
# Python lambda for every row.
pageview_desktop_views = pd.DataFrame(pageview_desktop_data['items'])
pageview_desktop_views['year'] = pageview_desktop_views['timestamp'].str[:4]
pageview_desktop_views['month'] = pageview_desktop_views['timestamp'].str[4:6]

In [60]:
# Keep only the columns needed downstream.
pageview_desktop_views = pageview_desktop_views.loc[:, ['year', 'month', 'views']]

In [61]:
# Display the desktop pageview table (year, month, views) as the cell output.
pageview_desktop_views

Unnamed: 0,year,month,views
0,2015,07,4376666686
1,2015,08,4332482183
2,2015,09,4485491704
3,2015,10,4477532755
4,2015,11,4287720220
...,...,...,...
58,2020,05,3078093615
59,2020,06,2721328557
60,2020,07,2638936132
61,2020,08,2613058239


In [37]:
# Tabulate the mobile-app pageview records and split each timestamp into its
# year (characters 0-3) and month (characters 4-5).
# The .str accessor slices the whole column at once instead of invoking a
# Python lambda for every row.
pageview_mobile_app_views = pd.DataFrame(pageview_mobile_app_data['items'])
pageview_mobile_app_views['year'] = pageview_mobile_app_views['timestamp'].str[:4]
pageview_mobile_app_views['month'] = pageview_mobile_app_views['timestamp'].str[4:6]

In [44]:
# Keep only the columns needed downstream.
pageview_mobile_app_views = pageview_mobile_app_views.loc[:, ['year', 'month', 'views']]

In [45]:
# Tabulate the mobile-web pageview records and split each timestamp into its
# year (characters 0-3) and month (characters 4-5).
# The .str accessor slices the whole column at once instead of invoking a
# Python lambda for every row.
pageview_mobile_web_views = pd.DataFrame(pageview_mobile_web_data['items'])
pageview_mobile_web_views['year'] = pageview_mobile_web_views['timestamp'].str[:4]
pageview_mobile_web_views['month'] = pageview_mobile_web_views['timestamp'].str[4:6]

In [46]:
# Keep only the columns needed downstream.
pageview_mobile_web_views = pageview_mobile_web_views.loc[:, ['year', 'month', 'views']]

In [51]:
# Line up the mobile-app and mobile-web counts by (year, month). The outer
# join keeps months that appear in only one of the two tables; the app counts
# land in 'views_x' and the web counts in 'views_y'.
merged_pageview_mobile_views = pageview_mobile_app_views.merge(
    pageview_mobile_web_views,
    on=['year', 'month'],
    how='outer',
)

In [54]:
# Total mobile views = app views ('views_x') + web views ('views_y').
# The tables were outer-joined, so a month present in only one of them has
# NaN on the other side; fill_value=0 keeps the available count instead of
# turning the total into NaN. The column-wise add also avoids a per-row lambda.
merged_pageview_mobile_views['views'] = merged_pageview_mobile_views['views_x'].add(
    merged_pageview_mobile_views['views_y'], fill_value=0
)

In [56]:
# Reduce the merged table to the combined total, dropping the per-source columns.
pageview_mobile_views = merged_pageview_mobile_views.loc[:, ['year', 'month', 'views']]

In [62]:
# Display the combined mobile (app + web) pageview table as the cell output.
pageview_mobile_views

Unnamed: 0,year,month,views
0,2015,07,3288755294
1,2015,08,3302333038
2,2015,09,3170203333
3,2015,10,3268499132
4,2015,11,3236601070
...,...,...,...
58,2020,05,5231700095
59,2020,06,4573975256
60,2020,07,4809714465
61,2020,08,4803308661


In [None]:
pagecount