Christie Gan
10/9/20

To retrieve and process the raw data, I saved the API endpoint templates into two variables and set the request parameters for pagecounts (desktop site and mobile site) and pageviews (desktop, mobile app, and mobile web).

In [115]:
import json
import requests
import csv
import pandas as pd
import plotly.graph_objects as go

#API endpoint templates; the {placeholders} are filled in from the parameter dictionaries below
endpoint_legacy = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
endpoint_pageviews = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'


def _legacy_params(access_site):
    """Build the legacy pagecounts request parameters for one access site."""
    return {
        "project": "en.wikipedia.org",
        "access-site": access_site,
        "granularity": "monthly",
        "start": "2008010100",
        "end": "2016073100",
    }


def _pageview_params(access):
    """Build the pageviews request parameters for one access method (user agents only)."""
    return {
        "project": "en.wikipedia.org",
        "access": access,
        "agent": "user",
        "granularity": "monthly",
        "start": "2015070100",
        "end": '2020083100',
    }


#https://wikimedia.org/api/rest_v1/#!/Legacy_data/get_metrics_legacy_pagecounts_aggregate_project_access_site_granularity_start_end
params_legacy_desktop = _legacy_params("desktop-site")
params_legacy_mobile = _legacy_params("mobile-site")

#https://wikimedia.org/api/rest_v1/#!/Pageviews_data/get_metrics_pageviews_aggregate_project_access_agent_granularity_start_end
params_pageviews_desktop = _pageview_params("desktop")
params_pageviews_m_app = _pageview_params("mobile-app")
params_pageviews_m_web = _pageview_params("mobile-web")

#identifying headers sent with every request
headers = dict([
    ('User-Agent', 'https://github.com/yourusername'),
    ('From', 'youremail@uw.edu'),
])




I defined the api_call function so that I can pass in an API endpoint and a set of parameters defined in the previous cell and get back the parsed JSON response.

In [116]:
def api_call(endpoint, parameters, timeout=30):
    """Request one API endpoint and return its decoded JSON body.

    endpoint   -- URL template whose {placeholders} are filled from parameters
    parameters -- dict mapping each placeholder name to its value
    timeout    -- seconds to wait for the server before giving up

    Raises requests.HTTPError for a 4xx/5xx response, so an error payload is
    never returned (and then saved) as if it were traffic data.
    """
    #headers is the module-level dict of identifying request headers
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=timeout)
    #fail here, with the HTTP status in the message, rather than later on a missing 'items' key
    call.raise_for_status()

    return call.json()


After defining the api call function, I called the function for each of the given scenarios (pagecounts for desktop and mobile sites, and pageviews for desktop, mobile app and mobile website). The raw results from these api calls are put into separate JSON files.

In [117]:
#fetch every traffic series and keep the raw API response in its own JSON file
def _save_raw(response, filename):
    """Write an API response to filename as JSON and return the response."""
    with open(filename, 'w') as outfile:
        json.dump(response, outfile)
    return response


#pagecounts
pagecounts_desktop_site_monthly_pageviews = _save_raw(
    api_call(endpoint_legacy, params_legacy_desktop),
    'pagecounts_desktop-site_200801-201607.json')

pagecounts_mobile_site_monthly_pageviews = _save_raw(
    api_call(endpoint_legacy, params_legacy_mobile),
    'pagecounts_mobile-site_200801-201607.json')


#pageviews
pageviews_desktop_site_monthly_pageviews = _save_raw(
    api_call(endpoint_pageviews, params_pageviews_desktop),
    'pageviews_desktop-site_201507-202008.json')

pageviews_m_app_site_monthly_pageviews = _save_raw(
    api_call(endpoint_pageviews, params_pageviews_m_app),
    'pageviews_m_app-site_201507-202008.json')

pageviews_m_web_site_monthly_pageviews = _save_raw(
    api_call(endpoint_pageviews, params_pageviews_m_web),
    'pageviews_m_web-site_201507-202008.json')

A dictionary had to be made for both the pagecounts data and the pageviews data in order to eventually combine all the data into a CSV file. For each dictionary, the key is the timestamp and the value holds all the columns of data that will eventually be in the CSV file. The desktop site data is loaded into the dictionary first, and the dictionary is then updated to include the mobile data.

In [118]:
def _load_items(filename):
    """Return the 'items' list stored in one of the saved API response files."""
    with open(filename) as inputfile:
        return json.load(inputfile)['items']


def _add_traffic(table, items, value_key, prefix, device):
    """Add each item's traffic to its month's device column and 'all' column.

    table     -- dict keyed by timestamp, updated in place
    items     -- API records, each holding a 'timestamp' and a value_key entry
    value_key -- name of the traffic field in each record ('count' or 'views')
    prefix    -- column prefix, 'pagecount' or 'pageview'
    device    -- which column receives the traffic, 'desktop' or 'mobile'
    """
    all_column = prefix + '_all_views'
    device_column = prefix + '_' + device + '_views'

    for item in items:
        timestamp = item['timestamp']
        #create the month's row on first sight, so a month that is absent
        #from the desktop file no longer raises KeyError
        if timestamp not in table:
            table[timestamp] = {'year':timestamp[0:4], 'month':timestamp[4:6],
                                prefix + '_all_views':0, prefix + '_desktop_views':0,
                                prefix + '_mobile_views':0}
        row = table[timestamp]
        row[all_column] += item[value_key]
        row[device_column] += item[value_key]


#create pagecounts dictionary: desktop data first, then add the mobile site data
my_dict_count = {}
_add_traffic(my_dict_count, _load_items('pagecounts_desktop-site_200801-201607.json'),
             'count', 'pagecount', 'desktop')
_add_traffic(my_dict_count, _load_items('pagecounts_mobile-site_200801-201607.json'),
             'count', 'pagecount', 'mobile')

#create pageviews dictionary: desktop data first, then mobile app and mobile
#website data, which are summed into the single mobile column
my_dict_views = {}
_add_traffic(my_dict_views, _load_items('pageviews_desktop-site_201507-202008.json'),
             'views', 'pageview', 'desktop')
_add_traffic(my_dict_views, _load_items('pageviews_m_app-site_201507-202008.json'),
             'views', 'pageview', 'mobile')
_add_traffic(my_dict_views, _load_items('pageviews_m_web-site_201507-202008.json'),
             'views', 'pageview', 'mobile')



After creating the two dictionaries, I combine them and write the result to the CSV file with the required columns. This is done by looping through each key-value pair and writing each one as a row of the file.

In [120]:
#columns of the output CSV, in the order they are written
_CSV_COLUMNS = ['year', 'month', 'pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views',
                'pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']

#combine two dictionaries: every month starts with all columns at 0 and is then
#filled in by whichever source(s) have data for it
total_dict = {}
for source in (my_dict_count, my_dict_views):
    for key, value in source.items():
        row = total_dict.setdefault(key, dict.fromkeys(_CSV_COLUMNS, 0))
        row.update(value)

#write total_dict into a CSV file
#newline='' is what the csv module expects; without it every row is followed
#by a blank line on Windows
with open('en-wikipedia_traffic_200712-202008.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(_CSV_COLUMNS)
    #keys are timestamps that begin with YYYYMM, so sorting them keeps the
    #rows chronological regardless of the order the sources were merged in
    #(assumes all timestamps have the same width)
    for key in sorted(total_dict):
        value = total_dict[key]
        writer.writerow([value[column] for column in _CSV_COLUMNS])

Read in the newly created CSV so that we can plot the graph of the pageviews and pagecounts. This is done by formatting the date column into a usable format for plotly, setting that variable as the x-axis, and plotting the six pagecounts/pageviews columns on the y-axis.

In [121]:
#https://plotly.com/python/line-charts/

df = pd.read_csv('en-wikipedia_traffic_200712-202008.csv')
#use real dates (first day of each month) so the x-axis is a proper time axis;
#read_csv parses month as an integer, so plain string joining gives "2008-1"
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

#(CSV column, legend name) for each of the six series
series = [('pagecount_all_views', 'pagecount_all'),
          ('pagecount_desktop_views', 'pagecount_desktop'),
          ('pagecount_mobile_views', 'pagecount_mobile'),
          ('pageview_all_views', 'pageview_all'),
          ('pageview_desktop_views', 'pageview_desktop'),
          ('pageview_mobile_views', 'pageview_mobile')]

#create plot, excluding values that are zero (months where a series has no data);
#selecting by value instead of by fixed row numbers keeps the plot correct if
#the date range of the data changes
fig = go.Figure()
for column, label in series:
    has_data = df[column] != 0
    fig.add_trace(go.Scatter(x=df.loc[has_data, 'date'], y=df.loc[has_data, column],
                             mode='lines', name=label))

#add labels
fig.update_layout(title='Page Views on Wikipedia',
                   xaxis_title='Year',
                   yaxis_title='Page Views')

fig.show()
