In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.scrape_wiki import PageviewsClient
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# Introduction

We use an existing Python library - [python-mviews](https://github.com/mediawiki-utilities/python-mwviews) - that conveniently handles requests to [Wikipedia REST API](https://wikimedia.org/api/rest_v1/). The `pageviews.py` files of the package was copied to `src/scrape_wiki.py`.

## Scraping rules stated by Wikipedia

> * Limit your clients to no more than 200 requests/s to this API. Each API endpoint's documentation may detail more specific usage limits.
> * Set a unique `User-Agent` or `Api-User-Agent` header that allows us to contact you quickly. Email addresses or URLs of contact pages work well.

In [3]:
# Must provide email address (read the rules above)
p = PageviewsClient('matthias.zeller@epfl.ch')

# Keywords

In [4]:
terrorism = [
    'al qaeda',
    'terrorism',
    'terror',
    'attack',
    'iraq',
    'afghanistan',
    'iran',
    'pakistan',
    'agro',
    'environmental terrorism',
    'eco-terrorism',
    'conventional weapon',
    'weapons grade',
    'dirty bomb',
    'nuclear enrichment',
    'nuclear',
    'chemical weapon',
    'ammonium nitrate',
    'improvised explosive device',
    'abu sayyaf',
    'hamas',
    'FARC',
    'irish republican army',
    'euskadi ta askatsuna',
    'tamil tigers',
    'PLO',
    'Palestine',
    'liberation front',
    'car bomb',
    'jihad',
    'taliban'
]

len(terrorism)

31

# Scraping

## Settings

In [5]:
help(PageviewsClient.article_views)

Help on function article_views in module src.scrape_wiki:

article_views(self, project, articles, access='all-access', agent='all-agents', granularity='daily', start=None, end=None)
    Get pageview counts for one or more articles
    See `<https://wikimedia.org/api/rest_v1/metrics/pageviews/?doc\
            #!/Pageviews_data/get_metrics_pageviews_per_article_project\
            _access_agent_article_granularity_start_end>`_
    :Parameters:
        project : str
            a wikimedia project such as en.wikipedia or commons.wikimedia
        articles : list(str) or a simple str if asking for a single article
        access : str
            access method (desktop, mobile-web, mobile-app, or by default, all-access)
        agent : str
            user agent type (spider, user, bot, or by default, all-agents)
        end : str|date
            can be a datetime.date object or string in YYYYMMDD format
            default: today
        start : str|date
            can be a datetime.d

We specifically want to **retrieve pageviews of humans, so we must set `user="agent"`**.

In [6]:
params = {
    'agent': 'user',
    'start': '20150401', # 1st April 2015
    'end':   '20190531'  # 31th May 2019
}

def request(articles, domain='de', **kwargs):
    """Wraps the function PageviewsClient.article_views"""
    wrapped_kwargs = params.copy()
    wrapped_kwargs.update(kwargs)
    domain = domain + '.wikipedia'
    
    # Fetch
    res = p.article_views(articles=articles, project=domain, **wrapped_kwargs)
    
    # Format results in a DataFrame
    res = pd.DataFrame(res).T
    # Replace None -> np.nan
    res = res.applymap(lambda elem: np.nan if elem is None else elem)
    # Sort by dates
    res.sort_index(inplace=True)
    
    return res

## Sample request

In [7]:
sample = request(['Selfie', 'Cat', 'Dog'])
sample

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2019-05-27,117.0,10.0,9.0
2019-05-28,105.0,17.0,8.0
2019-05-29,85.0,12.0,12.0
2019-05-30,87.0,4.0,2.0


We get some annoying missing values 😑️. Let's see what is happening:

In [8]:
mask_missing = sample.isna().any(axis=1)
sample[mask_missing]

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2015-06-26,,,
2015-06-27,,,
2015-06-28,,,
2015-06-29,,,


This goes from 1st April 2015 to 30st June 2015. Let's check if changing granularity solves the problem:

In [9]:
monthly = request(['Selfie', 'Cat', 'Dog'], granularity='monthly')
monthly.head()

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-05-01,,,
2015-06-01,,,
2015-07-01,12205.0,439.0,416.0
2015-08-01,9789.0,400.0,434.0


This does not solve the problem 😪️. Before trying to solve the problem, it's an occasion to check if daily and monthly data match:

In [10]:
# Aggregate daily data to monthly sum of views
sample_aggreg = sample.groupby(pd.Grouper(freq='M')).sum()

# The default DateTimeIndex generated when grouping data by months
# contains as days the last day of month (e.g. 31)
# We make those dates match the dates of the downloaded monthly data
sample_aggreg.index = map(lambda date: date.replace(day=1), sample_aggreg.index)

mask_not_na = (~monthly.isna().any(axis=1))

print(f'All elements are equal: {np.all(sample_aggreg[mask_not_na] == monthly[mask_not_na])}')

All elements are equal: True


# Scrape keywords

Reminder: we use the custom function `request`:

In [11]:
help(request)

Help on function request in module __main__:

request(articles, domain='de', **kwargs)
    Wraps the function PageviewsClient.article_views



In [39]:
# Redefine params
params = {
    'agent': 'user',
    'start': '20150401', # 1st April 2015
    'end':   '20190531'  # 31th May 2019
}

In [43]:
p

<src.scrape_wiki.PageviewsClient at 0x7f14448c01f0>

In [42]:
pd.DataFrame(p.article_views('de.wikipedia', 'science', **params)).T

['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/science/daily/2015040100/2019053100']


Unnamed: 0,science
2015-04-01,
2015-04-02,
2015-04-03,
2015-04-04,
2015-04-05,
...,...
2019-05-27,
2019-05-28,
2019-05-29,
2019-05-30,


In [24]:
r = request(terrorism[:4])
r

Unnamed: 0,al_qaeda,terrorism,terror,attack
2015-04-01,,,,
2015-04-02,,,,
2015-04-03,,,,
2015-04-04,,,,
2015-04-05,,,,
...,...,...,...,...
2019-05-27,,,,
2019-05-28,,,,
2019-05-29,,,,
2019-05-30,,,,


In [23]:
r

Unnamed: 0,al_qaeda,terrorism,terror,attack,iraq,afghanistan,iran,pakistan,agro,environmental_terrorism,...,FARC,irish_republican_army,euskadi_ta_askatsuna,tamil_tigers,PLO,Palestine,liberation_front,car_bomb,jihad,taliban
2015-04-01,,,,,,,,,,,...,,,,,,,,,,
2015-04-02,,,,,,,,,,,...,,,,,,,,,,
2015-04-03,,,,,,,,,,,...,,,,,,,,,,
2015-04-04,,,,,,,,,,,...,,,,,,,,,,
2015-04-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-05-27,,,,,,,,,,,...,24.0,,,,21.0,9.0,,,,
2019-05-28,,,,,,,,,,,...,26.0,,,,23.0,5.0,,,,
2019-05-29,,,,,,,,,,,...,25.0,,,,20.0,3.0,,,,
2019-05-30,,,,,,,,,,,...,18.0,,,,19.0,4.0,,,,


In [18]:
terrorism

['al qaeda',
 'terrorism',
 'terror',
 'attack',
 'iraq',
 'afghanistan',
 'iran',
 'pakistan',
 'agro',
 'environmental terrorism',
 'eco-terrorism',
 'conventional weapon',
 'weapons grade',
 'dirty bomb',
 'nuclear enrichment',
 'nuclear',
 'chemical weapon',
 'ammonium nitrate',
 'improvised explosive device',
 'abu sayyaf',
 'hamas',
 'FARC',
 'irish republican army',
 'euskadi ta askatsuna',
 'tamil tigers',
 'PLO',
 'Palestine',
 'liberation front',
 'car bomb',
 'jihad',
 'taliban']

---

---

In [13]:
d = p.article_views('de.wikipedia', ['Selfie', 'Cat', 'Dog'], start=params['start'], end=params['end'])

In [14]:
d = p.article_views('en.wikipedia', ['Selfie', 'Cat', 'Dog'])

In [15]:
pd.DataFrame(d)

Unnamed: 0,2020-10-29,2020-10-30,2020-10-31,2020-11-01,2020-11-02,2020-11-03,2020-11-04,2020-11-05,2020-11-06,2020-11-07,...,2020-11-19,2020-11-20,2020-11-21,2020-11-22,2020-11-23,2020-11-24,2020-11-25,2020-11-26,2020-11-27,2020-11-28
Selfie,609,664,500,582,560,602,528,541,590,491,...,752,1087,791,826,718,606,542,599,,
Cat,9856,9193,8353,8235,9462,9668,9521,9675,9409,8435,...,10706,12380,13987,11882,10949,14860,14134,15299,,
Dog,11128,11380,8671,8906,11185,11591,10910,12308,12027,9329,...,13397,13872,11761,54753,11662,12407,10792,10941,,


**TODO: domain, commons.wikimedia.org**

https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly


The problem is, this is not possible to query pageview per country of specific articles with REST API https://wikimedia.org/api/rest_v1/#/Pageviews_data/get_metrics_pageviews

This is the private data https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly and contains the countries... but it is private

This is the public data https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageviews



In [16]:
url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/2016/02/29'

In [17]:
req = requests.get(url)
req

NameError: name 'requests' is not defined

In [None]:
data = req.json()
data.keys()

In [None]:
data = data['items']
type(data), len(data)

In [None]:
data = data[0]
data.keys()

In [None]:
# Type of the values in dictionnary data
list(map(type, (v for _, v in data.items())))

In [None]:
data['articles'][:10]