In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

In [4]:
from src.scrape_wiki import PageviewsClient

In [5]:
import os
import json

Update the cell below such that it includes the keyword files you want to use. The only other thing you need to care about is to set the `language` variable (defined near the end of the notebook). The file patterns in the cell below will be formatted with the language. For instance, if you set `language='en'`, the file `'terrorism_{}.txt'` will be formatted to `'terrorism_en.txt'`. Currently, one cannot run this notebook in one shot to download data in several languages.

In [17]:
# Directory (relative to the notebook) that holds the keyword files.
path_data = 'data/'
# File-name pattern of the terrorism keyword list; the `{}` placeholder is
# filled in with the language code further down in the notebook.
keywords_terrorism = path_data + 'terrorism_{}.txt'

# Maps the name of each keyword list to the file-name pattern of its file.
keywords_lists = dict(terrorism=keywords_terrorism)

# Introduction

## Prerequisites

Run the script `src/update_keywords.py` to generate lists of keywords, which you define in `keywords_lists` above.

## Scraping rules stated by Wikipedia

> * Limit your clients to no more than 200 requests/s to this API. Each API endpoint's documentation may detail more specific usage limits.
> * Set a unique `User-Agent` or `Api-User-Agent` header that allows us to contact you quickly. Email addresses or URLs of contact pages work well.

In [7]:
# Contact address handed to PageviewsClient further down, in line with
# Wikipedia's request that API clients identify themselves with an e-mail
# address or a contact URL.
contact = 'matthias.zeller@epfl.ch'

## How is scraping performed here

We use existing Python packages:

* [python-mwviews](https://github.com/mediawiki-utilities/python-mwviews), which conveniently handles requests to the [Wikipedia REST API](https://wikimedia.org/api/rest_v1/). The `pageviews.py` file of the package was copied to `src/scrape_wiki.py`.

* [wikipedia](https://pypi.org/project/wikipedia/): search for articles and debug lowercase / uppercase

# Scraping

In [8]:
# Client object through which the `request` helper of this notebook talks to
# the pageviews API. Must provide email address (read the rules above).
p = PageviewsClient(contact)

## Settings

In [9]:
?PageviewsClient.article_views

[0;31mSignature:[0m
[0mPageviewsClient[0m[0;34m.[0m[0marticle_views[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mproject[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0marticles[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maccess[0m[0;34m=[0m[0;34m'all-access'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0magent[0m[0;34m=[0m[0;34m'all-agents'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgranularity[0m[0;34m=[0m[0;34m'daily'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstart[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mend[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Get pageview counts for one or more articles
See `<https://wikimedia.org/api/rest_v1/metrics/pageviews/?doc\
        #!/Pageviews_data/get_metrics_pageviews_per_article_project\
        _access_agent_article_granularity_start_end>`_
:Parameters:
 

We specifically want to **retrieve pageviews of humans, so we must set `agent="user"`**.

In [10]:
# Default keyword arguments that the `request` helper forwards to
# `PageviewsClient.article_views`.
params = dict(
    agent='user',      # meant to restrict the counts to human readers
    start='20150401',  # 1st April 2015
    end='20190531',    # 31st May 2019
)

In [11]:
def request(articles, domain='de', **kwargs):
    """Fetch pageviews for `articles` and return them as a DataFrame.

    Wraps `PageviewsClient.article_views`, using the notebook-level `params`
    dictionary as default keyword arguments.

    Parameters
    ----------
    articles : list of str
        Article titles to query.
    domain : str
        Prefix of the project to query; `'.wikipedia'` is appended to it
        (e.g. `'de'` becomes `'de.wikipedia'`).
    **kwargs
        Additional keyword arguments for `article_views`. They take
        precedence over the entries of `params`.

    Returns
    -------
    pandas.DataFrame
        Transposed result of `article_views` (in the sample request of this
        notebook: one row per date, one column per article), with `None`
        replaced by `np.nan` and rows sorted by index.
    """
    # Defaults first, so that explicitly passed keyword arguments win
    wrapped_kwargs = {**params, **kwargs}
    project = domain + '.wikipedia'

    # Fetch
    views = p.article_views(articles=articles, project=project, **wrapped_kwargs)

    # Format results in a DataFrame
    views = pd.DataFrame(views).T

    # Replace None -> np.nan
    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
    # `DataFrame.map`. The method is looked up on the class (not the instance)
    # so that a column that happens to be called 'map' cannot shadow it, and
    # older pandas versions fall back to `applymap`.
    apply_elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    views = apply_elementwise(views, lambda elem: np.nan if elem is None else elem)

    # Sort by dates (returns a new frame instead of sorting in place)
    return views.sort_index()

## Sample request

In [12]:
# Daily views of three test articles. No `domain` is passed, so the default
# of `request` ('de') applies.
sample = request(['Selfie', 'Cat', 'Dog'])
sample

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2019-05-27,117.0,10.0,9.0
2019-05-28,105.0,17.0,8.0
2019-05-29,85.0,12.0,12.0
2019-05-30,87.0,4.0,2.0


We get some annoying missing values 😑️. Let's see what is happening:

In [13]:
# Flag every date on which at least one article has no value, then show
# only those rows.
mask_missing = sample.isna().any(axis='columns')
sample.loc[mask_missing]

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2015-06-26,,,
2015-06-27,,,
2015-06-28,,,
2015-06-29,,,


This goes from 1st April 2015 to 30th June 2015. Let's check if changing granularity solves the problem:

In [14]:
# Same articles as in the daily sample, this time with monthly granularity
monthly = request(['Selfie', 'Cat', 'Dog'], granularity='monthly')
monthly.head()

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-05-01,,,
2015-06-01,,,
2015-07-01,12205.0,439.0,416.0
2015-08-01,9789.0,400.0,434.0


This does not solve the problem 😪️. Before trying to solve it, let's take the opportunity to check whether the daily and monthly data match:

In [15]:
# Aggregate daily data to monthly sum of views.
# The 'MS' (month start) frequency labels each group with the first day of
# its month, which is how the downloaded monthly data is indexed, so the
# index needs no re-labelling afterwards. It also avoids the 'M' alias,
# which is deprecated in recent pandas versions.
sample_aggreg = sample.groupby(pd.Grouper(freq='MS')).sum()

# Compare only the months for which the monthly download has a value for
# every article
mask_not_na = monthly.notna().all(axis=1)

print(f'All elements are equal: {np.all(sample_aggreg[mask_not_na] == monthly[mask_not_na])}')

All elements are equal: True


## Scrape keywords

In [18]:
language = 'en'
# Load keywords lists as defined in keywords_lists (defined on top of this notebook)
keywords = {}
for listname, filename_pattern in keywords_lists.items():
    filename = filename_pattern.format(language)
    try:
        # Explicit encoding so that non-ASCII titles are decoded the same way
        # on every platform.
        # NOTE(review): assumes the keyword files are UTF-8 - confirm against
        # src/update_keywords.py.
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
    except FileNotFoundError as err:
        # Same exception type as before, with a hint on how to fix it
        raise FileNotFoundError(
            f"Keyword file '{filename}' not found. Run src/update_keywords.py "
            f"for language '{language}' first."
        ) from err
    # One keyword per line; blank lines are skipped so that no empty article
    # title ends up in the list.
    keywords[listname] = [line.strip() for line in lines if line.strip()]

FileNotFoundError: [Errno 2] No such file or directory: 'data/terrorism_en.txt'

In [None]:
# Inspect the keyword lists loaded in the previous cell
keywords

Reminder: we use the custom function `request`:

In [None]:
# Print the signature and docstring of the `request` wrapper
help(request)

In [None]:
# Redefine params (same values as in the "Settings" section). `request`
# reads this global each time it is called.
params = {
    'agent': 'user',
    'start': '20150401', # 1st April 2015
    'end':   '20190531'  # 31st May 2019
}

In [None]:
# Download the pageviews of every keyword list: one `request` call per list,
# with `language` passed as the domain.
data = {
    name: request(articles, domain=language)
    for name, articles in keywords.items()
}

In [None]:
def format_dataset(df, language):
    """Reshape a wide frame into long format and tag it with `language`.

    `df` is unstacked, so every (column label, index label) pair of the
    input becomes one row of the output.

    Returns a DataFrame with the columns `article` (former column label),
    `date` (former index label), `views` (cell value) and `language`
    (the given `language`, repeated on every row).
    """
    long_format = df.unstack().reset_index()
    long_format.columns = ['article', 'date', 'views']
    return long_format.assign(language=language)

In [None]:
# Peek at the 'terrorism' frame as returned by `request`
data['terrorism']

In [None]:
# Pivot / unstack dataframes
# NOTE(review): this rebinds `data` to the reshaped frames, so running the
# cell a second time would pass already reshaped frames to format_dataset.
data = {
    listname: format_dataset(df, language)
    for listname, df in data.items()
}

In [None]:
# Peek at the 'terrorism' frame again, now that `data` has been rebuilt
# with format_dataset
data['terrorism']

In [None]:
# Keep only the rows of a single article
data['terrorism'][data['terrorism'].article == 'Al-Qaeda']

In [None]:
# Write data in files: one CSV per keyword list, stored in `path_data` and
# named <listname>_<language>.csv
for listname, df in data.items():
    fname = path_data + listname + f'_{language}.csv'
    # index=False: the row index is not written to the file
    df.to_csv(fname, index=False)

In [None]:
# Show the repr of the method wrapped by `request`
PageviewsClient.article_views

In [None]:
# NOTE(review): `tmp` is only assigned in the following cell, so this cell
# raises a NameError when the notebook is run top to bottom on a fresh kernel.
tmp

In [None]:
# Single-article request on the English domain with a wider date range
# (1st January 2012 to 1st January 2020) than the one set in `params`
tmp = request(['Dog'], agent='user', domain='en', start='20120101', end='20200101')

---

In [22]:
# Read back the CSV of the 'terrorism' list for language 'en'. The path is
# hard-coded and matches what the writing loop above produces for these values.
df = pd.read_csv('data/terrorism_en.csv')

In [27]:
# All rows of a single day. `read_csv` was called without date parsing, so
# the lookup key is presumably matched as a plain string.
df.set_index('date').loc['2015-07-02']

Unnamed: 0_level_0,article,views,language
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-07-02,Al-Qaeda,4722.0,en
2015-07-02,Terrorism,2388.0,en
2015-07-02,Terror,50.0,en
2015-07-02,Attack,61.0,en
2015-07-02,Iraq,4508.0,en
2015-07-02,Afghanistan,5704.0,en
2015-07-02,Iran,8713.0,en
2015-07-02,Pakistan,12438.0,en
2015-07-02,Agro,22.0,en
2015-07-02,Environmental_terrorism,16.0,en
