In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

In [61]:
from src.scrape_wiki import PageviewsClient

In [119]:
import os
import json

In [162]:
keywords_terrorism = 'terrorism.txt'

keywords_lists = {
    'terrorism': keywords_terrorism
}

# Introduction

## Prerequisites

Run the script `src/update_keywords.py` to generate a list of keywords.

## Scraping rules stated by Wikipedia

> * Limit your clients to no more than 200 requests/s to this API. Each API endpoint's documentation may detail more specific usage limits.
> * Set a unique `User-Agent` or `Api-User-Agent` header that allows us to contact you quickly. Email addresses or URLs of contact pages work well.

In [65]:
contact = 'matthias.zeller@epfl.ch'

## How is scraping performed here

We use existing Python packages:

* [python-mviews](https://github.com/mediawiki-utilities/python-mwviews), it conveniently handles requests to [Wikipedia REST API](https://wikimedia.org/api/rest_v1/). The `pageviews.py` files of the package was copied to `src/scrape_wiki.py`.

* [wikipedia](https://pypi.org/project/wikipedia/): search for articles and debug lowercase / uppercase

# Scraping

In [3]:
# Must provide email address (read the rules above)
p = PageviewsClient(contact)

## Settings

In [68]:
?PageviewsClient.article_views

[0;31mSignature:[0m
[0mPageviewsClient[0m[0;34m.[0m[0marticle_views[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mproject[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0marticles[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maccess[0m[0;34m=[0m[0;34m'all-access'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0magent[0m[0;34m=[0m[0;34m'all-agents'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgranularity[0m[0;34m=[0m[0;34m'daily'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstart[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mend[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Get pageview counts for one or more articles
See `<https://wikimedia.org/api/rest_v1/metrics/pageviews/?doc\
        #!/Pageviews_data/get_metrics_pageviews_per_article_project\
        _access_agent_article_granularity_start_end>`_
:Parameters:
 

We specifically want to **retrieve pageviews of humans, so we must set `user="agent"`**.

In [6]:
params = {
    'agent': 'user',
    'start': '20150401', # 1st April 2015
    'end':   '20190531'  # 31th May 2019
}

In [6]:
def request(articles, domain='de', **kwargs):
    """Wraps the function PageviewsClient.article_views"""
    wrapped_kwargs = params.copy()
    wrapped_kwargs.update(kwargs)
    domain = domain + '.wikipedia'
    
    # Fetch
    res = p.article_views(articles=articles, project=domain, **wrapped_kwargs)
    
    # Format results in a DataFrame
    res = pd.DataFrame(res).T
    # Replace None -> np.nan
    res = res.applymap(lambda elem: np.nan if elem is None else elem)
    # Sort by dates
    res.sort_index(inplace=True)
    
    return res

## Sample request

In [160]:
sample = request(['Selfie', 'Cat', 'Dog'])
sample

['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Selfie/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Cat/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Dog/daily/2015040100/2019053100']


Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2019-05-27,117.0,10.0,9.0
2019-05-28,105.0,17.0,8.0
2019-05-29,85.0,12.0,12.0
2019-05-30,87.0,4.0,2.0


We get some annoying missing values 😑️. Let's see what is happening:

In [8]:
mask_missing = sample.isna().any(axis=1)
sample[mask_missing]

Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-04-02,,,
2015-04-03,,,
2015-04-04,,,
2015-04-05,,,
...,...,...,...
2015-06-26,,,
2015-06-27,,,
2015-06-28,,,
2015-06-29,,,


This goes from 1st April 2015 to 30st June 2015. Let's check if changing granularity solves the problem:

In [45]:
monthly = request(['Selfie', 'Cat', 'Dog'], granularity='monthly')
monthly.head()

['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Selfie/monthly/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Cat/monthly/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia/all-access/user/Dog/monthly/2015040100/2019053100']


Unnamed: 0,Selfie,Cat,Dog
2015-04-01,,,
2015-05-01,,,
2015-06-01,,,
2015-07-01,12205.0,439.0,416.0
2015-08-01,9789.0,400.0,434.0


This does not solve the problem 😪️. Before trying to solve the problem, it's an occasion to check if daily and monthly data match:

In [10]:
# Aggregate daily data to monthly sum of views
sample_aggreg = sample.groupby(pd.Grouper(freq='M')).sum()

# The default DateTimeIndex generated when grouping data by months
# contains as days the last day of month (e.g. 31)
# We make those dates match the dates of the downloaded monthly data
sample_aggreg.index = map(lambda date: date.replace(day=1), sample_aggreg.index)

mask_not_na = (~monthly.isna().any(axis=1))

print(f'All elements are equal: {np.all(sample_aggreg[mask_not_na] == monthly[mask_not_na])}')

All elements are equal: True


## Scrape keywords

In [163]:
# Load keywords lists as defined in keywords_lists (defined on top of this notebook)
keywords = {}
for listname, filename in keywords_lists.items():
    with open(filename, 'r') as f:
        keywords[listname] = f.read().strip('\n').split('\n')

In [164]:
keywords

{'terrorism': ['Al-Qaeda',
  'Terrorism',
  'Terror',
  'Attack',
  'Iraq',
  'Afghanistan',
  'Iran',
  'Pakistan',
  'Agro',
  'Environmental terrorism',
  'Eco-terrorism',
  'Conventional weapon',
  'Weapons-grade nuclear material',
  'Dirty bomb',
  'Enriched uranium',
  'Nuclear',
  'Chemical weapon',
  'Biological agent',
  'Ammonium nitrate',
  'Improvised explosive device',
  'Abu Sayyaf',
  'Hamas',
  'Revolutionary Armed Forces of Colombia',
  'Irish Republican Army',
  'ETA (separatist group)',
  'Hezbollah',
  'Liberation Tigers of Tamil Eelam',
  'Palestine Liberation Organization',
  'Popular Front for the Liberation of Palestine',
  'Car bomb',
  'Jihad',
  'Taliban',
  'Suicide attack',
  'Suicide attack',
  'Al-Qaeda in the Arabian Peninsula',
  'Al-Qaeda in the Islamic Maghreb',
  'Tehrik-i-Taliban Pakistan',
  'Yemen',
  'Piracy',
  'Extremism',
  'Somalia',
  'Nigeria',
  'Radical politics',
  'Al-Shabaab (militant group)',
  'Nationalism',
  'Recruitment',
  'Funda

Reminder: we use the custom function `request`:

In [11]:
help(request)

Help on function request in module __main__:

request(articles, domain='de', **kwargs)
    Wraps the function PageviewsClient.article_views



In [39]:
# Redefine params
params = {
    'agent': 'user',
    'start': '20150401', # 1st April 2015
    'end':   '20190531'  # 31th May 2019
}

In [165]:
data = {
    listname: request(kwlist, 'en')
    for listname, kwlist in keywords.items()
}

['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Al-Qaeda/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Terrorism/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Terror/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Attack/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Iraq/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Afghanistan/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/Iran/daily/2015040100/2019053100', 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedi

In [170]:
terrorism = data['terrorism']
terrorism

Unnamed: 0,Al-Qaeda,Terrorism,Terror,Attack,Iraq,Afghanistan,Iran,Pakistan,Agro,Environmental_terrorism,...,Piracy,Extremism,Somalia,Nigeria,Radical_politics,Al-Shabaab_(militant_group),Nationalism,Recruitment,Fundamentalism,Islamism
2015-04-01,,,,,,,,,,,...,,,,,,,,,,
2015-04-02,,,,,,,,,,,...,,,,,,,,,,
2015-04-03,,,,,,,,,,,...,,,,,,,,,,
2015-04-04,,,,,,,,,,,...,,,,,,,,,,
2015-04-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-05-27,3713.0,2696.0,82.0,35.0,4617.0,7570.0,10199.0,11506.0,8.0,39.0,...,2052.0,349.0,5218.0,7835.0,1.0,803.0,3031.0,1007.0,573.0,635.0
2019-05-28,3913.0,2708.0,94.0,29.0,5485.0,8296.0,11980.0,15035.0,7.0,28.0,...,2342.0,381.0,4366.0,7778.0,2.0,786.0,2822.0,1073.0,598.0,589.0
2019-05-29,3686.0,2781.0,111.0,45.0,4186.0,6906.0,10257.0,11795.0,8.0,30.0,...,2095.0,392.0,3638.0,8407.0,3.0,737.0,2310.0,1081.0,571.0,602.0
2019-05-30,3797.0,2689.0,96.0,36.0,4158.0,7074.0,9196.0,11486.0,8.0,30.0,...,1996.0,365.0,3487.0,8481.0,1.0,658.0,2406.0,1051.0,630.0,641.0


In [173]:
terrorism.isna().any(axis=1)

2015-04-01     True
2015-04-02     True
2015-04-03     True
2015-04-04     True
2015-04-05     True
              ...  
2019-05-27    False
2019-05-28    False
2019-05-29    False
2019-05-30    False
2019-05-31    False
Length: 1522, dtype: bool

In [174]:
terrorism[terrorism.isna().all(axis=1)]

Unnamed: 0,Al-Qaeda,Terrorism,Terror,Attack,Iraq,Afghanistan,Iran,Pakistan,Agro,Environmental_terrorism,...,Piracy,Extremism,Somalia,Nigeria,Radical_politics,Al-Shabaab_(militant_group),Nationalism,Recruitment,Fundamentalism,Islamism
2015-04-01,,,,,,,,,,,...,,,,,,,,,,
2015-04-02,,,,,,,,,,,...,,,,,,,,,,
2015-04-03,,,,,,,,,,,...,,,,,,,,,,
2015-04-04,,,,,,,,,,,...,,,,,,,,,,
2015-04-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-26,,,,,,,,,,,...,,,,,,,,,,
2015-06-27,,,,,,,,,,,...,,,,,,,,,,
2015-06-28,,,,,,,,,,,...,,,,,,,,,,
2015-06-29,,,,,,,,,,,...,,,,,,,,,,
