In [None]:
import pandas as pd
from IPython.core.debugger import set_trace
import datetime
import requests
from tqdm.notebook import tqdm

In [None]:
# Read the article titles from a header-less CSV, name the single column
# 'Title', and swap spaces for underscores so the titles can be placed in
# the request URLs built further down.
titles_raw = pd.read_csv('data/articles.csv', header=None)
titles_raw.columns = ['Title']
english_names = titles_raw.assign(
    Title=titles_raw['Title'].map(lambda name: name.replace(' ', '_'))
)
english_names

In [None]:
def get_qid_from_title(title, language, timeout=30):
    """Look up the Wikidata item ID (QID) of a Wikipedia article.

    Sends a ``action=query&prop=pageprops`` request (with redirects enabled)
    to the ``{language}.wikipedia.org`` API and reads the ``wikibase_item``
    page property of the first page in the response.

    Parameters
    ----------
    title : str
        Article title to look up.
    language : str
        Language code used as the Wikipedia subdomain, e.g. ``'en'``.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    str or None
        The ``wikibase_item`` value, or ``None`` (after printing a message)
        when the response contains no such property.
    """
    # Hand the query to requests as ``params`` so the title is URL-encoded;
    # interpolating it into the URL breaks on titles containing '&', '+', '#'.
    response = requests.get(
        f'https://{language}.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'pageprops',
            'titles': title,
            'redirects': 1,
            'format': 'json',
        },
        timeout=timeout,
    )
    try:
        pages = response.json()['query']['pages']
        # Only one title is requested, so the first page is the one we want.
        first_page = list(pages.values())[0]
        qid = first_page['pageprops']['wikibase_item']
    except (KeyError, IndexError):
        # KeyError: no 'query'/'pages'/'pageprops'/'wikibase_item' entry;
        # IndexError: the 'pages' mapping is empty.
        print(f'Article {title} has no Wikidata ID')
        return None
    return qid

### Retrieve Wikidata entries for each given article

In [None]:
# Look up the Wikidata ID of every English title (None where the lookup
# finds no ID) and collect the results in a one-column DataFrame.
qid_values = []
for title in tqdm(english_names.Title.values):
    qid_values.append(get_qid_from_title(title, 'en'))
qids = pd.DataFrame(qid_values)
qids.head()

### English pageview data

In [None]:
def day_year_to_date(year, days):
    """Convert a 1-based day-of-year into a ``datetime.datetime``.

    ``days=1`` maps to 1 January of ``year``; larger values are counted
    forward from there.
    """
    first_of_january = datetime.datetime(year=year, month=1, day=1)
    offset = datetime.timedelta(days=days - 1)
    return first_of_january + offset

def get_daily_pageviews(titles, language, years, timeout=30):
    """Download daily pageview counts for a list of articles.

    For every title one request is sent to the pageview service. A response
    whose ``status`` is ``'ok'`` carries a ``data`` list with one entry per
    year; each entry has a ``year`` and a comma-separated
    ``page_view_count`` string holding one count per day of that year.

    Parameters
    ----------
    titles : iterable of str
        Article titles, inserted as-is into the request URL.
    language : str
        Language code inserted into the request URL, e.g. ``'en'``.
    years : iterable of int
        Years to keep; data for other years is ignored.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    pandas.DataFrame
        One row per title with an ``'Article'`` column and one column per
        retrieved day (keyed by ``datetime.datetime``). A title whose data
        could not be retrieved still gets a row, with only ``'Article'``
        filled in.
    """
    # Normalise once so the filter below works whether the service reports
    # the year as an int or as a numeric string.
    wanted_years = {int(year) for year in years}

    data = []
    for title in tqdm(titles):
        # Start a fresh row for every title. Previously a failed lookup either
        # hit an unbound variable or re-appended the previous article's row.
        time_data = {'Article': title}

        response = requests.get(
            f'http://petermeissner.de:8880/article/exact/{language}/{title}',
            timeout=timeout,
        )
        payload = response.json()  # parse the body only once

        if payload.get('status') == 'ok':
            for yearly_data in payload['data']:
                year = int(yearly_data['year'])
                if year not in wanted_years:
                    continue
                view_counts = yearly_data['page_view_count'].split(',')
                # The i-th count (1-based) belongs to the i-th day of the year.
                time_data.update(
                    (day_year_to_date(year, day), int(count))
                    for day, count in enumerate(view_counts, start=1)
                )
        else:
            print(f"Could not retrieve data for {title}")

        data.append(time_data)

    return pd.DataFrame(data)

In [None]:
# Daily pageviews of the first 30 English titles for the years 2011-2014.
first_titles = english_names['Title'].head(30).tolist()
privacy_en = get_daily_pageviews(first_titles, 'en', [2011, 2012, 2013, 2014])

# Key the rows by article name: 'Article' moves from a column to the index.
privacy_en = privacy_en.set_index('Article')
privacy_en.head()

### Aggregate into monthly data

In [None]:
# The column labels are the individual days; turn them into a DatetimeIndex
# so they can be resampled.
privacy_en.columns = pd.to_datetime(privacy_en.columns)

# Monthly totals per article (a sum per calendar month, not a running total).
# Resample on the transposed frame instead of passing ``axis=1``, and use a
# MonthEnd offset object instead of the 'M' alias: both the ``axis`` argument
# and the 'M' alias are deprecated in recent pandas releases, while this form
# gives the same month-end bins on old and new versions alike.
monthly_en = privacy_en.T.resample(pd.offsets.MonthEnd()).sum().T
monthly_en.head()

# Exploratory data analysis 

In [None]:
# Reshape from wide (one column per month) to long: every (article, month)
# pair becomes a row with the month in 'date' and the count in 'views'.
# The article index is kept as-is.
monthly_en_melt = monthly_en.melt(
    var_name='date',
    value_name='views',
    ignore_index=False,
)

### Plotting the monthly total pageviews

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot, dates
import datetime

# Date marked with a red vertical line on the plots (5 June 2013).
# NOTE(review): presumably the start of the June 2013 surveillance
# disclosures this study is about -- confirm.
reveal = datetime.datetime(2013, 6, 5)

# Total views per month over all articles. Select 'views' before summing so
# the aggregation does not also try to "sum" the article-name strings.
monthly_totals = monthly_en_melt.groupby('date')['views'].sum().reset_index()

fig, ax = plt.subplots()
ax.axvline(reveal, c='r')
monthly_totals.plot.scatter(x='date', y='views', ax=ax, rot=90)
ax.set_title('Monthly total pageviews (all articles)');

### Plotting per article views (total)

In [None]:
# Total views per article over the whole period, largest first, as a bar chart.
views_per_article = monthly_en_melt.reset_index().groupby('Article')['views'].sum()
views_ranked = views_per_article.sort_values(ascending=False)
views_ranked.plot.bar()

It looks like most of the traffic on these pages comes from the 4chan article. 4chan has been at the center of many scandals unrelated to internet privacy, so its views probably also have a high variance. Since we can safely assume that this article tells us little about people's tendency to look for privacy-enhancing tools, we blacklist it from our study. Just to be safe, we first check the monthly views of each article.

### Plotting per article views (monthly)

In [None]:
# One scatter panel per article (6 x 5 grid, shared y-axis) showing its
# monthly views, with the reveal date drawn as a red vertical line.
articles = monthly_en_melt.index.unique()
fig, axs = plt.subplots(nrows=6, ncols=5, figsize=(20, 20), sharey=True)
for ax, article in zip(axs.flat, articles):
    article_views = monthly_en_melt.loc[article][['date', 'views']].reset_index()
    article_views.plot.scatter(x='date', y='views', ax=ax, rot=90)
    ax.axvline(reveal, c='r')
    ax.set_title(article)
plt.tight_layout()

The monthly plots confirm this, so we blacklist 4chan from our articles. On another note, even at this scale, there seems to be an immediate impact on the pageviews for DuckDuckGo.

### Removing 4chan from our articles

In [None]:
# Remove every row of the '4chan' article. ``errors='ignore'`` keeps the cell
# re-runnable: once the rows are gone, a second run is a no-op instead of a
# KeyError.
monthly_en_melt = monthly_en_melt.drop(index='4chan', errors='ignore')

### A world without 4Chan

In [None]:
# Total views per month over the articles still in ``monthly_en_melt``.
# Select 'views' before summing so the aggregation does not also try to
# "sum" the article-name strings.
monthly_totals = monthly_en_melt.groupby('date')['views'].sum().reset_index()

fig, ax = plt.subplots()
ax.axvline(reveal, c='r')  # red line at the reveal date
monthly_totals.plot.scatter(x='date', y='views', ax=ax, rot=90)
ax.set_title('Monthly total pageviews');

In [None]:
# One scatter panel per article (6 x 5 grid, shared y-axis) showing its
# monthly views, with the reveal date drawn as a red vertical line.
articles = list(monthly_en_melt.index.unique())
fig, axs = plt.subplots(6, 5, figsize=(20, 20), sharey=True)
panels = list(axs.flat)
for article, ax in zip(articles, panels):
    # ``.loc[[article], ...]`` always returns a DataFrame, even if an article
    # has only a single row.
    article_views = monthly_en_melt.loc[[article], ['date', 'views']].reset_index()
    article_views.plot.scatter(x='date', y='views', ax=ax, rot=90)
    ax.axvline(reveal, c='r')
    ax.set_title(article)
# The grid size is fixed, so blank any panel that did not receive an article
# instead of leaving an empty frame in the figure.
for unused_ax in panels[len(articles):]:
    unused_ax.set_axis_off()
plt.tight_layout()

In [None]:
# Total views per article over the whole period, largest first, as a bar chart.
totals_by_article = monthly_en_melt.reset_index().groupby('Article')['views'].sum()
totals_ranked = totals_by_article.sort_values(ascending=False)
totals_ranked.plot.bar()