In [None]:
import pandas as pd
from IPython.core.debugger import set_trace
import datetime
import requests
import time
from tqdm.notebook import tqdm

In [None]:
# The CSV has no header row; its single column holds the article titles.
english_names = pd.read_csv('data/articles.csv', header=None)
english_names.columns = ['Title']
# Swap spaces for underscores in every title.
english_names['Title'] = [name.replace(' ', '_') for name in english_names['Title']]
english_names

In [None]:
def get_qid_from_title(title, language):
    """Look up the Wikidata item ID of a Wikipedia article.

    Sends a ``pageprops`` query (with redirects enabled) to the MediaWiki API
    of the ``language`` Wikipedia edition.

    Args:
        title: Article title to look up.
        language: Wikipedia subdomain, e.g. ``'en'``.

    Returns:
        The ``wikibase_item`` value of the first page in the response, or
        ``None`` (after printing a message) when the response carries none.
    """
    # Let requests build the query string so that titles containing reserved
    # characters (&, +, #, ...) are percent-encoded instead of corrupting it.
    response = requests.get(
        f'https://{language}.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'pageprops',
            'titles': title,
            # Boolean API flag: it only needs to be present in the query.
            'redirects': '',
            'format': 'json',
        },
    )
    try:
        first_page = next(iter(response.json()['query']['pages'].values()))
        return first_page['pageprops']['wikibase_item']
    except (KeyError, StopIteration):
        # StopIteration: the response contained an empty ``pages`` mapping.
        print(f'Article {title} has no Wikidata ID')
        return None

### Retrieve wikidata entries for each given article

In [None]:
# One lookup per English title; titles without a Wikidata ID yield None.
wikidata_ids = []
for title in tqdm(english_names.Title.values):
    wikidata_ids.append(get_qid_from_title(title, 'en'))
qids = pd.DataFrame(wikidata_ids)
qids.head()

### English pageview data

In [None]:
def day_year_to_date(year, days):
    """Return the datetime of the ``days``-th day of ``year`` (day 1 is January 1st)."""
    first_of_january = datetime.datetime(year=year, month=1, day=1)
    offset = datetime.timedelta(days=days - 1)
    return first_of_january + offset

def get_wikishark_id(article_name, lang):
    """Find the wikishark ID of an article via the site's autocomplete endpoint.

    Args:
        article_name: Article title; underscores are treated as spaces when
            matching against the candidate names.
        lang: Language code; a candidate matches only if its name contains
            ``(<lang>)``.

    Returns:
        The ``id`` of the first candidate whose name contains both the
        language tag and the title (case-insensitive), or ``None`` if no
        candidate matches.
    """
    # Pass the query through ``params`` so reserved characters in the title
    # (&, +, #, ...) are percent-encoded rather than breaking the URL.
    response = requests.get('https://www.wikishark.com/autocomplete.php',
                            params={'q': article_name})

    language_tag = '(' + lang + ')'
    wanted_title = article_name.replace('_', ' ').lower()

    for candidate in response.json():
        candidate_name = candidate['name']
        if language_tag in candidate_name and wanted_title in candidate_name.lower():
            return candidate['id']
    return None

def get_daily_pageviews(titles, language, start, end):
    """Download daily page views from wikishark for a list of articles.

    Args:
        titles: Iterable of article titles.
        language: Language code forwarded to ``get_wikishark_id``.
        start: First day of the period, formatted ``'dd/mm/YYYY'``.
        end: Day at which the period stops (exclusive), formatted
            ``'dd/mm/YYYY'``.

    Returns:
        A DataFrame with one row per article that could be found: an
        ``'Article'`` column plus one column per day (keyed by ``datetime``)
        holding the integer view count. Titles without a wikishark ID are
        reported with a printed message and skipped.
    """
    # The requested period is the same for every title: parse it once.
    start_date = datetime.datetime.strptime(start, '%d/%m/%Y')
    end_date   = datetime.datetime.strptime(end, '%d/%m/%Y')

    data = []
    for title in tqdm(titles):

        # authors requested we wait 1 second at least in between requests
        time.sleep(1)

        wikishark_id = get_wikishark_id(title, language)
        if wikishark_id is None:
            print(f'Could not find data for {title}')
            continue
        # wait to avoid overloading servers
        time.sleep(1)

        # NOTE(review): the second '?' (before 'view=2') looks like it was
        # meant to be '&'; left unchanged because the effect on the server's
        # response cannot be verified from here.
        response = requests.get(f'https://www.wikishark.com/getdata/daily.php?value={wikishark_id}?view=2&scale=0&normalized=0&loglog=0&log=0&zerofix=0')

        daily_data = response.json()
        current_date = datetime.datetime.now()

        # wikishark returns daily page views for every day since 2007-12-31 (independent of given parameters)
        # we need to index it according to the time period we are interested in;
        # the last element is taken to be today's value.
        last_index = len(daily_data) - 1
        start_index = last_index - (current_date - start_date).days
        end_index = last_index - (current_date - end_date).days

        # If the series is shorter than the requested look-back, the index
        # goes negative. A negative slice bound would wrap around to the end
        # of the list and attach wrong dates, so clamp to the first available
        # element and shift the first timestamp by the same number of days.
        first_date = start_date
        if start_index < 0:
            first_date = start_date + datetime.timedelta(days=-start_index)
            start_index = 0
        end_index = max(end_index, start_index)

        timestamps = {
            first_date + datetime.timedelta(days=offset): int(views)
            for offset, views in enumerate(daily_data[start_index:end_index])
        }

        data.append({'Article': title, **timestamps})

    return pd.DataFrame(data)

In [None]:
import os

PICKLED = './privacy.pkl'

# Download only when no cached pickle exists; otherwise reuse the cache.
if not os.path.isfile(PICKLED):
    privacy_en = get_daily_pageviews(list(english_names.Title), 'en', '01/01/2011','01/01/2015')
    # use article name as index
    privacy_en.index = privacy_en.Article
    privacy_en = privacy_en.drop(columns=['Article'])
    privacy_en.to_pickle(PICKLED)
else:
    privacy_en = pd.read_pickle(PICKLED)

### Aggregate into monthly data

In [None]:
# The column labels are the per-day timestamps; make sure they are datetimes.
privacy_en.columns = pd.to_datetime(privacy_en.columns)

# take monthly cumulative
# Resample the transposed frame (dates on the index) and transpose back:
# DataFrame.resample(axis=1) is deprecated in recent pandas releases.
monthly_en = privacy_en.T.resample('M').sum().T
monthly_en.head()

# Exploratory data analysis 

In [None]:
#Melt dataframe to use dates as entry values rather than columns
# Long format: one row per (article, month); the article index is kept.
monthly_en_melt = monthly_en.melt(var_name='date', value_name='views', ignore_index=False)

### Standardizing per article

In [None]:
monthly_en_melt = monthly_en_melt.reset_index()

# Per-article mean and standard deviation of the monthly views, computed once
# instead of re-running both group-bys for every single row.
article_stats = monthly_en_melt.groupby('Article')['views'].agg(['mean', 'std'])

def standardize(x):
    """Return the z-score of a row's views relative to its own article.

    Args:
        x: A row with ``'Article'`` and ``'views'`` entries.

    Returns:
        ``(views - mean) / std`` using the article's monthly mean and
        standard deviation; a standard deviation of 0 is replaced by 1 to
        avoid dividing by zero.
    """
    mean = article_stats.loc[x['Article'], 'mean']
    std  = article_stats.loc[x['Article'], 'std']
    std = std if std != 0 else 1
    return (x['views'] - mean)/std

monthly_en_melt['standardized'] = monthly_en_melt.apply(standardize,axis=1)
monthly_en_melt = monthly_en_melt.set_index('Article')
monthly_en_melt.head()

# Quick preprocessing of articles

### Get all articles whose monthly views never reach 100

In [None]:
# Collect the articles whose views are below 100 in every month.
low_views = []
for article in monthly_en_melt.index.unique():
    article_views = monthly_en_melt.loc[article].views
    if (article_views < 100).all():
        low_views.append(article)
print(low_views)
monthly_en_melt = monthly_en_melt.drop(low_views)

### Get all articles whose standardized views deviate too much (more than 5 standard deviations in any month)

In [None]:
# Collect the articles with at least one month whose |standardized| exceeds 5.
outliers = []
for article in monthly_en_melt.index.unique():
    article_deviation = monthly_en_melt.loc[article].standardized.abs()
    if (article_deviation > 5).any():
        outliers.append(article)
print(outliers)
monthly_en_melt = monthly_en_melt.drop(outliers)

### Get all articles that were created after the reveal (no views before)

In [None]:
# NOTE(review): this cutoff is later than the reveal date used for the plots
# (2013-06-05) - confirm that 2014-01-01 is the intended bound.
before_date = '2014-01-01'

# Collect the articles without a single viewed month up to the cutoff.
created_after = []
for article in monthly_en_melt.index.unique():
    views_by_date = monthly_en_melt.loc[article][['views','date']].set_index('date')
    early_views = views_by_date.loc[:before_date].views
    if not (early_views > 0).any():
        created_after.append(article)
print(created_after)
monthly_en_melt = monthly_en_melt.drop(created_after)

## Standardized vs total views

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot, dates
import datetime

reveal = datetime.datetime(2013, 6, 5) #Around june
fig, axs = plt.subplots(ncols=2, figsize=(10,4))

# Sum over all articles per month once, then draw one panel per measure,
# each with a red line marking the reveal date.
totals_by_month = monthly_en_melt.groupby('date').sum().reset_index()
for ax, (column, panel_title) in zip(axs, [('standardized', 'Standardized'), ('views', 'Views')]):
    ax.axvline(reveal, c='r')
    ax.set_title(panel_title)
    totals_by_month.plot.scatter(x='date', y=column, ax=ax, rot=90)

plt.tight_layout()

### Plotting per article views (total)

In [None]:
plt.figure(figsize=(20,5))
# Total views per article, largest first.
views_per_article = monthly_en_melt.reset_index().groupby('Article')['views'].sum()
views_per_article.sort_values(ascending=False).plot.bar()

It looks like most of the traffic on these pages comes from the 4chan article. 4chan has been at the center of many scandals unrelated to internet privacy, which probably also means a high variance in its views. Since we can safely assume that this article tells us little about people's tendency to look for privacy-enhancing tools, we blacklist it from our study. Just to be safe, we first check the monthly views of each article.

### Plotting per article views (monthly)

In [None]:
# One panel per article, 5 per row. The number of rows follows the number of
# articles so that none is silently dropped when there are more than 50.
articles = list(monthly_en_melt.index.unique())
n_cols = 5
n_rows = max(1, -(-len(articles) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(25, 2 * n_rows), sharey=True, squeeze=False)
for article, ax in zip(articles, axs.flat):
    # List-based .loc always yields a DataFrame, even for a single-row article.
    monthly_en_melt.loc[[article], ['date','views']].plot.scatter(x='date',y='views',ax=ax,rot=90)
    ax.axvline(reveal,c='r')
    ax.set_title(article)
# Hide the panels of the last row that have no article.
for unused_ax in axs.flat[len(articles):]:
    unused_ax.set_visible(False)
plt.tight_layout()

Indeed, we decide to blacklist 4chan from our articles. On another note, even at this scale, there seems to be an immediate impact on the pageviews for DuckDuckGo.

### Removing 4chan from our articles

In [None]:
# errors='ignore' keeps this cell re-runnable: without it, a second run (or an
# earlier filter that already removed the article) raises KeyError.
monthly_en_melt = monthly_en_melt.drop('4chan', errors='ignore')

### A world without 4Chan

In [None]:
reveal = datetime.datetime(2013, 6, 5) #Around june
fig, axs = plt.subplots(ncols=2, figsize=(10,4))

# Same two panels as before, drawn from the current monthly_en_melt.
totals_by_month = monthly_en_melt.groupby('date').sum().reset_index()
panels = {'standardized': 'Standardized', 'views': 'Views'}
for ax, column in zip(axs, panels):
    ax.axvline(reveal, c='r')
    ax.set_title(panels[column])
    totals_by_month.plot.scatter(x='date', y=column, ax=ax, rot=90)

plt.tight_layout()

In [None]:
plt.figure(figsize=(20,5))
# Total views per article, largest first.
article_totals = monthly_en_melt.reset_index().groupby('Article')['views'].sum()
article_totals = article_totals.sort_values(ascending=False)
article_totals.plot.bar()

# Stratification / Discretization

Given that the distribution of total pageviews per article can be considered heavy-tailed, we probably don't want equal-width or equal-frequency discretization. Instead, we opt for clustering to divide our articles into multiple groups.

In [None]:
from sklearn.cluster import MeanShift, KMeans

# Total views per article, largest first. Only the numeric measures are
# summed, so the datetime 'date' column is never passed to sum().
sorted_views = (
    monthly_en_melt.reset_index()
    .groupby('Article')[['views', 'standardized']]
    .sum()
    .sort_values('views', ascending=False)
)
vals = sorted_views['views'].values.reshape((-1,1))

# Fixed seed (and explicit n_init) so that the group labels are the same on
# every run of the notebook.
clustering = KMeans(n_clusters=3, n_init=10, random_state=0).fit(vals)

plt.scatter(x=range(len(sorted_views)),y=sorted_views['views'], c=clustering.labels_)

# Attach each article's cluster label to its monthly rows; the assignment
# aligns on the shared 'Article' index.
sorted_views['group'] = clustering.labels_
monthly_en_melt['group'] = sorted_views['group']

## Plotting per group

In [None]:
import seaborn as sns

# Keep `reveal` a datetime, as in the other plotting cells, so that those
# cells see the same type whichever one ran last.
reveal = datetime.datetime(2013, 6, 5)

for g in monthly_en_melt.group.unique():
    fig, ax = plt.subplots()
    df = monthly_en_melt[monthly_en_melt['group'] == g].groupby('date').sum()

    # Split the monthly totals at the reveal date and fit one regression
    # line on each side.
    before = df.loc[:reveal].reset_index()
    after  = df.loc[reveal:].reset_index()
    # regplot needs a numeric x: use the integer representation of the dates.
    # An explicit 'int64' avoids the platform-dependent width of plain `int`.
    before['date'] = before['date'].astype('int64')
    after['date'] = after['date'].astype('int64')

    sns.regplot(x='date',y='views',ax=ax,data=before)
    sns.regplot(x='date',y='views',ax=ax,data=after)

    ax.set_title(f'Group {g}')