In [None]:
import pandas as pd
import numpy as np
import datetime
import requests
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime
import time
import requests
import time
from typing import List
import matplotlib.ticker as mticker
import matplotlib.dates as mdates
from tqdm.notebook import tqdm
from sklearn.cluster import MeanShift, KMeans
from IPython.core.debugger import set_trace
from matplotlib import pyplot, dates

# Loading Data

We need data to analyze. From our milestone we can infer that we will need to be able to get pageviews from both specific articles on Wikipedia, as well as the aggregated pageviews for an entire language project.

### Load our generated list of articles
For our analysis we have aggregated a set of articles representing "Internet software privacy" that we need to analyze. We start out by loading that list of articles:

In [None]:
# read the curated list of article titles (one title per row, no header line)
english_names = pd.read_csv('data/articles.csv', names=['Title'], header=None)

# the APIs address articles with underscores instead of spaces
english_names['Title'] = [title.replace(' ', '_') for title in english_names['Title']]
english_names.head()

### Retrieve wikidata entries for each given article
Now we proceed by getting the wikidata IDs for each article in our list. We need these so that we can retrieve the equivalent articles in other language projects of Wikipedia.

In [None]:
def get_qid_from_title(title: str, language: str) -> str:
    """
    Gets the Wikidata ID for the given article on the given language project.
    
    :param title: the article title to retrieve qid for
    :param language: the string representing the language project the article title comes from
    
    :returns: the Wikidata ID for the article, or None (after printing a notice)
              when the response carries no Wikidata item for it
    """
    # let requests build the query string so that titles containing characters
    # such as '&', '+' or '#' are percent-encoded instead of corrupting the URL
    response = requests.get(
        f'https://{language}.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'pageprops',
            'titles': title,
            'redirects': 1,
            'format': 'json',
        },
        # never hang forever on a stalled connection
        timeout=30,
    )
    try:
        pages = response.json()['query']['pages']
        # a single title is requested, so the first page entry is the one we want
        page = next(iter(pages.values()))
        qid = page['pageprops']['wikibase_item']
    except (KeyError, StopIteration):
        # missing keys or an empty page list both mean: no Wikidata item found
        print(f'Article {title} has no Wikidata ID')
        return None
    return qid

In [None]:
# look up the Wikidata ID of every English article (one API request per title)
qids = pd.DataFrame(
    [get_qid_from_title(name, language='en') for name in tqdm(english_names['Title'].values)]
)
qids.head()

### Retrieve English pageview data 
We now utilize https://www.wikishark.com/ to retrieve the actual page view data for each of the articles on the English language Wikipedia project. We have contacted the developer of this website to grant us the permission to call their backends that we found by doing some 1337 hacking. We are allowed to do so for the "Internet privacy software"-related articles, if we wait 1-2 seconds in between each request.

In [None]:
def day_year_to_date(year: int, days: int) -> datetime.datetime:
    """
    Converts a (year, day-of-year) pair into the matching datetime.
    
    :param year: the calendar year the date belongs to
    :param days: the 1-based day count within that year (1 is January 1st)
    
    :returns: datetime object (at midnight) for the requested day
    """
    first_of_january = datetime.datetime(year=year, month=1, day=1)
    # day 1 is January 1st itself, hence the offset of one
    elapsed = datetime.timedelta(days=days - 1)
    return first_of_january + elapsed

def get_wikishark_id(article_name: str, language: str) -> str:
    """
    Gives the wikishark internal ID for the given article and language project.
    
    The autocomplete endpoint is queried with the article name and the first
    candidate whose name contains both '(<language>)' and the article title
    (case-insensitive, underscores read as spaces) is used.
    
    :param article_name: the article title for which the internal ID is requested.
    :param language: the string representing the language project the article title comes from
    
    :returns: the wikishark ID of the first matching candidate, or None if no candidate matches
    """
    # pass the title via `params` so special characters ('&', '+', '#', ...)
    # are percent-encoded rather than breaking the query string
    response = requests.get(
        'https://www.wikishark.com/autocomplete.php',
        params={'q': article_name},
        timeout=30,
    )
    candidates = response.json()
    
    language_tag = '(' + language + ')'
    wanted_title = article_name.replace('_', ' ').lower()
    
    # NOTE(review): this is a substring match, so a longer title that merely
    # contains the requested one can be picked up instead of the exact article
    for candidate in candidates:
        name = candidate['name']
        if language_tag in name and wanted_title in name.lower():
            return candidate['id']
    return None

def get_daily_pageviews(titles: List[str], language: str, start: str, end: str) -> pd.DataFrame:
    """
    Retrieves the daily page views for the given articles on the given language edition of Wikipedia.
    Filtering is also performed to only return the page views for the given time span.
    
    :param titles: the list of articles to retrieve page views for
    :param language: the string representing the language project the article titles come from
    :param start: first day of the time span, formatted as 'dd/mm/YYYY' (inclusive)
    :param end: last day of the time span, formatted as 'dd/mm/YYYY' (exclusive)
    
    :returns: DataFrame with one row per article that could be resolved: an 'Article'
              column plus one column per day of the time span holding the page views
    """
    # the requested time span is the same for every title: parse it only once
    start_date = datetime.datetime.strptime(start, '%d/%m/%Y')
    end_date   = datetime.datetime.strptime(end, '%d/%m/%Y')
    
    data = []
    for title in tqdm(titles):
        
        # developer requested we wait 1 second at least in between requests
        time.sleep(1)
        
        # retrieve wikishark ID for article
        wikishark_id = get_wikishark_id(title, language)
        if wikishark_id is None:
            print(f'Could not find data for {title}')
            continue
        
        # wait to avoid overloading servers
        time.sleep(1)
        
        # retrieve page views for given title
        # NOTE(review): the second '?' in this URL looks like it should be '&';
        # left untouched because the server's behaviour with a corrected query is unverified
        response = requests.get(f'https://www.wikishark.com/getdata/daily.php?value={wikishark_id}?view=2&scale=0&normalized=0&loglog=0&log=0&zerofix=0')
        daily_data = response.json()
        
        # wikishark returns daily page views for every day since 2007-12-31 (independent of given parameters)
        # the last entry is taken to be today, so positions are counted back from the end of the list
        current_date = datetime.datetime.now()
        last_index = len(daily_data) - 1
        start_index = last_index - (current_date - start_date).days
        end_index = last_index - (current_date - end_date).days
        
        # a series shorter than expected would give negative indices, which Python
        # slicing silently wraps around to the end of the list; clamp them instead
        # and shift the timestamps by the number of days that had to be skipped
        first_index = max(start_index, 0)
        stop_index = max(end_index, 0)
        skipped_days = first_index - start_index
        
        # add page views for each day
        timestamps = {}
        for i, d in enumerate(daily_data[first_index:stop_index]):
            ts = start_date + datetime.timedelta(days=skipped_days + i)
            timestamps[ts] = int(d)
    
        # add data for given article to collection
        data.append({'Article': title, **timestamps})
    
    return pd.DataFrame(data)

For this next cell we have made it possible to either regenerate the data set yourself or just load it from the pickled object. Given that we have to wait 1 second in between requests to the wikishark API, we suggest loading the pickled object.

In [None]:
import os

PICKLED = './privacy.pkl'

if not os.path.isfile(PICKLED):
    # no cached copy in the root of the directory: rebuild the data from scratch
    privacy_en = get_daily_pageviews(english_names.Title.tolist(), 'en', '01/01/2011', '01/01/2016')
    
    # the article name becomes the index and stops being a regular column
    privacy_en = privacy_en.set_index('Article')
    
    # cache the result so the slow download does not have to be repeated
    privacy_en.to_pickle(PICKLED)
else:
    # reuse the cached data (only load pickle files you created yourself)
    privacy_en = pd.read_pickle(PICKLED)

### Aggregate into monthly data
Our data from above comes in a daily resolution. Our analysis will be on a monthly basis so we need to aggregate it:

In [None]:
# make columns datetime objects for resampling to work
privacy_en.columns = pd.to_datetime(privacy_en.columns)

# take monthly cumulative
# `resample(..., axis=1)` is deprecated in recent pandas releases, so the frame is
# transposed (dates become the index), resampled per month, and transposed back
monthly_en = privacy_en.T.resample('M').sum().T
monthly_en.head()

# Exploratory data analysis 
We now perform an exploratory data analysis to identify any issues with the data we've generated so far. This is different from the approach taken in the original paper, where issues with outliers were identified and corrected for in another iteration of the analysis.

We start by melting our data so that it is easier to work with:

In [None]:
# go from wide (one column per month) to long format: one row per (article, month),
# keeping the article name as index
monthly_en_melt = monthly_en.melt(var_name='date', value_name='views', ignore_index=False)
monthly_en_melt.head()

Let's have a quick look at the summed data:

In [None]:
# total views per month over all articles; only the numeric `views` column is
# aggregated so the article titles are not dragged through the sum
monthly_en_melt.groupby('date')['views'].sum().reset_index().plot.scatter(x='date', y='views')

# Quick preprocessing of articles

Since we have a lot of articles, we want to first remove articles that are of no interest to us, before we can visualize the total dataset correctly

### Get all articles that have a very low amount of views no matter what

In [None]:
# collect the articles whose monthly views stay below 100 in every single month
low_views = []
for article in monthly_en_melt.index.unique():
    article_views = monthly_en_melt.loc[article].views
    if (article_views < 100).all():
        low_views.append(article)

print(low_views)
monthly_en_melt = monthly_en_melt.drop(low_views)

### Get all articles that may be outliers

In the same way the original paper found outliers like the Hamas article, we want to remove from our dataset sudden changes in pageviews that are not part of our signal.
Since the monthly views of an article don't follow a Gaussian distribution, we use the interquartile range (IQR) to determine if an article is an outlier rather than just using the three-standard-deviations rule. If the views in a month are higher than the 75th percentile + 2 times the IQR, or lower than the 25th percentile - 2 times the IQR (the default `factor=2` in the code below), we consider the article for manual inspection.

In [None]:
def _is_outlier(article, factor=2):
    """
    Tells whether any monthly view count of the given article lies outside
    [q25 - factor * IQR, q75 + factor * IQR], using the views stored for that
    article in the notebook-level `monthly_en_melt` dataframe.
    """
    views = monthly_en_melt.loc[article].views
    q25, q75 = np.percentile(views, [25, 75])
    margin = (q75 - q25) * factor
    too_low = views < q25 - margin
    too_high = views > q75 + margin
    return (too_low | too_high).any()

# keep every article flagged by the IQR rule, in order of first appearance
outliers = list(filter(_is_outlier, monthly_en_melt.index.unique()))
print(f'Number of outliers identified: {len(outliers)}')
print(f'Titles: {outliers}')

## Manual inspection

In order to verify that our potential outliers should be removed, we manually check each of these articles, the news related to it, to find out if there is an obvious reason for the sudden change in views, and if this reason is related to our signal: privacy enhancing technologies.

The articles we suspect as outliers are:
- **I2P**: Spike happens around the time Silkroad (drug network) moves from tor to I2p
- **IMule**: High variance all around, low views, bump in 2012 probably related to the bump in use of i2p in 2012
- **Operation onymous**: Spikes around the time of operation, an international law enforcement operation targeting darknet markets and other hidden services operating on the Tor network.
- **Tor**: Spike happened when SilkRoad was shut down in october 2013, spike in views was quite large and somewhat unrelated.
- **Bitblinder**: Spike happens in  March 2011 when researchers documented an attack that is capable of revealing the IP addresses of BitTorrent users on the Tor network. 
- **2channel** and **4chan**: Spikes happen in Sep 2015, when the founder of 4chan, Christopher Poole, formally announced on 21 September 2015 that he had sold the website to the founder of 2channel, Hiroyuki Nishimura.
- **Bitmessage**: Spike happened in Sep 2014 when there were tons of messages being delayed or missing, and also there was word that the network is being attacked by spam. The next update was released then in October 2014 to address these problems.
- **Confide**: The pageviews we got from wikishark don't correspond to the article about the Confide app, but another article with the same name (disambiguation is not working very well on wikishark).

## Outlier removal
We remove the aforementioned articles from our analysis:

In [None]:
# articles rejected after manual inspection
# NOTE(review): 'Bitblinder' is discussed in the text above but is not part of this list - confirm
manual_outliers = ['4chan','I2P', 'IMule', 'Operation_Onymous', 'Tor_(anonymity_network)', '2channel', 'Bitmessage', 'Confide']
monthly_en_melt = monthly_en_melt.drop(index=manual_outliers)

# Finally some data visualization
We produce a set of plots to get more of a feel of our data.

### Plotting per article views (total)
This will let us see the page view contribution of each article to the overall signal we are analyzing:

In [None]:
# total views per article over the whole period, largest first
most_views = (
    monthly_en_melt
    .reset_index()
    .groupby('Article')['views']
    .sum()
    .sort_values(ascending=False)
)

# titles of the five most viewed articles, reused by the plots further down
head = most_views.index[:5].tolist()

# bar chart of the total views of every article
plt.figure(figsize=(20, 5))
most_views.plot(kind='bar')
plt.ylabel("Total pageviews")
plt.title("Total pageviews per article")
plt.show()

Now that we have removed all those outlier articles, we have a look at what is the real distribution of pageviews for articles that should represent privacy enhancing techniques. We remark that there is still a high gap in the pageviews, for example, the "Pretty Good Privacy" article represents a good percentage of the mass of the distribution.

It might look like this tail is quite long, but this is actually because a decent number of articles were created during the time period of our analysis, and thus don't accumulate as many total views.

## Plotting total and head 5 articles pageviews
To further investigate just how much the top five articles make up of the signal, we create a plot comparing these two:

In [None]:
# date of reveal
reveal = datetime.datetime(2013, 6, 3)
fig, ax = plt.subplots(sharey=True, figsize=(10,5))

# plot cumulative page views for all the articles in our analysis
# (only the numeric `views` column is summed, not the article titles)
monthly_en_melt.groupby('date')['views'].sum().reset_index().plot.scatter(x='date',y='views',ax=ax,label='All articles')

# plot cumulative page views for the top five contributors of our signal
monthly_en_melt.loc[head].groupby('date')['views'].sum().reset_index().plot.scatter(x='date',y='views',ax=ax,c='r',label='Top 5 articles')

# mark the date of the reveal
plt.axvline(reveal,c='r')
plt.title("Total pageviews for privacy-enhancing technology articles")
plt.show()

There seems to be a lot of variance even before the reveal date, with a peak at the reveal date rather than an effect due to the reveal. We see that the top 5 articles have a very high effect on the trend.

Something interesting to note is that in the long term, there seems to be a reversing trend : the pageviews tend to go down until 2016.

## Plotting the pageviews of articles with the most weight

Since the first 5 articles are responsible for a good portion of the trend of the data, we investigate further on these.

In [None]:
# one panel per top-5 article, all sharing the same y axis
fig, axs = plt.subplots(ncols=5, sharey=True, figsize=(30, 5))
for position, ax in enumerate(axs.flatten()):
    title = head[position]
    monthly_en_melt.loc[title].plot.scatter(x='date', y='views', ax=ax)
    # red line marks the reveal date
    ax.axvline(reveal, c='r')
    ax.set_title(title)
plt.suptitle('Article pageviews for top 5 articles (with respect to total views)')
plt.show()

For the DuckDuckGo article, we are able to see quite an important direct impact on the pageviews, starting from the reveal, however, long term it doesn't seem to affect the trend.

For the PGP article, there might be a little bit of an impact, but since there was already a trend of increase it is hard to say.

For the SOCKS, PeerGuardian and Freegate articles we can't identify any lasting trends in views from these visualizations which could be attributed to the reveal.

# Segmented Regression Analysis
Alright, it is time for us to get into the meat and bones of this analysis. We perform a segmented regression analysis on our subset of articles, just like in the original paper we were assigned:

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# total views per month over all remaining articles
df = monthly_en_melt.groupby('date').sum()

# split the series at the reveal date
before = df.loc[:reveal].reset_index()
after = df.loc[reveal:].reset_index()

# regplot needs numeric x values, so dates are converted to matplotlib date numbers
before['date'] = mdates.date2num(before['date'])
after['date'] = mdates.date2num(after['date'])

# each regplot call draws the scatter and the fitted line of one segment
sns.regplot(data=before, x='date', y='views', label='before', ax=ax)
sns.regplot(data=after, x='date', y='views', label='after', ax=ax)

# mark reveal
ax.axvline(reveal, c='r')
ax.legend()
ax.set_title("Total pageviews for privacy-enhancing technology articles")

# show the numeric x axis as dates again
loc = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(loc))

We observe a sudden impact in the following few months, but the long-term trend seems to be only slightly impacted by the reveal. It even looks like the usual trend of growth becomes more stable starting from the reveal. It is interesting to compare this to the global Wikipedia trend to see how the two differ.

# Statsmodel Regression Fit

We perform a regression analysis and interpret the coefficients of our model here. We make use of the following model in our analysis (like in the original paper):

$$
Y_t = \beta_0 + \beta_1 \text{time} + \beta_2 \text{intervention} + \beta_3 \text{postslope} + \epsilon
$$

We start out by preparing a dataframe to have the features of the above-mentioned model:

In [None]:
# dataframe containing features required for regression analysis
views = pd.concat([before, after]).views
monthly_views = pd.DataFrame({'views': views.to_numpy(), 'month': np.arange(len(views))})

# position of the last month before the reveal; derived from the before/after
# split instead of being hard-coded, so it stays correct whatever time span
# the page views were downloaded for
event = len(before) - 1

# provide feature indicating whether event had occurred yet (0 up to `event`, 1 afterwards)
monthly_views['intervention'] = (monthly_views['month'] > event).astype(int)

# provide feature indicating time-delta from event (0 up to `event`, then 1, 2, 3, ...)
monthly_views['postslope'] = np.maximum(monthly_views['month'] - event, 0)
monthly_views.head()

In [None]:
# ordinary least squares fit of the segmented regression model
reg = smf.ols(formula='views ~ month + C(intervention) + postslope', data=monthly_views)
res = reg.fit()
print(res.summary())

# Comparison with the global wikipedia pageviews
We will now compare our results with global trends on wikipedia to determine whether there is a difference in trends compared to the global trends seen on wikipedia.

## Segmented Regression analysis
We perform the same analysis as above, using the global page view data. To do so we start out by extracting the global page view data for the English language project. To do so we use the wikimedia API providing these statistics:

In [None]:
def get_pagecounts(language: str, start: str, end: str) -> pd.DataFrame:
    """
    Retrieve the monthly page view aggregates for the given wikipedia language project in the 
    specified time span.
    
    :param language: the string representing the language project to retrieve page views for
    :param start: the string representing the starting date of the requested time span
    :param end: the string representing the ending date of the requested time span
    
    :returns: DataFrame with a 'Timestamp' (datetime) and a 'Page views' column,
              sorted by timestamp; empty (but with both columns) if no items are returned
    :raises requests.HTTPError: if the API answers with an error status
    """
    r = requests.get(
        f'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{language}.wikipedia/all-sites/monthly/{start}/{end}',
        timeout=30,
    )
    # fail with the HTTP error itself rather than a KeyError on 'items' below
    r.raise_for_status()
    data = r.json()['items']
    
    # one row per month, with a datetime object for ease-of-use
    rows = [
        {
            'Timestamp': datetime.datetime.strptime(month['timestamp'], '%Y%m%d%H'),
            'Page views': month['count'],
        }
        for month in data
    ]
    
    # naming the columns explicitly keeps sort_values working when `rows` is empty
    pagecounts = pd.DataFrame(rows, columns=['Timestamp', 'Page views'])
    return pagecounts.sort_values(by='Timestamp')

# monthly page counts of the whole English Wikipedia
monthly_en_all = get_pagecounts(language='en', start='2011063000', end='2015063000')
monthly_en_all.head()

Now we shall create the same plot as above:

In [None]:
event = datetime.datetime(2013, 6, 3)

# seaborn needs numeric x values: turn the timestamps into matplotlib date numbers
monthly_en_all['Timestamp'] = mdates.date2num(monthly_en_all['Timestamp'])

# label every month as lying before or after the event, one regression line per label
monthly_en_all['Intervention'] = np.where(
    monthly_en_all['Timestamp'] > mdates.date2num(event), 'After', 'Before'
)

# scatter plus fitted line for both time periods
g = sns.lmplot(data=monthly_en_all, x='Timestamp', y='Page views', hue='Intervention')

# show the tick positions as readable dates instead of date numbers
labels = pd.Series(g.ax.get_xticks()).map(lambda tick: mdates.num2date(tick).strftime('%Y-%m-%d')).fillna('')
_ = g.set_xticklabels(labels, '', rotation=90)
_ = g.ax.set_xlabel('Date')

We can observe that there seems to be a very wide confidence interval for the regression line after the event. In general: it would seem as though the page views on Wikipedia are trending upward overall. To make a more quantitative analysis we will need to analyze the underlying model of our plot:

## StatsModel regression fit
We use `statsmodels` to further analyze our fitted model:

In [None]:
views = monthly_en_all['Page views']
monthly_views_all = pd.DataFrame({'views': views.to_numpy(), 'month': np.arange(len(views))})

# position of the last month before the event; taken from the 'Intervention'
# labels of `monthly_en_all` (rows are sorted by timestamp, so all 'Before'
# rows come first) so that the regression uses exactly the same split as the plot
event = int((monthly_en_all['Intervention'] == 'Before').sum()) - 1

# provide feature indicating whether event had occurred yet (0 up to `event`, 1 afterwards)
monthly_views_all['intervention'] = (monthly_views_all['month'] > event).astype(int)

# provide feature indicating time-delta from event (0 up to `event`, then 1, 2, 3, ...)
monthly_views_all['postslope'] = np.maximum(monthly_views_all['month'] - event, 0)
monthly_views_all.head()

In [None]:
# fit the same segmented regression model on the global page views
reg = smf.ols(formula='views ~ month + C(intervention) + postslope', data=monthly_views_all)
res = reg.fit()
print(res.summary())

From these results we must immediately observe that our model doesn't fit the data very well. The $R^2$ is outside the range of values we would have liked to see. If we compare our results with the results achieved in the paper, we see that we attain completely different coefficients from the original author. To validate our data we changed the time span of our analysis temporarily to be the same as in the original study, and we suddenly found a much better model. This raises some concerns about the conclusions of the original study - we would have liked to see an analysis over a longer time span than was performed there.

The p-value for the coefficient fitted to the intervention variable is quite high, i.e. the probability of this variable not having any impact on the page views is almost 50%. Furthermore, the postslope also raises questions, as the p-value for this coefficient is also quite high.

# Comparison with the same articles in different languages : French

# Other Experiments :

We have tried various other ways for dealing with data with a high difference in pageviews, i.e. to not fit our whole hypothesis on the 3 articles that have the most views. The two following experiments didn't lead to significant conclusions but we leave them here for completeness.

## First Experiment : Standardizing per article

In this part, we standardize the views of each article, thus giving the same weight to all articles.

In [None]:
def standardize(x, stats=None):
    """
    Standardizes the views of one row with the mean and standard deviation of
    the views of the article that row belongs to.
    
    :param x: a row providing at least the 'Article' and 'views' entries
    :param stats: optional DataFrame indexed by article with 'mean' and 'std' columns;
                  when omitted it is computed from the notebook-level `monthly_en_melt`
    
    :returns: (views - mean) / std, where a std of 0 is replaced by 1
    """
    if stats is None:
        # fallback for direct calls: aggregate the views column only
        stats = monthly_en_melt.reset_index().groupby('Article')['views'].agg(['mean', 'std'])
    mean = stats.loc[x['Article'], 'mean']
    std  = stats.loc[x['Article'], 'std']
    # constant series: avoid dividing by zero
    std = std if std != 0 else 1
    return (x['views'] - mean)/std

monthly_en_melt = monthly_en_melt.reset_index()

# compute the per-article statistics once instead of once per row
article_stats = monthly_en_melt.groupby('Article')['views'].agg(['mean', 'std'])
monthly_en_melt['standardized'] = monthly_en_melt.apply(lambda row: standardize(row, article_stats), axis=1)
monthly_en_melt = monthly_en_melt.set_index('Article')
monthly_en_melt.head()

### Standardized vs total views

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(10, 4))

# mark the reveal date and name both panels
for panel, caption in zip(axs, ('Standardized', 'Views')):
    panel.axvline(reveal, c='r')
    panel.set_title(caption)

# monthly sums over all articles, shared by both panels
monthly_sums = monthly_en_melt.groupby('date').sum().reset_index()
monthly_sums.plot.scatter(x='date', y='standardized', ax=axs[0], rot=90)
monthly_sums.plot.scatter(x='date', y='views', ax=axs[1], rot=90)

plt.tight_layout()

Given that we have a large number of articles, even after our quick preprocessing we still have too many articles with a low number of views. These inherently have more variance, so it is unfair to give as much weight to these articles as to the other ones.

## Second Experiment : Stratification

Instead of analyzing a single group of articles that have highly different views, we can select groups of articles that have similar viewcounts, and analyze them together.
Given that our distribution of total pageviews per articles can be considered as heavy-tailed, we probably don't want equal width or equal frequency discretization. Instead we opt for clustering to divide our articles into multiple groups, by creating clusters based on the total amount of views.

In [None]:
# total per article, largest first; numeric_only keeps the datetime `date` column
# out of the sum (summing it raises a TypeError on recent pandas versions)
sorted_views = monthly_en_melt.reset_index().groupby('Article').sum(numeric_only=True).sort_values('views',ascending=False)
vals = np.log(sorted_views['views'].values.reshape((-1,1))) #Use log scales values because they are too far away

### Selecting number of clusters

Since we don't know exactly how many groups we want to define, we will use the silhouette score and the elbow method to help us.

In [None]:
def plot_sse(features_X, start=2, end=11):
    """
    Plots the k-means sum of squared errors (inertia) for every k in [start, end).
    
    :param features_X: the samples to cluster
    :param start: smallest number of clusters to try
    :param end: one past the largest number of clusters to try
    """
    # one k-means fit (fixed seed) per candidate k
    sse = pd.DataFrame([
        {"k": k, "sse": KMeans(n_clusters=k, random_state=10).fit(features_X).inertia_}
        for k in range(start, end)
    ])
    
    # elbow curve
    plt.plot(sse.k, sse.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")
    
plot_sse(features_X=vals)

In [None]:
from sklearn.metrics import silhouette_score


def _silhouette_for(k):
    """Clusters `vals` into k groups (fixed seed) and returns the silhouette score."""
    assignments = KMeans(n_clusters=k, random_state=10).fit_predict(vals)
    return silhouette_score(vals, assignments)


# silhouette score for every candidate number of clusters
silhouettes = pd.DataFrame([{"k": k, "score": _silhouette_for(k)} for k in range(2, 11)])

# score as a function of k
plt.plot(silhouettes.k, silhouettes.score)
plt.xlabel("K")
plt.ylabel("Silhouette score")

Given that there is no particular elbow, we take the value with the best silhouette score, K = 5.

In [None]:
NUM_CLUSTERS = 5

# same fixed seed as in the model-selection cells, so the cluster numbering
# (and therefore the 'group' column) is reproducible between runs
clustering = KMeans(n_clusters=NUM_CLUSTERS, random_state=10).fit(vals)

# articles ordered by total views, coloured by the cluster they were assigned to
plt.figure(figsize=(20,5))
plt.scatter(x=range(len(sorted_views)),y=sorted_views['views'], c=clustering.labels_)
plt.gca().set_yscale('log')
plt.xlabel('Article')
plt.ylabel('Views')
plt.gca().set_xticks(range(len(sorted_views)))
plt.gca().set_xticklabels(sorted_views.index, rotation=90)

# store the cluster of each article; the assignment to monthly_en_melt aligns on the article index
sorted_views['group'] = clustering.labels_
monthly_en_melt['group'] = sorted_views['group']

## Plotting per group

In [None]:
reveal = datetime.date(2013,6,5)
fig, axs = plt.subplots(nrows=NUM_CLUSTERS, figsize=(10,20))

for g,ax in zip(range(len(monthly_en_melt.group.unique())),axs.flatten()):
    # select the articles of this group and sum their views per month
    selected = monthly_en_melt[monthly_en_melt['group'] == g]
    df = selected.groupby('date').sum()

    # split at the reveal date and convert dates to numbers so regplot can fit a line
    before = df.loc[:reveal].reset_index()
    after  = df.loc[reveal:].reset_index()
    before.date = mdates.date2num(before.date)
    after.date  = mdates.date2num(after.date)
    
    sns.regplot(x='date',y='views',ax=ax,data=before)
    sns.regplot(x='date',y='views',ax=ax,data=after)
    
    # show the numeric x axis as dates again
    loc = mdates.AutoDateLocator()
    ax.xaxis.set_major_locator(loc)
    ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(loc))
    
    # title the panel with the three most viewed articles of the group; only the
    # `views` column is summed because the datetime `date` column cannot be summed
    sample = selected.reset_index().groupby('Article')['views'].sum().sort_values(ascending=False).head(3).index
    sample_str = ','.join(sample)
    ax.set_title(f'Group {g} ({sample_str})')
plt.tight_layout()

Unfortunately, we are not very comfortable interpreting these results: they show many different trends, so we would have to go through each individual article and verify what causes its trend. We therefore also stopped this analysis here. Another issue we thought of is that the clusters we would find on other language editions of Wikipedia would be different, so it would be hard to compare a general trend given that we find no general trend here.