In [None]:
import pandas as pd
from IPython.core.debugger import set_trace
import datetime
import requests
from tqdm.notebook import tqdm

In [None]:
# Load the article titles to analyse. The CSV is read without a header row and
# its column is then labelled 'Title'.
# NOTE(review): assumes the file has exactly one column; the column assignment
# below fails on a length mismatch otherwise — confirm against the data file.
english_names = pd.read_csv('data/articles.csv', header=None)
english_names.columns = ['Title']
english_names

In [None]:
def get_qid_from_title(title, language, timeout=30):
    """Look up the Wikidata item ID linked to a Wikipedia article.

    Queries the API of ``{language}.wikipedia.org`` for the page properties of
    ``title`` (with redirects enabled) and returns the ``wikibase_item`` page
    property of the first page in the reply.

    Parameters
    ----------
    title : str
        Article title, e.g. ``'Tor_(anonymity_network)'``.
    language : str
        Wikipedia language subdomain, e.g. ``'en'``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    str
        Value of the page's ``wikibase_item`` property.

    Raises
    ------
    KeyError
        If the reply contains no page carrying a ``wikibase_item`` property
        (the article most likely does not exist in that language edition).
    """
    # Hand the query to requests as ``params`` so the title is URL-encoded;
    # interpolating it raw breaks on titles containing '&', '#', '+' or '?'.
    response = requests.get(
        f'https://{language}.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'pageprops',
            'titles': title,
            'redirects': 1,
            'format': 'json',
        },
        timeout=timeout,
    )
    try:
        pages = response.json()['query']['pages']
        # Only one title was requested, so the first page entry is the answer.
        first_page = next(iter(pages.values()))
        qid = first_page['pageprops']['wikibase_item']
    except (KeyError, StopIteration) as err:
        # An empty ``pages`` mapping is reported the same way as a missing key,
        # and the message names the title so a failing batch can be diagnosed.
        raise KeyError(f'Article likely not present on Wikipedia: {title!r}') from err
    return qid

### Retrieve Wikidata entries (QIDs) for each given article

In [None]:
# One lookup per title on English Wikipedia, kept in the order of the title list.
qids = pd.DataFrame(
    list(map(lambda title: get_qid_from_title(title, 'en'), english_names['Title'].values))
)
qids.head()

### English pageview data

In [None]:
def day_year_to_date(year, days):
    """Turn a 1-based day-of-year into a ``datetime.datetime``.

    ``days=1`` gives January 1st of ``year``; larger values count forward
    from that date one day at a time.
    """
    first_of_january = datetime.datetime(year, 1, 1)
    offset = datetime.timedelta(days=days - 1)
    return first_of_january + offset

def get_daily_pageviews(titles, language, years):
    """Download daily page-view counts for a list of articles.

    For every title, fetches
    ``http://petermeissner.de:8880/article/exact/{language}/{title}`` and reads
    the ``data`` list of the JSON reply. Each entry of that list carries a
    ``year`` and a comma-separated ``page_view_count`` string; the i-th count
    is dated to the i-th day of that year.

    Parameters
    ----------
    titles : iterable of str
        Article titles to fetch.
    language : str
        Language code placed in the request URL, e.g. ``'en'``.
    years : iterable of int
        Years to keep; entries for any other year are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per title: an ``Article`` column followed by one column per
        day (labelled with ``datetime.datetime`` objects) holding the integer
        view count for that day.
    """
    # Compare years as ints on both sides: the payload's ``year`` needs int()
    # before it can be used as a date, so it may well arrive as a string, in
    # which case a raw membership test against int years never matches.
    wanted_years = {int(year) for year in years}

    data = []
    for title in tqdm(titles):
        response = requests.get(f'http://petermeissner.de:8880/article/exact/{language}/{title}')
        article_data = response.json()['data']

        time_data = {'Article': title}
        for yearly_data in article_data:
            year = int(yearly_data['year'])
            if year not in wanted_years:
                continue
            view_counts = yearly_data['page_view_count'].split(',')
            # Extend the row in place instead of rebuilding the dict per year.
            time_data.update(
                (day_year_to_date(year, day), int(count))
                for day, count in enumerate(view_counts, start=1)
            )

        data.append(time_data)

    return pd.DataFrame(data)

In [None]:
# Daily English-language view counts for the two listed titles over 2011-2013,
# with the article title moved out of its column and into the row index.
privacy_en = (
    get_daily_pageviews(['Tor_(anonymity_network)', 'Tor'], 'en', [2011, 2012, 2013])
    .set_index('Article')
)
privacy_en

### Aggregate into monthly data

In [None]:
# Convert the per-day column labels to a DatetimeIndex so they can be resampled.
privacy_en.columns = pd.to_datetime(privacy_en.columns)

# Monthly totals per article: the sum of the daily counts in each calendar
# month (not a running/cumulative sum).
# ``resample(..., axis=1)`` and the 'M' alias are deprecated in recent pandas,
# so resample the transposed frame with an explicit month-end offset and
# transpose back; the result keeps articles as rows and months as columns.
monthly_en = privacy_en.T.resample(pd.offsets.MonthEnd()).sum().T
monthly_en