In [3]:
import requests
import json
import pandas as pd

# Get security-related articles pageviews

In [16]:
ratings = pd.read_csv('data/articles_ratings.csv', index_col=0)
ratings.head()

Unnamed: 0_level_0,Rating
Article,Unnamed: 1_level_1
Datenschutzrecht,4.23
Richtlinie 95/46/EG (Datenschutzrichtlinie),4.08
Datenschutz,4.08
Datenschutzerklärung,4.0
Datenschutz-Grundverordnung,3.92


In [17]:
all_data = []
for article in ratings.index:
    rating = ratings.loc[article, 'Rating']
    url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia.org/all-access/user/{}/monthly/20141001/20171031'.format(article)
    r = requests.get(url)
    if r.status_code == 200:
        data = r.json()
        article_data = data['items']
        for item in article_data:
            item['rating'] = rating
        all_data += article_data
    else:
        print('Problem with article "{}".'.format(article))


with open('data/pageviews.json', 'w') as f:
    json.dump(all_data, f)

Problem with article "Richtlinie 95/46/EG (Datenschutzrichtlinie)".
Problem with article "Google LLC".
Problem with article "Agentur der Europäischen Union für Cybersicherheit".
Problem with article "GAFAM".
Problem with article "Brüssel-Effekt".
Problem with article "Meme (Kulturphänomen)".
Problem with article "Verordnung (EU) Nr. 910/2014 (eIDAS-Verordnung)".
Problem with article "Royal Assent".
Problem with article "Tronc (Unternehmen)".
Problem with article "NOYB".


# Get best article for year 2015 from monthly best

In [50]:
n_best = 50

In [71]:
def get_best_articles(df, year, n_best):

    # Sort by number of views
    df.sort_values(by=['Views'], ascending=False, inplace=True)

    display(df.head(5))

    best = df.iloc[:n_best].index
    print("The {} most viewed articles in {}:".format(n_best, year))
    print(list(best))
    
    return best

def get_best_articles_views(articles, year):

    # mtn j'ai les articles, faut récupérer leur pageviews par mois et les sommer.
    all_data = []
    for article in articles:
        url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia.org/all-access/user/{}/monthly/{}0101/{}1231'.format(article, year, year)
        r = requests.get(url)
        if r.status_code == 200:
            data = r.json()
            article_data = data['items']
            all_data += article_data
        else:
            print('Problem with article "{}".'.format(article))

    with open('data/topviews-{}.json'.format(year), 'w') as f:
        json.dump(all_data, f)

In [72]:
df_2015 = pd.DataFrame(columns=['Page', 'Views'])
for i in range(7, 13):
    df_temp = pd.read_csv('top_data/topviews-2015_{:02d}.csv'.format(month))
    df_temp = df_temp.loc[:,['Page', 'Views']]
    df_2015 = df_2015.append(df_temp, ignore_index=True)

display(df_2015.sample(5))

# sum all month per article
df_2015_sum = df_2015.groupby('Page').sum()

best_2015 = get_best_articles(df_2015_sum, 2015, n_best)
get_best_articles_views(best_2015, 2015)

Unnamed: 0,Page,Views
4275,Jared Leto,61074
3729,The Big Bang Theory/Episodenliste,43986
3428,Cookie,54950
2895,Weißer Hai,40813
311,Montenegro,68672


Unnamed: 0_level_0,Views
Page,Unnamed: 1_level_1
Template:GeoTemplate,5697246
Nekrolog 2015,3505296
Deutschland,2862084
The 100,2755830
"Empire (Fernsehserie, 2015)",2693106


The 50 most viewed articles in 2015:
['Template:GeoTemplate', 'Nekrolog 2015', 'Deutschland', 'The 100', 'Empire (Fernsehserie, 2015)', 'Lungenembolie', 'Game of Thrones', 'Griechenland', 'Pluto', 'Philipp Mißfelder', 'Tour de France 2015', 'Grexit', 'The Strain (Fernsehserie)', 'Arnold Schwarzenegger', 'Zweiter Prager Fenstersturz', 'Erster Prager Fenstersturz', 'Jan Masaryk', 'The Big Bang Theory', 'Jules Bianchi', 'Jurassic World', 'Orange Is the New Black', 'Aiga Rasch', 'City-Galerie (Augsburg)', 'Homeland (Fernsehserie)', 'Wikipedia', 'Vereinigte Staaten', 'Austerität', 'Sophia Thomalla', 'Massaker von Srebrenica', 'Tour de France', 'Minions', 'Unknown User', 'Terminator: Genisys', 'Frauke Petry', 'Jurassic Park', 'Channing Tatum', 'Schweiz', 'New Horizons', 'Yanis Varoufakis', 'K.I.Z.', 'UEFA Europa League 2015/16', 'Omar Sharif', 'Berlin', 'Two and a Half Men', 'Pretty Little Liars', 'UEFA Champions League 2015/16', 'Terminator (Film)', 'The Mentalist', 'Liste der Kfz-Kennzeich

In [73]:
df_2016 = pd.DataFrame(columns=['Page', 'Views'])
for i in range(1, 13):
    df_temp = pd.read_csv('top_data/topviews-2015_{:02d}.csv'.format(month))
    df_temp = df_temp.loc[:,['Page', 'Views']]
    df_2016 = df_2016.append(df_temp, ignore_index=True)

display(df_2016.sample(5))

# sum all month per article
df_2015_sum = df_2016.groupby('Page').sum()

best_2016 = get_best_articles(df_2016_sum, 2016, n_best)
get_best_articles_views(best_2016, 2016)

Unnamed: 0,Page,Views
564,Sri Lanka,53011
4884,Islamischer Staat (Organisation),155065
6801,Terminator: Genisys,172245
1935,Nekrolog 2015,584216
1897,Modern Family,41490


Unnamed: 0_level_0,Views
Page,Unnamed: 1_level_1
Template:GeoTemplate,5697246
Nekrolog 2015,3505296
Deutschland,2862084
The 100,2755830
"Empire (Fernsehserie, 2015)",2693106


The 50 most viewed articles in 2016:
['Template:GeoTemplate', 'Nekrolog 2015', 'Deutschland', 'The 100', 'Empire (Fernsehserie, 2015)', 'Lungenembolie', 'Game of Thrones', 'Griechenland', 'Pluto', 'Philipp Mißfelder', 'Tour de France 2015', 'Grexit', 'The Strain (Fernsehserie)', 'Arnold Schwarzenegger', 'Zweiter Prager Fenstersturz', 'Erster Prager Fenstersturz', 'Jan Masaryk', 'The Big Bang Theory', 'Jules Bianchi', 'Jurassic World', 'Orange Is the New Black', 'Aiga Rasch', 'City-Galerie (Augsburg)', 'Homeland (Fernsehserie)', 'Wikipedia', 'Vereinigte Staaten', 'Austerität', 'Sophia Thomalla', 'Massaker von Srebrenica', 'Tour de France', 'Minions', 'Unknown User', 'Terminator: Genisys', 'Frauke Petry', 'Jurassic Park', 'Channing Tatum', 'Schweiz', 'New Horizons', 'Yanis Varoufakis', 'K.I.Z.', 'UEFA Europa League 2015/16', 'Omar Sharif', 'Berlin', 'Two and a Half Men', 'Pretty Little Liars', 'UEFA Champions League 2015/16', 'Terminator (Film)', 'The Mentalist', 'Liste der Kfz-Kennzeich

In [74]:
df_2017_sum = pd.read_csv('top_data/topviews-2017.csv'.format(month))
df_2017_sum = df_2017_sum.loc[:,['Page', 'Views']]
df_2017_sum.set_index('Page', inplace=True)

display(df_2017_sum.sample(5))

best_2017 = get_best_articles(df_2017_sum, 2017, n_best)
get_best_articles_views(best_2017, 2017)

Unnamed: 0_level_0,Views
Page,Unnamed: 1_level_1
Fußball-Weltmeisterschaft,755941
Eminem,1087638
Islamischer Staat (Organisation),1102868
NATO,1122314
Katalonien,1570510


Unnamed: 0_level_0,Views
Page,Unnamed: 1_level_1
Nekrolog 2017,6830310
Deutschland,6263143
Game of Thrones,4192243
Unix-Shell,3435287
Donald Trump,3172588


The 50 most viewed articles in 2017:
['Nekrolog 2017', 'Deutschland', 'Game of Thrones', 'Unix-Shell', 'Donald Trump', 'Bitcoin', 'The Kelly Family', 'Vereinigte Staaten', 'Italien', 'Wikipedia', 'Angela Merkel', 'Martin Luther', 'The Walking Dead (Fernsehserie)', 'Berlin', 'Österreich', 'Schweiz', 'Helene Fischer', 'Martin Schulz', 'Bundestagswahl 2017', 'Adolf Hitler', 'Alice Weidel', 'Stranger Things', 'Prison Break', 'Star Wars', 'Präsenz', 'Russland', 'Zweiter Weltkrieg', 'Erster Weltkrieg', 'Ed Sheeran', 'Liste der Kfz-Kennzeichen in Deutschland', 'Bundespräsident (Deutschland)', 'Frauke Petry', 'Europa', 'Elisabeth II.', 'Asperger-Syndrom', 'Vikings (Fernsehserie)', 'Tote Mädchen lügen nicht (Fernsehserie)', 'The Big Bang Theory', 'Pretty Little Liars', 'Emmanuel Macron', 'Liste in Deutschland vorhandener Dampflokomotiven', 'Katalonien', 'Hamburg', 'Nordkorea', 'Grey’s Anatomy', 'Pablo Escobar', 'RMS Titanic', 'Chester Bennington', 'Sahra Wagenknecht', 'Helmut Kohl']
