In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed, so edits to the imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. General Metadata of a Wikipedia Article

Let's say we are interested in the historical evolution of a single Wikipedia article, the one about the novel "The Camp of the Saints". We can get some data from Wikipedia itself, and some from third-party sources:

In [None]:
# Wikipedia article analysed throughout this notebook (underscores replace spaces).
page_name = "The_Camp_of_the_Saints"

## 1.1 Basic Info from Wikipedia

In [None]:
from external.wikipedia import WikipediaAPI, WikipediaDV

# Data view built on top of an API client for the en.wikipedia.org domain.
wikipedia_dv = WikipediaDV(
    WikipediaAPI(domain="en.wikipedia.org")
)

# Look the article up by name and display the result in a frame whose single
# column is labelled 'value' (assumes `page` is a pandas Series -- TODO confirm).
page = wikipedia_dv.get_page(page_name)
page.to_frame("value")

## 1.2 General Stats from Xtools

In [None]:
from external.xtools import XtoolsAPI, XtoolsDV

# API client for the en.wikipedia.org project, wrapped in its data view.
xtools_api = XtoolsAPI(project="en.wikipedia.org")
xtools_dv = XtoolsDV(xtools_api)

# Fetch the page information and display it under a single 'value' column.
page_info = xtools_dv.get_page_info(page_name)
page_info.to_frame("value")

## 1.3 Page Views from Wikimedia

In [None]:
# Request the daily page views of the article
from external.wikimedia import WikiMediaAPI, WikiMediaDV

wikimedia_api = WikiMediaAPI(project="en.wikipedia")
wikimedia_dv = WikiMediaDV(wikimedia_api)

views = wikimedia_dv.get_pageviews(page_name, "daily")

# Interactive plot of the page views
from ipywidgets import interact
from visualization.listeners import DFListener

listener = DFListener(views)

# `begin` offers the timestamps in their original order and `end` offers them
# in descending order -- presumably so the two selectors default to the
# earliest and latest dates respectively (verify against the rendered widget).
interact(
    listener.views_per_period,
    begin=views.timestamp,
    end=views.timestamp.sort_values(ascending=False),
    granularity=["Yearly", "Monthly", "Weekly", "Daily"],
)

# Summary statistics of the 'views' column of the data plotted above
# (`listener.df_plotted` keeps a reference to the plotted data).
# The reducers are given by name so pandas applies its own Series methods.
listener.df_plotted['views'].agg({
    'Total views': 'sum',
    'Max views period': 'max',
    'Min views period': 'min',
    # Previously computed with `min`, which repeated the minimum under the
    # "Average" label; the average is the mean of the plotted periods.
    'Average views': 'mean',
}).to_frame('Value')

## 1.4 Editions per page with Wikiwho

In [None]:
import pandas as pd
from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV

# WikiWho client for the 'en' language edition, wrapped in its data view.
api = WikiWhoAPI(lng="en")
wikiwho_dv = WikiWhoDV(api)

# Editor content for the article, with 'year_month' parsed into datetimes.
df = wikiwho_dv.editor_content(page.page_id)
df["year_month"] = pd.to_datetime(df["year_month"])

# NOTE(review): this step used to drop 'editor_id' and compute a
# groupby(['year_month', 'page_id']).sum() without assigning the result, so
# `df` was never actually grouped. The discarded computation has been removed;
# `df` keeps the rows returned by `editor_content`. If per-month totals are
# wanted here, assign the grouped result (with `as_index=False`, so that
# 'year_month' stays a regular column).

# Add the total-action columns: the element-wise sum adds + dels + reins for
# each of the four metric variants. The label slices assume that every family
# occupies four adjacent columns in the same variant order -- TODO confirm
# against the schema returned by `editor_content`.
df = df.join(pd.DataFrame(
    df.loc[:, 'adds':'adds_stopword_count'].values
    + df.loc[:, 'dels':'dels_stopword_count'].values
    + df.loc[:, 'reins':'reins_stopword_count'].values,
    index=df.index,
    columns=['actions',
             'actions_surv_48h',
             'actions_persistent',
             'actions_stopword_count'],
))

# Interactive plot of the edit actions
from ipywidgets import interact
from visualization.editions_listener import DFListener

listener = DFListener(df)

# Columns offered in both action selectors: the total-action columns first,
# followed by the columns from 'adds' through 'reins_stopword_count'.
actions = (
    df.loc[:, 'actions':'actions_stopword_count'].columns
    .append(df.loc[:, 'adds':'reins_stopword_count'].columns)
)

interact(
    listener.editions_per_month,
    begin=df.year_month,
    end=df.year_month.sort_values(ascending=False),
    granularity=['Yearly', 'Monthly'],
    first_action=actions,
    second_action=actions,
)


In [None]:
# Preview the first rows of `df`, including the total-action columns joined above.
df.head()

In [None]:
import pandas as pd
from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV

# WikiWho client for the 'en' language edition, wrapped in its data view.
api = WikiWhoAPI(lng='en')
wikiwho_dv = WikiWhoDV(api)

# Editor content for the article, with 'year_month' parsed into datetimes.
# Fixed: the conversion used to be applied to the unrelated `df` frame,
# leaving `monthly_editions['year_month']` unparsed.
monthly_editions = wikiwho_dv.editor_content(page.page_id)
monthly_editions['year_month'] = pd.to_datetime(monthly_editions['year_month'])

# Add the total-action columns: the element-wise sum adds + dels + reins for
# each of the four metric variants.
# Fixed: the values used to be read from `df` while being indexed with
# `monthly_editions.index`, which only works if both frames happen to hold the
# same rows in the same order; they are now taken from `monthly_editions`.
monthly_editions = monthly_editions.join(pd.DataFrame(
    monthly_editions.loc[:, 'adds':'adds_stopword_count'].values
    + monthly_editions.loc[:, 'dels':'dels_stopword_count'].values
    + monthly_editions.loc[:, 'reins':'reins_stopword_count'].values,
    index=monthly_editions.index,
    columns=['actions', 'actions_surv_48h', 'actions_persistent', 'actions_stopword_count'],
))



In [None]:
# Persist `page` with IPython's %store magic so that another notebook or
# session can restore it with `%store -r page`.
%store page

In [None]:
# Actions_per_Edits = ActionsperEdits(article_id)
# TimeSpend = TimeSpentEditing(article_id) 
# req, res = artsum.getArticleInfo(article_name, article_id)
# table = [
#         ['Watchers', req['watchers']],
#         ['Pageviews', req['pageviews']],
#         ['Pageviews Offset', req['pageviews_offset']],
#         ['Revisions', req['revisions']],
#         ['Editors', req['editors']],
#         ['Author', req['author']],
#         ['Created', req['created_at']],
#         ['Creation Rev', req['created_rev_id']],
#         ['Last modified', req['modified_at']],
#         ['Last Rev ID', req['last_edit_id']],
#         ['Total Views', totalViewsM.views.values.sum()],
#         ['Actions per Edits Within 48 Hours', round(Actions_per_Edits.ActionsperEditsWithin48Hour[0], 2)],
#         ['Actions per Edits After 48 Hours', round(Actions_per_Edits.ActionsperEditsAfter48Hour[0], 2)],
#         ['Actions per Edits Exactly in 48 Hours', round(Actions_per_Edits.ActionsperEditsExactlyin48Hour[0], 2)],
#         ['Sessions Within  48 Hours', TimeSpend.SessionsWithin48Hours.item()],
#         ['Sessions After 48 Hours', TimeSpend.SessionsAfter48Hours.item()],
#         ['Sessions Exactly in 48 Hours', TimeSpend.SessionsExactlyin48Hours.item()]
#     ]

# display(HTML(tabulate.tabulate(table, tablefmt='html')))