# Background

This notebook shows how to find all the revisions that involve a given word in a document. In practice, this means all the times that the word was inserted, reinserted or removed. It also locates the left and right context of the word in each of those revisions.


The first step is selecting a page


# Select a page

In [None]:
# Title of the Wikipedia article to analyse.
article = 'John Logie Baird'

# Token tables of the revisions downloaded so far, keyed by revision id.
# Re-running this cell empties the cache, so every revision would have to be
# downloaded again; avoid overwriting it once it has been filled.
revisions_cache = dict()

# Merge all tokens with revisions

Here, the ins and outs of all the tokens of an article (the columns `in` and `out` contain revision ids) are merged with the revisions in order to also have the timestamp and the editor of each revision.

Note that the merge happens twice, once for the ins and once for the outs.

In [None]:
import pandas as pd
from wikiwho_wrapper import WikiWho
from os.path import exists

# Show the full content of every cell instead of truncating long strings.
# `None` is the supported way to remove the width limit; a negative width is
# deprecated since pandas 1.0 and rejected by later releases.
pd.set_option('display.max_colwidth', None)

# Client used for the WikiWho requests made in this notebook.
ww = WikiWho()

# Download the token history of the article and discard the columns that
# only identify the article itself.
df = ww.dv.all_content(str(article)).drop(columns=['article_title', 'page_id'])

# (disabled) clean insertions that only happen once
# df = df[((df['in'] == -1) & (df['out'] == -1))]

# Rows with `in == -1` are first insertions; all the others are treated as
# re-insertions.
first_insertion = df['in'] == -1
df['action'] = 'rein'
df.loc[first_insertion, 'action'] = 'in'

# A first insertion happened in the origin revision of the token, so use
# `o_rev_id` as its `in` revision.
df.loc[first_insertion, 'in'] = df.loc[first_insertion, 'o_rev_id']

# Metadata (editor and timestamp) of every revision of the article.
revisions = ww.dv.rev_ids_of_article(str(article))

# Attach editor and timestamp of the revision that (re)inserted each token.
df = df.merge(
    revisions[['rev_id', 'o_editor', 'rev_time']].rename(
        columns={
            'rev_id': 'in',
            'o_editor': 'in_editor',
            'rev_time': 'in_rev_time',
        }),
    how='left', on='in')

# Attach editor and timestamp of the revision that removed each token.
df = df.merge(
    revisions[['rev_id', 'o_editor', 'rev_time']].rename(
        columns={
            'rev_id': 'out',
            'o_editor': 'out_editor',
            'rev_time': 'out_rev_time',
        }),
    how='left', on='out')


# Revisions in chronological order.
history = revisions.sort_values(by='rev_time')


# Define some methods

These methods do the heavy lifting of the process.

In [None]:
def get_rev_history(df, article, rev_id):
    """Get the actions performed until a certain revision.

    Args:
        df: token table holding at least the `token_id`, `in` and `out`
            columns.
        article: title of the article the revision belongs to.
        rev_id: id of the revision whose content is requested.

    Returns:
        The content of the requested revision joined with the `in`/`out`
        columns of `df`, restricted to rows whose `in` is not greater than
        the row's `rev_id`. `out` values greater than `rev_id` are replaced
        by -1.
    """
    content = ww.dv.specific_rev_content_by_article_title(article=article, rev_id=rev_id)
    merged = pd.merge(
        content, df[['token_id', 'in', 'out']],
        how='left', on='token_id')

    # Copy the filtered rows explicitly: the assignment below then works on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    merged = merged[merged['in'] <= merged['rev_id']].copy()
    merged.loc[merged['out'] > merged['rev_id'], 'out'] = -1
    return merged

def create_download_link(df, filename, title="Download CSV file"):
    """Write `df` to a CSV file and return an HTML link to download it.

    Args:
        df: DataFrame to export.
        filename: path of the CSV file to write; also used as link target.
        title: text of the link (inserted into the markup unchanged).

    Returns:
        An `IPython.display.HTML` object wrapping the anchor element.
    """
    # Imported locally so the helper does not rely on imports made in a
    # later notebook cell.
    from pathlib import Path
    from urllib.parse import quote

    from IPython.display import HTML

    # `to_csv` does not create missing folders; make sure the target exists.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, sep=';')

    # Percent-encode the path: file names are built from article titles and
    # may contain spaces or characters such as '#' or '?' that break a URL.
    html = f'<a href="{quote(str(filename))}" target="_blank">{title}</a>'
    return HTML(html)

def vec_dt_replace(series, year=None, month=None, day=None):
    """Rebuild the dates of a datetime series, overriding selected parts.

    Every component (year, month, day) that is left as None is taken from
    `series`; the others are replaced by the given value. Handy to build
    monthly indexes, e.g. with `day=1`.
    """
    overrides = {'year': year, 'month': month, 'day': day}
    components = {
        part: getattr(series.dt, part) if value is None else value
        for part, value in overrides.items()
    }
    return pd.to_datetime(components)

def get_in_contexts(revid, tokenid):
    """Get the left and right context of an inserted or re-inserted token.

    The token table of the revision is taken from `revisions_cache` and
    downloaded (and cached) only when it is not there yet.

    Args:
        revid: id of the revision in which the token was (re)inserted.
        tokenid: id of the token whose surroundings are wanted.

    Returns:
        A tuple `(revid, idx, left, right)`: the revision the context was
        read from, the index of the token in that revision's token table,
        and the space-joined tokens before and after it.

    Raises:
        IndexError: if the token does not appear in the revision.
    """
    try:
        tokens_df = revisions_cache[revid]
    except KeyError:
        # Only a cache miss triggers a download; any other error surfaces.
        print(f'downloading {revid}')
        tokens_df = revisions_cache[revid] = ww.dv.specific_rev_content_by_article_title(
            article=article, rev_id=revid, out=False, _in=False)

    # NOTE(review): the index label is used as a position below, which
    # assumes the token table has a default 0..n-1 index - confirm.
    idx = tokens_df[tokens_df['token_id'] == tokenid].index[0]

    # Clamp the start at 0: a negative start would be counted from the end of
    # the table and select the wrong rows for tokens near the beginning.
    left = tokens_df.iloc[max(idx - 15, 0):idx]['token']
    # NOTE(review): the right window holds 14 tokens while the left one holds
    # 15; kept unchanged to preserve the existing output.
    right = tokens_df.iloc[idx + 1:idx + 15]['token']

    return revid, idx, ' '.join(left), ' '.join(right)

    
def get_out_contexts(revid, tokenid):
    """Get the left and right context of a removed token.

    The context is read from the revision that immediately precedes `revid`
    in `history` (presumably the last revision that still contains the
    token). Its token table is taken from `revisions_cache` and downloaded
    (and cached) only when it is not there yet.

    Args:
        revid: id of the revision in which the token was removed.
        tokenid: id of the token whose surroundings are wanted.

    Returns:
        A tuple `(revid, idx, left, right)`: the revision the context was
        read from (the predecessor of the given one), the index of the token
        in that revision's token table, and the space-joined tokens before
        and after it.

    Raises:
        IndexError: if no revision precedes `revid` in `history`, or if the
            token does not appear in the preceding revision.
    """
    # Shifting the ids up by one row pairs every revision with its successor,
    # so the matching row is the revision right before `revid`.
    previous = history.loc[history['rev_id'].shift(-1) == revid, 'rev_id']
    if previous.empty:
        raise IndexError(f'no revision precedes {revid} in the article history')
    revid = previous.iloc[0]

    try:
        tokens_df = revisions_cache[revid]
    except KeyError:
        # Only a cache miss triggers a download; any other error surfaces.
        print(f'downloading {revid}')
        tokens_df = revisions_cache[revid] = ww.dv.specific_rev_content_by_article_title(
            article=article, rev_id=revid)

    # NOTE(review): the index label is used as a position below, which
    # assumes the token table has a default 0..n-1 index - confirm.
    idx = tokens_df[tokens_df['token_id'] == tokenid].index[0]

    # Clamp the start at 0: a negative start would be counted from the end of
    # the table and select the wrong rows for tokens near the beginning.
    left = tokens_df.iloc[max(idx - 15, 0):idx]['token']
    # NOTE(review): the right window holds 14 tokens while the left one holds
    # 15; kept unchanged to preserve the existing output.
    right = tokens_df.iloc[idx + 1:idx + 15]['token']

    return revid, idx, ' '.join(left), ' '.join(right)
    

def search_all_tokens(df, tokens, i_o=('in', 'out'), with_context=False):
    """Search all the times that actions were performed on the given tokens.

    Args:
        df: token table with the columns `token`, `token_id`, `action` and,
            for every requested direction, `<dir>`, `<dir>_rev_time` and
            `<dir>_editor`.
        tokens: words to look for; they are lower-cased before matching.
        i_o: 'in', 'out', or an iterable of those. For an iterable the
            per-direction results are concatenated.
        with_context: when True, also fetch the words surrounding each
            match (this may download revisions).

    Returns:
        A DataFrame indexed by (`date`, `rev_time`), where `date` is the
        first day of the month of `rev_time`, with the columns `rev_id`,
        `token_id`, `editor`, `token`, `action`, `duplicated`, `revid_ctxt`,
        `pos`, `left` and `right`. Without context (or without matches) the
        last four hold the placeholders -1, -1, '' and ''.

    Raises:
        ValueError: if a direction other than 'in' or 'out' is requested.
    """
    # Several directions: handle each one separately and stack the results.
    if not isinstance(i_o, str):
        return pd.concat(
            (search_all_tokens(df, tokens, i_o=io, with_context=with_context) for io in i_o),
            axis=0)

    if i_o not in ('in', 'out'):
        raise ValueError(f"i_o must be 'in' or 'out', got {i_o!r}")

    lowered = [token.lower() for token in tokens]
    columns = [i_o, 'token_id', f'{i_o}_rev_time', f'{i_o}_editor', 'token', 'action']

    selected = df['token'].isin(lowered)
    if i_o == 'out':
        # Rows whose `out` is -1 are skipped (presumably: never removed).
        selected = selected & (df['out'] != -1)

    # Work on a copy so the column assignments below never touch `df`.
    found = df.loc[selected, columns].copy()
    if i_o == 'out':
        found['action'] = 'out'

    found = found.rename(columns={
        i_o: 'rev_id',
        f'{i_o}_rev_time': 'rev_time',
        f'{i_o}_editor': 'editor',
    })

    found['rev_time'] = pd.to_datetime(found['rev_time'])
    found = found.sort_values(['token', 'rev_time', 'token_id'], ascending=True)
    found['date'] = vec_dt_replace(found['rev_time'], day=1)
    # Flag revisions that appear in more than one row.
    found['duplicated'] = found.duplicated(subset=['rev_id'], keep=False)

    if len(found) > 0 and with_context:
        get_context = get_in_contexts if i_o == 'in' else get_out_contexts
        contexts = [
            get_context(rev_id, token_id)
            for rev_id, token_id in zip(found['rev_id'], found['token_id'])
        ]
        found['revid_ctxt'], found['pos'], found['left'], found['right'] = zip(*contexts)
    else:
        found['left'] = ''
        found['right'] = ''
        found['pos'] = -1
        found['revid_ctxt'] = -1

    return found.set_index(['date', 'rev_time'])

# Select the tokens that will be searched

In [None]:
# Words to look for in the article history.
tokens = [
    'scottish',
    'british',
]

# Query all revisions that inserted, reinserted or removed tokens

In [None]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Collect every insertion, re-insertion and removal of the selected tokens,
# together with the surrounding words.
_df = search_all_tokens(df, tokens, with_context=True)

# Export the result and show a link to the generated file.
display(create_download_link(_df, f'data/{article}.csv'))
_df

In [None]:
# Text of one cached revision, rebuilt by joining its tokens with spaces.
#revisions_cache[771998521]
' '.join(revisions_cache[771998521]['token'])