# Merge all tokens with revisions

In [None]:
import pandas as pd
from wikiwho_wrapper import WikiWho
from os.path import exists
ww = WikiWho()

# Show full cell contents when frames are rendered. None is the supported way
# to disable truncation; the legacy -1 spelling is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

article = "John Logie Baird"

# Per-article cache of the merged token/revision table.
# NOTE(review): to_pickle below presumably needs the data/ directory to exist
# already - confirm before running on a fresh checkout.
filename = f'data/{article}.df'
try:
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files that this notebook wrote itself.
    df = pd.read_pickle(filename)
except Exception:
    # Cache missing or unreadable: rebuild the table from the WikiWho API.
    # (Delete the cache file to force a fresh download.)
    df = ww.dv.all_content(article)

    # Drop rows that carry neither an 'in' nor an 'out' revision.
    df = df[~((df['in'] == -1) & (df['out'] == -1))]
    df = df.drop(columns=['article_title', 'page_id'])

    # Rows without an 'in' revision are labelled 'in' and get the token's
    # o_rev_id as their 'in' revision; every other row stays 'rein'.
    without_in = df['in'] == -1
    df['action'] = 'rein'
    df.loc[without_in, 'action'] = 'in'
    df.loc[without_in, 'in'] = df.loc[without_in, 'o_rev_id']

    revisions = ww.dv.rev_ids_of_article(article)

    # Attach editor and timestamp of the 'in' revision, then of the 'out'
    # revision, by joining the revision list on the respective id column.
    for side in ('in', 'out'):
        side_columns = {
            'rev_id': side,
            'o_editor': f'{side}_editor',
            'rev_time': f'{side}_rev_time',
        }
        df = pd.merge(
            df,
            revisions.rename(columns=side_columns)[list(side_columns.values())],
            how='left', on=side)

    df.to_pickle(filename)


# Idea for a 2nd iteration: look for unrelated signs such as parents,
# nationality, born.
india = ['Indian', 'Indians']
persia = ['Parsi', 'Persian', 'Persians']
azeri = ['Azeri']
iran = ['iranian', 'iranians']
brits = ['british']

scottish = ["scottish"]
british = ["british"]

# Revision list ordered by time; used to look up the revision preceding a
# given one.
history = ww.dv.rev_ids_of_article(article=article).sort_values('rev_time')
# Cache of downloaded revision contents, keyed by revision id.
revisions = {}

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Rebuild a datetime series from its date parts, overriding some of them.

    Args:
        series: Series of datetimes (anything exposing the ``.dt`` accessor).
        year: Replacement year, or None to keep each value's own year.
        month: Replacement month, or None to keep each value's own month.
        day: Replacement day, or None to keep each value's own day.

    Returns:
        The result of ``pd.to_datetime`` on the assembled year/month/day
        parts; only those three components are passed on, so the time of
        day is not carried over.
    """
    overrides = {'year': year, 'month': month, 'day': day}
    parts = {}
    for unit, forced in overrides.items():
        # Fall back to the component taken from the series itself.
        parts[unit] = getattr(series.dt, unit) if forced is None else forced
    return pd.to_datetime(parts)

def get_in_contexts(revid, tokenid):
    """Return the tokens surrounding ``tokenid`` in revision ``revid``.

    The revision content is taken from the module-level ``revisions`` cache
    and downloaded (and cached) on a miss.

    Args:
        revid: Id of the revision whose content is inspected.
        tokenid: ``token_id`` of the token to locate in that revision.

    Returns:
        A ``(left, right)`` tuple of space-joined tokens: up to 15 tokens
        before the match and up to 14 tokens after it.

    Raises:
        IndexError: If the token does not occur in the revision.
    """
    try:
        _df = revisions[revid]
    except KeyError:
        print(f'downloading {revid}')
        _df = revisions[revid] = ww.dv.specific_rev_content_by_article_title(
            article=article, rev_id=revid, out=False, _in=False)

    # Work with positions rather than index labels so the slices below are
    # correct whatever index the frame carries.
    hits = (_df['token_id'] == tokenid).values.nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f'token {tokenid} not found in revision {revid}')
    idx = int(hits[0])

    tokens = _df['token']
    # Clamp at 0: a negative start would wrap around to the end of the frame
    # and silently drop the left context of tokens near the beginning.
    left = ' '.join(tokens.iloc[max(idx - 15, 0):idx])
    right = ' '.join(tokens.iloc[idx + 1:idx + 15])
    return left, right
    
def get_out_contexts(revid, tokenid):
    """Return the tokens surrounding ``tokenid`` just before revision ``revid``.

    The content inspected is that of the revision which precedes ``revid``
    in the module-level ``history`` frame. Revision contents are taken from
    the module-level ``revisions`` cache and downloaded (and cached) on a
    miss.

    Args:
        revid: Id of the revision whose predecessor is inspected.
        tokenid: ``token_id`` of the token to locate.

    Returns:
        A ``(left, right)`` tuple of space-joined tokens: up to 15 tokens
        before the match and up to 14 tokens after it.

    Raises:
        IndexError: If ``history`` holds no revision preceding ``revid``, or
            if the token does not occur in the preceding revision.
    """
    # A row precedes `revid` when the row following it carries that rev_id.
    preceding = history.loc[history['rev_id'].shift(-1) == revid, 'rev_id']
    if preceding.empty:
        raise IndexError(f'no revision precedes {revid} in the article history')
    prev_revid = preceding.iloc[0]

    try:
        _df = revisions[prev_revid]
    except KeyError:
        print(f'downloading {prev_revid}')
        _df = revisions[prev_revid] = ww.dv.specific_rev_content_by_article_title(
            article=article, rev_id=prev_revid)

    # Work with positions rather than index labels so the slices below are
    # correct whatever index the frame carries.
    hits = (_df['token_id'] == tokenid).values.nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f'token {tokenid} not found in revision {prev_revid}')
    idx = int(hits[0])

    tokens = _df['token']
    # Clamp at 0: a negative start would wrap around to the end of the frame
    # and silently drop the left context of tokens near the beginning.
    left = ' '.join(tokens.iloc[max(idx - 15, 0):idx])
    right = ' '.join(tokens.iloc[idx + 1:idx + 15])
    return left, right
    
    
def search_token(df, tokens, _filter='duplicates', i_o='rein'):
    """List the revisions that inserted, re-inserted or removed given tokens.

    Args:
        df: Token table with the columns ``token``, ``token_id``,
            ``o_rev_id`` and, for each side, ``in``/``out``,
            ``*_rev_time`` and ``*_editor``.
        tokens: Words to look for; they are lower-cased before matching
            against the ``token`` column.
        _filter: ``'uniques'`` keeps only rows whose revision occurs once in
            the selection, ``'duplicates'`` keeps only rows whose revision
            occurs more than once, any other value (e.g. None) keeps all.
        i_o: ``'in'`` selects rows whose ``in`` equals ``o_rev_id``,
            ``'rein'`` selects rows whose ``in`` differs from ``o_rev_id``,
            ``'out'`` selects rows whose ``out`` is not -1.

    Returns:
        A frame indexed by ``date`` (revision time with the day set to 1)
        and the revision time, with ``left``/``right`` context columns.

    Raises:
        ValueError: If ``i_o`` is not one of the three supported modes.
    """
    ltokens = [token.lower() for token in tokens]

    if i_o == 'in':
        io_sel = df['in'] == df['o_rev_id']
    elif i_o == 'out':
        io_sel = df['out'] != -1
    elif i_o == 'rein':
        # Re-insertions are read from the 'in' columns from here on.
        i_o = 'in'
        io_sel = df['in'] != df['o_rev_id']
    else:
        raise ValueError(f"i_o must be 'in', 'out' or 'rein', got {i_o!r}")

    rev_col = i_o
    time_col = f'{i_o}_rev_time'

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of `df`.
    _df = df.loc[df['token'].isin(ltokens) & io_sel,
                 [rev_col, 'token_id', time_col, f'{i_o}_editor', 'token']].copy()

    _df[time_col] = pd.to_datetime(_df[time_col])
    _df = _df.sort_values([time_col, 'token_id'], ascending=True)
    _df['date'] = vec_dt_replace(_df[time_col], day=1)

    if _filter == 'uniques':
        _df = _df.drop_duplicates(subset=rev_col, keep=False)
    elif _filter == 'duplicates':
        _df = _df[_df.duplicated(subset=[rev_col], keep=False)].copy()

    # get_out_contexts resolves the preceding revision itself, so both
    # helpers take the row's own revision id.
    fetch = get_in_contexts if i_o == 'in' else get_out_contexts
    contexts = [fetch(revid, tokenid)
                for revid, tokenid in zip(_df[rev_col], _df['token_id'])]
    # Built from lists so that an empty selection yields empty columns
    # instead of failing while unpacking.
    _df['left'] = [left for left, _ in contexts]
    _df['right'] = [right for _, right in contexts]

    return _df.set_index(['date', time_col])



def search_all_tokens(df, tokens, i_o=('in', 'out')):
    """List every revision that added or removed one of the given tokens.

    Args:
        df: Token table with the columns ``token``, ``token_id``,
            ``action`` and, for each side, ``in``/``out``, ``*_rev_time``
            and ``*_editor``.
        tokens: Words to look for; they are lower-cased before matching
            against the ``token`` column.
        i_o: ``'in'`` (all rows of the matching tokens, keeping their
            ``action`` value), ``'out'`` (rows whose ``out`` is not -1,
            with ``action`` set to ``'out'``), or an iterable of these
            whose results are concatenated.

    Returns:
        A frame indexed by ``date`` (revision time with the day set to 1)
        and ``rev_time``, with the side-specific columns renamed to
        ``rev_id``/``editor``, a ``duplicated`` flag marking revisions that
        occur more than once in the selection, and ``left``/``right``
        context columns.

    Raises:
        ValueError: If a requested side is neither ``'in'`` nor ``'out'``.
    """
    # Several sides requested: handle each one separately and stack them.
    if not isinstance(i_o, str):
        return pd.concat(
            [search_all_tokens(df, tokens, i_o=io) for io in i_o], axis=0)

    if i_o not in ('in', 'out'):
        raise ValueError(f"i_o must be 'in' or 'out', got {i_o!r}")

    ltokens = [token.lower() for token in tokens]
    coi = [f'{i_o}', 'token_id', f'{i_o}_rev_time', f'{i_o}_editor',
           'token', 'action']

    selected = df['token'].isin(ltokens)
    if i_o == 'out':
        selected = selected & (df['out'] != -1)

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of `df`.
    _df = df.loc[selected, coi].copy()
    if i_o == 'out':
        _df['action'] = 'out'

    _df = _df.rename(columns={
        f'{i_o}': 'rev_id',
        f'{i_o}_rev_time': 'rev_time',
        f'{i_o}_editor': 'editor',
    })

    _df['rev_time'] = pd.to_datetime(_df['rev_time'])
    _df = _df.sort_values(['token', 'rev_time', 'token_id'], ascending=True)
    _df['date'] = vec_dt_replace(_df['rev_time'], day=1)
    _df['duplicated'] = _df.duplicated(subset=['rev_id'], keep=False)

    fetch = get_in_contexts if i_o == 'in' else get_out_contexts
    contexts = [fetch(revid, tokenid=tokenid)
                for revid, tokenid in zip(_df['rev_id'], _df['token_id'])]
    # Built from lists so that an empty selection yields empty columns
    # instead of failing while unpacking.
    _df['left'] = [left for left, _ in contexts]
    _df['right'] = [right for _, right in contexts]

    return _df.set_index(['date', 'rev_time'])

In [None]:
def get_rev_history(df, article, rev_id):
    """Combine one revision's content with the tokens' in/out revisions.

    Args:
        df: Token table providing the ``token_id``, ``in`` and ``out``
            columns.
        article: Article title passed to the WikiWho API.
        rev_id: Id of the revision whose content is downloaded.

    Returns:
        The downloaded revision content left-joined with ``df`` on
        ``token_id``, restricted to rows whose ``in`` is not greater than
        the row's ``rev_id``; ``out`` values greater than ``rev_id`` are
        reset to -1.
    """
    _df = ww.dv.specific_rev_content_by_article_title(article=article, rev_id=rev_id)
    _df = pd.merge(
        _df, df[['token_id', 'in', 'out']],
        how='left', on='token_id')
    # NOTE(review): revision ids are compared numerically here, presumably as
    # a stand-in for chronological order - confirm.
    # .copy() makes the filtered frame independent, so the assignment below
    # does not trigger pandas' SettingWithCopyWarning.
    _df = _df[_df['in'] <= _df['rev_id']].copy()
    _df.loc[_df['out'] > _df['rev_id'], 'out'] = -1
    return _df

# In/out history of the tokens present in one specific revision.
# NOTE(review): the revision id is hard-coded - confirm it belongs to `article`.
rev_history = get_rev_history(df, article=article, rev_id=58551262)

# Query all revisions that inserted or reinserted a token

In [None]:
from IPython.core.display import HTML

# All 'in'-side rows for the nationality terms, rendered as a full HTML table.
_df = search_all_tokens(df, tokens=british + scottish, i_o='in')
display(HTML(_df.to_html()))

In [None]:
from IPython.core.display import HTML

# _filter options: 'uniques' | 'duplicates' | None (no filtering).
_df = search_token(df, tokens=scottish, _filter=None, i_o='in')
display(HTML(_df.to_html()))

# Query all revisions that removed a token

In [None]:
from IPython.core.display import HTML

# All removals of the nationality terms, rendered as a full HTML table.
_df = search_all_tokens(df, tokens=british + scottish, i_o='out')
display(HTML(_df.to_html()))

In [None]:
from IPython.core.display import HTML

# _filter options: 'uniques' | 'duplicates' | None (no filtering).
_df = search_token(df, tokens=british, _filter='duplicates', i_o='out')
display(HTML(_df.to_html()))

# Query all revisions that inserted, reinserted or removed tokens

In [None]:
from IPython.core.display import HTML

# Both sides (the function's default): 'in' rows followed by removals.
_df = search_all_tokens(df, tokens=british + scottish)
display(HTML(_df.to_html()))