In [None]:
import pandas as pd
from utils.lngselection import abbreviation
from wikiwho_wrapper import WikiWho
from external.wikipedia import WikipediaDV, WikipediaAPI
from metrics.conflict import ConflictManager
import numpy as np
import random

In [None]:
%%capture
## Some Extensions ##
%load_ext autoreload
%autoreload 2
%store -r the_page

if 'the_page' not in locals():
    import pickle
    print("Loading default data...")
    the_page = pickle.load(open("data/the_page.p",'rb'))

lng = abbreviation('English')

In [None]:
# Query WikiWho for the selected page: its token-level content and the list
# of its revisions.
wikiwho = WikiWho(lng=lng)
page_id = the_page['page_id']
all_content = wikiwho.dv.all_content(page_id)
revisions = wikiwho.dv.rev_ids_of_article(page_id)

# The manager receives copies, so `all_content` and `revisions` remain
# untouched for later cells.
con_manager = ConflictManager(
    all_content.copy(),
    revisions.copy(),
    lng=lng,
    include_stopwords=False,
)

In [None]:
# Run the conflict computation and take a private copy of the resulting
# action table.
con_manager.calculate()
token = con_manager.all_actions.copy()

# Collapse the action table to one row per revision: `token_id` becomes a
# numpy array holding every token id recorded for that `rev_id`.
tokens_processed = (
    token[['rev_id', 'rev_time', 'editor', 'token_id', 'token']]
    .groupby("rev_id")['token_id']
    .apply(lambda ids_in_revision: ids_in_revision.to_numpy())
    .reset_index()
)

# Display the per-revision id arrays.
tokens_processed['token_id']

In [None]:
# Merge every single-token revision into a randomly chosen neighbouring
# revision, then keep only the revisions that hold more than one token.
#
# Fixes over the previous version of this cell:
#   * np.append returns a NEW array; its result used to be discarded, so the
#     lone tokens were never merged and were silently dropped instead.
#   * the neighbour is now picked by position and restricted to rows that
#     exist, so a single-token first/last revision no longer raises KeyError.
# NOTE(review): random.choice is unseeded here, so the merge target differs
# between runs; seed `random` earlier if reproducibility matters.
token_lists = [np.asarray(ids) for ids in tokens_processed['token_id']]
n_revisions = len(token_lists)

# Snapshot (position, token id) of the single-token revisions before merging,
# so that tokens received from a neighbour are not forwarded a second time.
lone_tokens = [(pos, ids[0]) for pos, ids in enumerate(token_lists) if len(ids) == 1]

for pos, lone_token in lone_tokens:
    neighbours = [p for p in (pos - 1, pos + 1) if 0 <= p < n_revisions]
    if not neighbours:
        # Only one revision in total: there is nothing to merge into.
        continue
    target = random.choice(neighbours)
    token_lists[target] = np.append(token_lists[target], lone_token)

# Store the merged arrays back as an object column (one array per revision).
merged_column = np.empty(n_revisions, dtype=object)
for pos, ids in enumerate(token_lists):
    merged_column[pos] = ids

has_multiple_tokens = np.array([len(ids) > 1 for ids in token_lists], dtype=bool)
tokens_processed = tokens_processed.assign(
    token_id=pd.Series(merged_column, index=tokens_processed.index)
)[has_multiple_tokens]

# Unique (token, token_id) pairs: ids and their strings in matching order.
token_vocab_pairs = token[['token', 'token_id']].drop_duplicates()
token_ids = token_vocab_pairs['token_id'].to_numpy()
vocab = token_vocab_pairs['token'].to_numpy()

# One array of token ids per remaining revision (the model's "documents").
X = tokens_processed['token_id'].to_numpy()


In [None]:
#writing input ids for the C++ model

# Largest token id that occurs in any revision.
X_max = np.max([np.max(x) for x in X])

# One line per revision: its token ids separated by single spaces.
# The `with` block closes (and therefore flushes) the file, so the complete
# content is on disk before the external model reads it.
with open('BTM/input/input.txt', 'w') as wf:
    for x in tokens_processed['token_id']:
        wf.write(' '.join(map(str, x)) + '\n')

In [None]:
#writing the vocab file for the C++ model

# token id -> token string
vocab_dict = dict(zip(token_ids, vocab))

# One "<id>\t<word>" line for every id from 0 up to AND including X_max.
# (range(X_max) used to leave out the largest id, although that id does occur
# in the input file.) Ids with no known token get the placeholder "oo".
# The dict lookup replaces a linear scan of `token_ids` on every iteration;
# the f-string also avoids a TypeError should a token value not be a str.
with open('BTM/input/vocab.txt', 'w', newline='\n') as f:
    for i in range(X_max + 1):
        word = vocab_dict.get(i, "oo")
        f.write(f"{i}\t{word}\n")

# for l in open('../BTM/vocab.txt'):
#     print(l.strip().split('\t')[:2])
#     voca[int(wid)] = w

In [None]:
# INPUT VARIABLES FOR THE MODEL #
# These values are interpolated into the `btm est` command line in the cell
# that runs the model.

K=30   # number of topics
alpha=30    # NOTE(review): presumably the model's alpha prior - confirm against the BTM docs
beta=0.01   # NOTE(review): presumably the model's beta prior - confirm against the BTM docs
niter=150    # number of iterations
save_step=50    # number of steps after which to save

# The paths below are relative to BTM/script, which the run cell switches
# into with `%cd BTM/script` before calling the binary.
model_dir='../output/model/'

doc_pt='../input/input.txt' # path to the doc with token ids
voca_pt='../input/vocab.txt' #path to the vocabulary

# Vocabulary size. X_max is the largest token id and ids start at 0, so the
# number of ids is X_max + 1 (using X_max left the largest id out of range).
W = X_max + 1 #vocab size

In [None]:
#run the model
# Switch the kernel's working directory to BTM/script; the '../...' paths used
# below (and in model_dir / doc_pt) are written relative to that directory.
# NOTE(review): %cd persists for the rest of the session, so re-running this
# cell, or earlier cells that write to 'BTM/input/...', will resolve paths
# against the new directory - confirm this is intended.
%cd BTM/script

# Create the model output directory (no error if it already exists).
!mkdir -p {model_dir}
# Build the sources in ../src/ with make.
!make -C ../src/
# Call the btm binary's `est` sub-command with the parameters defined above.
!../src/btm est {K} {W} {alpha} {beta} {niter} {save_step} {doc_pt} {model_dir}

In [None]:
#print output
# Run topicDisplay.py with the model directory, the number of topics and the
# vocabulary path. Presumably it prints the top words of each topic - confirm
# against the script.
!python '../script/topicDisplay.py' {model_dir} {K} {voca_pt}