In [None]:
import pandas as pd
from utils.lngselection import abbreviation
from wikiwho_wrapper import WikiWho
from external.wikipedia import WikipediaDV, WikipediaAPI
from metrics.conflict import ConflictManager
import numpy as np
import random
from BTM.script.topicDisplay import display_topics
import tqdm
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
import matplotlib.pyplot as plt
from scipy.stats import entropy

In [None]:
# %%capture
# ## Some Extensions ##
# %load_ext autoreload
# %autoreload 2
# %store -r the_page

# `lng` and `the_page` are the notebook-wide inputs used by the cells below.
global lng, the_page

# Fall back to the pickled default page when `the_page` is not already defined in this session.
if 'the_page' not in locals():
    import pickle
    print("Loading default data...")
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open("data/the_page.p", 'rb') as page_file:
        the_page = pickle.load(page_file)

# Language value passed to WikiWho / ConflictManager / display_topics below
# (presumably a language code such as 'en' -- TODO confirm against utils.lngselection).
lng = abbreviation('English')

In [None]:
# Query WikiWho for the page content and for the revision ids of the article.
wikiwho = WikiWho(lng=lng)
all_content = wikiwho.dv.all_content(the_page['page_id'])
revisions = wikiwho.dv.rev_ids_of_article(the_page['page_id'])

# The manager receives copies, so `all_content` and `revisions` themselves are not handed over.
con_manager = ConflictManager(
    all_content.copy(),
    revisions.copy(),
    lng=lng,
    include_stopwords=False,
)

In [None]:
con_manager.calculate()
token = con_manager.all_actions.copy()

# Build one row per revision: `token_id` holds a numpy array with every token id
# that belongs to that `rev_id` group.
tokens_processed = (
    token[['rev_id', 'rev_time', 'editor', 'token_id', 'token']]
    .groupby("rev_id")['token_id']
    .apply(lambda ids_of_revision: ids_of_revision.to_numpy())
    .reset_index()
)
tokens_processed['token_id']

In [None]:
# Revisions holding a single token are dropped below, so first hand their token to a
# randomly chosen neighbouring revision (previous or next row).
#
# np.append returns a NEW array instead of modifying its argument, so the result has to be
# stored back; the documents are therefore edited in a plain list and written to the frame
# afterwards. Working by position also keeps the first/last row from addressing a
# non-existent neighbour.
# NOTE(review): the neighbour choice is random; seed `random` beforehand for reproducible runs.
docs = list(tokens_processed['token_id'])
n_docs = len(docs)
single_positions = [pos for pos, doc in enumerate(docs) if len(doc) == 1]
for pos in single_positions:
    neighbours = [q for q in (pos - 1, pos + 1) if 0 <= q < n_docs]
    if not neighbours:
        continue  # a lone document has nobody to donate its token to
    target = random.choice(neighbours)
    docs[target] = np.append(docs[target], docs[pos][0])

merged_ids = pd.Series(docs, index=tokens_processed.index, dtype=object)
tokens_processed = tokens_processed.assign(token_id=merged_ids)

# Keep only the revisions that hold more than one token.
tokens_processed = tokens_processed[tokens_processed['token_id'].map(len) > 1]

# Distinct (token, token_id) pairs; `token_ids[j]` and `vocab[j]` describe the same pair.
unique_tokens = token[['token', 'token_id']].drop_duplicates()
token_ids = unique_tokens['token_id'].to_numpy()
vocab = unique_tokens['token'].to_numpy()

# Documents as an object array of token-id arrays.
X = tokens_processed['token_id'].to_numpy()


In [None]:
#writing input ids for the C++ model

# Largest token id that occurs in any document.
X_max = np.max([np.max(x) for x in X])

# One document per line, token ids separated by single spaces.
# The `with` block flushes and closes the file, so the external program started later
# reads the complete content instead of a possibly still-buffered file.
with open('BTM/input/input.txt', 'w') as wf:
    for x in tokens_processed['token_id']:
        print(' '.join(map(str, x)), file=wf)

In [None]:
#writing the vocab file for the C++ model
# token id -> token string, built from the distinct (token, token_id) pairs.
vocab_dict = dict(zip(token_ids, vocab))
with open('BTM/input/vocab.txt', 'w', newline='\n') as f:
    # Inclusive upper bound: X_max is itself a token id used in the documents and needs a line.
    for i in range(int(X_max) + 1):
        # Ids without a known token get the placeholder "oo", so line i always describes id i.
        # The dict lookup replaces a linear scan of the `token_ids` array for every id.
        f.write(f"{i}\t{vocab_dict.get(i, 'oo')}\n")

# for l in open('../BTM/vocab.txt'):
#     print(l.strip().split('\t')[:2])
#     voca[int(wid)] = w

In [None]:
# INPUT VARIABLES FOR THE MODEL #
global niter, save_step, model_dir, doc_pt, voca_pt

K=15   # number of topics
alpha=0.1   
beta=0.01
niter=500    # number of iterations
save_step=100    # number of steps after which to save

# The paths below are relative to BTM/script, the directory the model is run from.
model_dir='../output/model/'

doc_pt='../input/input.txt' # path to the doc with token ids
voca_pt='../input/vocab.txt' #path to the vocabulary

# vocab size: the documents contain ids up to and including X_max, so the model
# needs X_max + 1 slots (assumes ids start at 0 -- TODO confirm against WikiWho token ids).
W = int(X_max) + 1

In [None]:
#run the model
# Change into BTM/script: model_dir, doc_pt and voca_pt are written relative to that directory.
# NOTE(review): `%cd` uses a relative path, so running this cell a second time in the same
# kernel starts from BTM/script and will presumably fail -- restart the kernel before re-running.
%cd BTM/script

# Create the output directory, build the sources in ../src/, then run the `btm est` command
# with the parameters defined in the previous cell.
!mkdir -p {model_dir}
!make -C ../src/
!../src/btm est {K} {W} {alpha} {beta} {niter} {save_step} {doc_pt} {model_dir}

In [None]:
#print output
# Only the first element returned by display_topics is kept; it feeds the coherence model below.
topics, _ = display_topics(
    model_dir,
    K,
    voca_pt,
    tokens_processed,
    lng,
    the_page,
)

In [None]:
global bow_corpus, dct


def token_ids_to_bow(token_list):
    """Convert the token ids of one document into a bag-of-words.

    Args:
        token_list: array-like of token ids of a single document (ids may repeat).

    Returns:
        A list of ``(token_id, count)`` tuples with every distinct id listed once,
        in ascending id order. An empty document yields an empty list.
    """
    ids, counts = np.unique(np.asarray(token_list), return_counts=True)
    return list(zip(ids.tolist(), counts.tolist()))


# The helper has its own name so that `bow_corpus` (the corpus) no longer overwrites the
# function that builds it; the cell can therefore be executed more than once.
bow_corpus = tokens_processed['token_id'].apply(token_ids_to_bow)
dct = Dictionary.from_corpus(bow_corpus)

# u_mass coherence of the topics produced by the model run above.
cm = CoherenceModel(topics=topics, corpus=bow_corpus, dictionary=dct, coherence='u_mass')
coherence = cm.get_coherence()
coherence

In [None]:
# # Parameter tuning

# # Topics range
# min_topics = 5
# max_topics = 50
# step_size = 5
# topics_range = range(min_topics, max_topics, step_size)

# # Alpha parameter
# alpha = list(np.arange(0.1, 5, 0.3))

# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))

# model_results = { 'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence_C_V': [],
#                  'Coherence_C_U_mass': []
#                 }

# #texts in format list of lists of str (with token ids)
# texts = tokens_processed['token_id'].apply(lambda x: x.tolist()).tolist()
# texts = [[str(item) for item in text] for text in texts]

The C_v coherence measure is based on a sliding window, a one-set segmentation of the top words, and an indirect confirmation measure that uses normalized pointwise mutual information (NPMI) and cosine similarity.

In [None]:
# def compute_coherence_values(k, W, a, b, tokens_processed):
#     global niter, save_step, model_dir, doc_pt, voca_pt, bow_corpus, dct, lng, the_page, texts
#     !mkdir -p {model_dir}
#     !make -C ../src/
#     !../src/btm est {k} {W} {a} {b} {niter} {save_step} {doc_pt} {model_dir}
#     topics, _ = display_topics(model_dir, k, voca_pt, tokens_processed, lng, the_page)
#     cm_cv = CoherenceModel(topics=topics, texts = texts, corpus=bow_corpus, dictionary=dct, coherence='c_v')
#     cm_umass = CoherenceModel(topics=topics, corpus=bow_corpus, dictionary=dct, coherence='u_mass')
#     c_v = cm_cv.get_coherence()
#     c_u_mass = cm_umass.get_coherence()
#     return c_v, c_u_mass

# if 1 == 1:
#     pbar = tqdm.tqdm(total=540)

#     # iterate through number of topics
#     for k in topics_range:
#         # iterate through alpha values
#         for a in alpha:
#             # iterate through beta values
#             for b in beta:
#                 # get the coherence score for the given parameters
#                 c_v, c_u_mass = compute_coherence_values(k, W, a, b, tokens_processed)
#                 # Save the model results
#                 model_results['Topics'].append(k)
#                 model_results['Alpha'].append(a)
#                 model_results['Beta'].append(b)
#                 model_results['Coherence_C_V'].append(c_v)
#                 model_results['Coherence_C_U_mass'].append(c_u_mass)

#                 pbar.update(1)
#     btm_cv = pd.DataFrame(model_results)#.to_csv('btm_tuning_results.csv', index=False)
#     pbar.close()

In [None]:
# Load the saved parameter-tuning results (the commented line below is the matching export).
# NOTE(review): the file name is relative, so it is resolved against the current working
# directory -- BTM/script once the `%cd BTM/script` cell has run.
#btm_cv.to_csv('btm_tuning_results.csv', index=False)
btm_cv = pd.read_csv('btm_tuning_results.csv')
btm_cv

In [None]:
# Row with the highest U_Mass coherence. idxmax() returns an index *label*, so the row is
# selected with label-based .loc (positional .iloc only matches on a default RangeIndex).
btm_cv.loc[btm_cv['Coherence_C_U_mass'].idxmax()]

In [None]:
# Row with the highest C_V coherence. idxmax() returns an index *label*, so the row is
# selected with label-based .loc (positional .iloc only matches on a default RangeIndex).
btm_cv.loc[btm_cv['Coherence_C_V'].idxmax()]  #high coherence because tokens mostly appear together in most of the documents 

In [None]:
print('Graph of coherence U_Mass:')
fig, axs = plt.subplots(3,1, figsize=(10, 10))

# Best (maximum) U_Mass coherence reached for each value of every tuned parameter.
btm_topics = btm_cv.groupby('Topics')['Coherence_C_U_mass'].agg('max')
btm_alpha = btm_cv.groupby('Alpha')['Coherence_C_U_mass'].agg('max')
btm_beta = btm_cv.groupby('Beta')['Coherence_C_U_mass'].agg('max')

# A Series plots its index (the parameter value) against its values, so no x=/y= arguments
# are needed; the axes are labelled explicitly so each panel can be read on its own.
for ax, best_scores in zip(axs, (btm_topics, btm_alpha, btm_beta)):
    best_scores.plot.line(ax=ax)
    ax.set_xlabel(best_scores.index.name)
    ax.set_ylabel('max Coherence_C_U_mass')
fig.tight_layout()

In [None]:
print('Graph of coherence C_V:')
fig, axs = plt.subplots(3,1, figsize=(10, 10))

# Best (maximum) C_V coherence reached for each value of every tuned parameter.
btm_topics = btm_cv.groupby('Topics')['Coherence_C_V'].agg('max')
btm_alpha = btm_cv.groupby('Alpha')['Coherence_C_V'].agg('max')
btm_beta = btm_cv.groupby('Beta')['Coherence_C_V'].agg('max')

# A Series plots its index (the parameter value) against its values, so no x=/y= arguments
# are needed; the axes are labelled explicitly so each panel can be read on its own.
for ax, best_scores in zip(axs, (btm_topics, btm_alpha, btm_beta)):
    best_scores.plot.line(ax=ax)
    ax.set_xlabel(best_scores.index.name)
    ax.set_ylabel('max Coherence_C_V')
fig.tight_layout()

In [None]:
#best_model 
# Re-runs `btm est` with hard-coded values in the argument positions used earlier for
# K, alpha and beta: 10 topics, alpha 2.8, beta 0.31; W, niter, save_step and the paths are reused.
# NOTE(review): presumably the best configuration from btm_tuning_results.csv -- TODO confirm.
# The relative ../src/ paths assume the working directory is still BTM/script.
!mkdir -p {model_dir}
!make -C ../src/
!../src/btm est {10} {W} {2.8} {0.31} {niter} {save_step} {doc_pt} {model_dir}

In [None]:
#output of model with best parameters
best_topics, orig_topics = display_topics(
    model_dir, 10, voca_pt, tokens_processed, lng, the_page
)

# Look every entry of every topic up in `vocab_dict` (after converting it to int).
best_topics_tokens = [
    [vocab_dict[int(token_id)] for token_id in topic]
    for topic in best_topics
]

In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# `color_func` reads the loop variable `i` at the moment a cloud is generated (inside the
# plotting loop below), so every topic is drawn in one colour; the modulo keeps the lookup
# valid when there are more topics than colours.
cloud = WordCloud(
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i % len(cols)],
                  prefer_horizontal=1.0)

# Parse each topic into (topic number, [(word, weight), ...]).
# The first element of every value of `orig_topics` is read as a whitespace-separated
# string of "word:weight" items; rsplit keeps words that themselves contain ':' intact.
best_topics_format = []
for i, it in enumerate(orig_topics.values()):
    items_format = []
    for item in it[0].split():
        word, weight = item.rsplit(':', 1)
        items_format.append((word, float(weight)))
    best_topics_format.append((i, items_format))

#topics = lda_model.show_topics(formatted=False)

# Two clouds per row; the number of rows follows the number of topics (5 rows for 10 topics).
n_topics = len(best_topics_format)
n_cols = 2
n_rows = max(1, -(-n_topics // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10,10), sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= n_topics:
        continue  # spare cell of the grid when the topic count is odd (or zero)
    topic_words = dict(best_topics_format[i][1])
    if not topic_words:
        continue  # nothing to draw for a topic without words
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# import gensim, spacy, logging, warnings
# import gensim.corpora as corpora

# id2word = corpora.Dictionary(best_topics_tokens)

# # Create Corpus: Term Document Frequency
# corpus = [id2word.doc2bow(text) for text in best_topics_tokens]

# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=4, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=10,
#                                            passes=10,
#                                            alpha='symmetric',
#                                            iterations=100,
#                                            per_word_topics=True)

In [None]:
# from sklearn.manifold import TSNE
# from bokeh.plotting import figure, output_file, show
# from bokeh.models import Label
# from bokeh.io import output_notebook

# # Get topic weights
# topic_weights = []
# for i, row_list in enumerate(lda_model[corpus]):
#     topic_weights.append([w for i, w in row_list[0]])

# # Array of topic weights    
# arr = pd.DataFrame(topic_weights).fillna(0).values

# # Keep the well separated points (optional)
# arr = arr[np.amax(arr, axis=1) > 0.35]

# # Dominant topic number in each doc
# topic_num = np.argmax(arr, axis=1)

# # tSNE Dimension Reduction
# tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
# tsne_lda = tsne_model.fit_transform(arr)

# # Plot the Topic Clusters using Bokeh
# output_notebook()
# n_topics = 4
# mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
# plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
#               plot_width=900, plot_height=700)
# plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
# show(plot)

In [None]:
# topic_weights = []
# for i, row_list in enumerate(lda_model[corpus]):
#     print(i, row_list)
#     topic_weights.append([w for i, w in row_list[0]])