In [None]:
import logging
import gensim

# Configure logging at INFO level with a timestamped message format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# NOTE(review): the '~' in the paths below is passed to gensim unchanged;
# presumably gensim's file opener expands it — confirm on the target machine.

# Dictionary (id -> word mapping), one of the results of step 2 above.
id2word = gensim.corpora.Dictionary.load_from_text(
    '~/wiki_corpus/wiki_corpus_wordids.txt'
)

# Corpus iterator over the bag-of-words file.
mm = gensim.corpora.MmCorpus(
    '~/wiki_corpus/wiki_corpus_bow.mm'
)
# Alternative if you compressed the TFIDF output (recommended):
# mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2')

print(mm)

In [None]:
# Load the previously saved TF-IDF model from disk instead of re-training it.
tfidf_model = gensim.models.TfidfModel.load('~/wiki_corpus/wiki_corpus.tfidf_model')

In [None]:
import pickle  

# Read the pickled document-number -> metadata mapping.
# NOTE(review): this path is relative to the working directory, unlike the
# '~/wiki_corpus/...' paths used for the other artefacts — confirm it is intended.
# NOTE(review): pickle.load executes arbitrary code; only use on a trusted file.
with open("wiki_corpus_bow.mm.metadata.cpickle", 'rb') as meta_file:
    docno2metadata = pickle.load(meta_file)

# Reverse lookup: second metadata field (printed as the title further down)
# -> document number. Each metadata value must be a 2-item sequence.
_dict = dict(
    (title, docno)
    for docno, (_, title) in docno2metadata.items()
)

In [None]:
# Pick the article to inspect; despite its name, page_id holds the key used in
# _dict (the second metadata field), not a numeric id.
page_id = 'John Logie Baird'
doc_num = _dict[page_id]

# Echo the stored title as a sanity check of the reverse lookup.
doc_metadata = docno2metadata[doc_num]
print("Title: {}".format(doc_metadata[1]))

In [None]:
# TF-IDF weights of the selected document, printed from heaviest to lightest.
vec = tfidf_model[mm[doc_num]]
vector = list(vec)
vector.sort(key=lambda entry: entry[1], reverse=True)
for term_id, weight in vector:
    print(id2word.get(term_id), ',', 'tfidf:', weight)


In [None]:
from wikiwho_wrapper import WikiWho
# WikiWho client configured with lng='en'.
ww = WikiWho(lng='en')
# Content for the article named by page_id; its 'token' column is consumed below.
# NOTE(review): presumably the latest revision, going by the method name — confirm.
df = ww.dv.last_rev_content(article=page_id) 


In [None]:
# Glue all tokens into one string, each token preceded by a single space
# (so the result starts with a space when there is at least one token).
ww_text = ''.join(' ' + word for word in df['token'])
    

In [None]:
# Remove wiki-markup brackets/braces and en dashes, in this fixed order.
for marker in ('[[', ']]', '}}', '{{', '–'):
    ww_text = ww_text.replace(marker, '')

In [None]:
from rake_nltk import Rake

# Default Rake(): uses the English stopwords from NLTK and all punctuation characters.
r = Rake()
r.extract_keywords_from_text(ww_text)

# Word -> degree mapping computed by RAKE for the extracted text.
wd = r.get_word_degrees()

# Cell output: (word, degree) pairs, highest degree first.
sorted(
    wd.items(),
    key=lambda word_degree: word_degree[1],
    reverse=True,
)

### TFIDF and RAKE with chobs

In [None]:
from wikiwho_chobj import Chobjer
import pandas as pd
# Chobjer for article "39570" with a context setting of 5.
co = Chobjer(article="39570", pickles_path='../../bert', lang='en', context=5)

# First iteration: supplies the rows of the frame.
chobj_rows = co.iter_chobjs()
# Second, independent iteration: only its first item is pulled, to read the
# column names from its keys.
chobj_columns = next(co.iter_chobjs()).keys()

chobs = pd.DataFrame(chobj_rows, columns=chobj_columns)

In [None]:
# Fragments removed from every context string, applied in this order.
# The apostrophe was previously stripped from the left context only; it is now
# stripped on both sides so left and right go through identical pre-processing.
_MARKUP_FRAGMENTS = ('[[', ']]', '}}', '{{', '–', '\'')


def _clean_context(tokens):
    """Join context tokens with single spaces and strip wiki-markup fragments.

    Args:
        tokens: iterable of token strings (one cell of a ``*_token_str`` column).

    Returns:
        The space-joined text with every fragment in ``_MARKUP_FRAGMENTS`` removed.
    """
    text = ' '.join(tokens)
    for fragment in _MARKUP_FRAGMENTS:
        text = text.replace(fragment, '')
    return text


# Iterate over the column values directly: this does not depend on the frame's
# index labels matching list positions, which the previous iterrows() loop
# silently relied on.
left_tokens = [_clean_context(tokens) for tokens in chobs['left_token_str']]
right_tokens = [_clean_context(tokens) for tokens in chobs['right_token_str']]

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" resource so the stopwords corpus can be read below.
nltk.download("stopwords")

In [None]:
# Build the English stop-word set once. Evaluating stopwords.words('english')
# inside the comprehension produced the whole list again for every word and
# tested membership linearly; a set is built once and looked up in constant time.
english_stopwords = set(stopwords.words('english'))

# One list of lower-cased, whitespace-split, stop-word-free terms per left context.
left_words = [
    [word for word in line.lower().split() if word not in english_stopwords]
    for line in left_tokens
]

# Term <-> id dictionary, bag-of-words corpus, and TF-IDF model for the left contexts.
dct_left = gensim.corpora.Dictionary(left_words)
corpus_left = [dct_left.doc2bow(words) for words in left_words]
model_left = gensim.models.TfidfModel(corpus_left)

In [None]:
# For every left-context document, print its terms from highest to lowest TF-IDF weight.
for bow in corpus_left:
    vec = model_left[bow]
    vector = list(vec)
    vector.sort(key=lambda entry: entry[1], reverse=True)
    for term_id, weight in vector:
        print(dct_left.get(term_id), ',', 'tfidf:', weight)


In [None]:
# Build the English stop-word set once. Evaluating stopwords.words('english')
# inside the comprehension produced the whole list again for every word and
# tested membership linearly; a set is built once and looked up in constant time.
english_stopwords = set(stopwords.words('english'))

# One list of lower-cased, whitespace-split, stop-word-free terms per right context.
right_words = [
    [word for word in line.lower().split() if word not in english_stopwords]
    for line in right_tokens
]

# Term <-> id dictionary, bag-of-words corpus, and TF-IDF model for the right contexts.
dct_right = gensim.corpora.Dictionary(right_words)
corpus_right = [dct_right.doc2bow(words) for words in right_words]
model_right = gensim.models.TfidfModel(corpus_right)

In [None]:
# For every right-context document, print its terms from highest to lowest TF-IDF weight.
for bow in corpus_right:
    vec = model_right[bow]
    vector = sorted(vec, key=lambda tup: tup[1], reverse=True)
    for term_id, weight in vector:
        # Resolve ids with dct_right: corpus_right was produced by
        # dct_right.doc2bow, so its ids belong to that dictionary. Looking them
        # up in dct_left (as before) printed unrelated terms or None.
        print(dct_right.get(term_id), ',', 'tfidf:', weight)