In [1]:
import logging
import gensim

# Show INFO-level log records as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# id -> word mapping (the dictionary), read back from its text dump.
id2word = gensim.corpora.Dictionary.load_from_text(
    '~/wiki_corpus/wiki_corpus_wordids.txt'
)

# Corpus iterator over the .mm file (bag-of-words, judging by the file name).
mm = gensim.corpora.MmCorpus(
    '~/wiki_corpus/wiki_corpus_bow.mm'
)
# Alternative if you compressed the TFIDF output (recommended):
# mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2')

# Print the corpus summary.
print(mm)

2019-07-25 15:31:59,422 : INFO : loaded corpus index from ~/wiki_corpus/wiki_corpus_bow.mm.index
2019-07-25 15:31:59,424 : INFO : initializing cython corpus reader from ~/wiki_corpus/wiki_corpus_bow.mm
2019-07-25 15:31:59,435 : INFO : accepted corpus with 4677490 documents, 100000 features, 741942401 non-zero entries


MmCorpus(4677490 documents, 100000 features, 741942401 non-zero entries)


In [2]:
# Load a TfidfModel object from the given file; it is applied to a
# bag-of-words document further down (`tfidf_model[mm[doc_num]]`).
tfidf_model = gensim.models.TfidfModel.load('~/wiki_corpus/wiki_corpus.tfidf_model')

2019-07-25 15:31:59,442 : INFO : loading TfidfModel object from ~/wiki_corpus/wiki_corpus.tfidf_model
2019-07-25 15:31:59,688 : INFO : loading id2word recursively from ~/wiki_corpus/wiki_corpus.tfidf_model.id2word.* with mmap=None
2019-07-25 15:31:59,689 : INFO : loaded ~/wiki_corpus/wiki_corpus.tfidf_model


In [3]:
import pickle  

# Read the corpus metadata: a mapping from document number to a 2-tuple whose
# second element is used as the article title below.
# NOTE(review): pickle.load can execute arbitrary code -- only use this on a
# metadata file you produced yourself or otherwise trust.
with open("wiki_corpus_bow.mm.metadata.cpickle", 'rb') as meta_file:
    docno2metadata = pickle.load(meta_file)

# Reverse lookup: second tuple element (title) -> document number.
# The first tuple element is ignored (presumably the wiki page id -- confirm).
# If two documents share a title, the one iterated last wins.
_dict = dict(
    (title, docno)
    for docno, (_first, title) in docno2metadata.items()
)

In [17]:
# NOTE: despite its name, `page_id` holds the article *title*; it is the key
# of the title -> document-number lookup and is later passed as `article=`.
page_id = 'John Logie Baird'
doc_num = _dict[page_id]

# Sanity check: the metadata stored for that document number should show the
# same title.
doc_meta = docno2metadata[doc_num]
print("Title: {}".format(doc_meta[1]))

Title: John Logie Baird


In [29]:
# Apply the TF-IDF model to the selected document's corpus entry; the result
# is consumed below as (word id, weight) pairs.
vec = tfidf_model[mm[doc_num]]

# Order the pairs by weight, largest first.
vector = list(vec)
vector.sort(key=lambda entry: entry[1], reverse=True)

# One line per term: "<word> , tfidf: <weight>".
for term_id, weight in vector:
    print(id2word.get(term_id), ',', 'tfidf:', weight)


baird , tfidf: 0.8669171683854082
television , tfidf: 0.2429489674922876
logie , tfidf: 0.19770613363159387
system , tfidf: 0.10279001416551234
colour , tfidf: 0.1004621668460217
demonstration , tfidf: 0.0703660638893353
bbc , tfidf: 0.07009525017499853
electronic , tfidf: 0.06687459339686158
image , tfidf: 0.06650748854043548
demonstrated , tfidf: 0.05637278499837029
radar , tfidf: 0.05632979888227803
patent , tfidf: 0.054976992287933436
helensburgh , tfidf: 0.05399962765724461
london , tfidf: 0.05227949785041346
transmission , tfidf: 0.05094144509694618
glasgow , tfidf: 0.05025095517923627
korn , tfidf: 0.044864230325756156
broadcast , tfidf: 0.04357590701697164
transmitting , tfidf: 0.03952968437676656
images , tfidf: 0.03619769629268099
farnsworth , tfidf: 0.03493307883644139
mechanical , tfidf: 0.03399120649958646
inventors , tfidf: 0.03169270753523256
scanned , tfidf: 0.0316896258482618
line , tfidf: 0.0315195442048201
palace , tfidf: 0.030447178124556756
scanning , tfidf: 0.0298

experimented , tfidf: 0.009377053650752533
remarkably , tfidf: 0.009375018673529871
health , tfidf: 0.009337963717337524
reported , tfidf: 0.009337316361345974
vertically , tfidf: 0.009333815888478357
cables , tfidf: 0.00932364985670356
wax , tfidf: 0.009260558164687764
oceans , tfidf: 0.009259833008943742
committee , tfidf: 0.009224517303702284
ground , tfidf: 0.009218652344670375
reprint , tfidf: 0.00915914152639823
blue , tfidf: 0.009148891670729457
success , tfidf: 0.009145983707403335
fire , tfidf: 0.009128935761369769
coliseum , tfidf: 0.009122278991831606
antony , tfidf: 0.009101628564903857
pal , tfidf: 0.009093160610430511
jessie , tfidf: 0.009082157959444882
irvine , tfidf: 0.009047602266939951
patented , tfidf: 0.009001291790974456
discs , tfidf: 0.008989476626205497
crossroads , tfidf: 0.008966477243081144
moderately , tfidf: 0.008964327629107698
paddy , tfidf: 0.008944763452612711
filter , tfidf: 0.008942409086949559
disastrous , tfidf: 0.008929175113872206
rid , tfidf: 0.

hotel , tfidf: 0.005631544825881497
secret , tfidf: 0.005626100288232216
ray , tfidf: 0.0056188316797975086
looking , tfidf: 0.0056075167862944
introduction , tfidf: 0.005605597834144688
concept , tfidf: 0.005594147753904167
obtained , tfidf: 0.005576896179519273
god , tfidf: 0.00554477423896985
arthur , tfidf: 0.005544686508737457
rate , tfidf: 0.005526857614832826
suffered , tfidf: 0.005514669195504465
levels , tfidf: 0.005512864074328502
industrial , tfidf: 0.005510632133517966
publishing , tfidf: 0.0055092357750298
fell , tfidf: 0.005501054974152358
ability , tfidf: 0.005498074466024262
bar , tfidf: 0.005491150695482157
adopted , tfidf: 0.005488225783314027
entry , tfidf: 0.00548809296662901
surface , tfidf: 0.005482308622200235
wood , tfidf: 0.005481310644895632
buried , tfidf: 0.005464278281421516
inspired , tfidf: 0.005462401501358967
capacity , tfidf: 0.00546097764817114
web , tfidf: 0.005457861701746537
purchased , tfidf: 0.005455189222540453
job , tfidf: 0.0054495758108229105

In [19]:
from wikiwho_wrapper import WikiWho
# WikiWho client configured with lng='en' (presumably the English Wikipedia
# edition -- confirm against the wikiwho_wrapper documentation).
ww = WikiWho(lng='en')
# Fetch the content of the article's last revision (per the method name).
# `page_id` is the article title set earlier. The result is indexed with
# df['token'] in the next cell, so it presumably is a DataFrame with one
# row per token -- TODO confirm. Likely a remote API call.
df = ww.dv.last_rev_content(article=page_id) 


In [20]:
# Concatenate all tokens into one text, each token preceded by a single space
# (so a non-empty result starts with a space, exactly as the previous
# incremental concatenation produced; an empty token column yields '').
# str.join builds the string in one pass; the former
# `ww_text = ww_text + ' ' + word` loop re-copied the growing text for every
# token, which is quadratic in the article length.
ww_text = ''.join(' ' + word for word in df['token'])
    

In [25]:
# Delete wiki link brackets, template braces and en dashes from the text.
# The markers are removed one after another in this exact order.
for marker in ('[[', ']]', '}}', '{{', '–'):
    ww_text = ww_text.replace(marker, '')

In [26]:
from rake_nltk import Rake

r = Rake() # Uses stopwords for English from NLTK, and all punctuation characters.
r.extract_keywords_from_text(ww_text)
wd = r.get_word_degrees()
# Cell result: (word, degree) pairs ordered by degree, highest first.
sorted(wd.items(), key=lambda kv: kv[1], reverse=True)

[('television', 230),
 ('baird', 168),
 ('system', 95),
 ('first', 92),
 ('colour', 66),
 ('category', 53),
 ('logie', 41),
 ('scottish', 40),
 ('electronic', 39),
 ('john', 37),
 ('broadcast', 36),
 ('image', 32),
 ('line', 30),
 ('company', 29),
 ('early', 28),
 ('used', 27),
 ('using', 26),
 ('bbc', 24),
 ('images', 24),
 ('august', 23),
 ('january', 22),
 ('british', 21),
 ('1888', 21),
 ('1946', 21),
 ('also', 20),
 ('14', 20),
 ('tv', 19),
 ('would', 19),
 ('palace', 18),
 ('ltd', 17),
 ('pictures', 17),
 ('telechrome', 17),
 ('signal', 16),
 ('13', 16),
 ('london', 16),
 ('nipkow', 16),
 ('moving', 16),
 ('two', 16),
 ('2015', 16),
 ('june', 16),
 ('picture', 15),
 ('demonstrated', 15),
 ('crystal', 15),
 ('fully', 15),
 ('3', 15),
 ('mechanical', 15),
 ('—', 15),
 ('years', 15),
 ('including', 15),
 ('february', 14),
 ('working', 14),
 ('live', 14),
 ('2', 13),
 ('october', 13),
 ('people', 13),
 ('apparatus', 13),
 ('pioneer', 13),
 ('many', 13),
 ('electron', 13),
 ('high', 1