In [7]:
import textacy
import gensim
import textacy.datasets
# Handle for textacy's English Wikinews dataset, "current" version.
# NOTE(review): `ds` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
ds = textacy.datasets.Wikinews(lang="en", version="current")
# Dataset helper calls, left disabled:
#ds.download()
#ds.info


In [8]:
import logging
# Surface INFO-level library log records in the cell outputs, formatted as
# "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [9]:
def process_doc(corpus, tokens_only=False):
    """Lazily turn each document in ``corpus`` into gensim-ready tokens.

    Stop-word tokens (``token.is_stop``) are dropped, the remaining token
    texts are joined with single spaces, and the resulting string is passed
    through ``gensim.utils.simple_preprocess``.

    Args:
        corpus: Iterable of documents; each document is itself iterable and
            yields tokens exposing ``.text`` and ``.is_stop``.
        tokens_only: When true, yield the bare token list for each document.
            When false (the default), wrap the tokens in a
            ``gensim.models.doc2vec.TaggedDocument`` whose single tag is the
            document's position in ``corpus``.

    Yields:
        One item per document, in corpus order.
    """
    for doc_index, doc in enumerate(corpus):
        kept_words = [token.text for token in doc if not token.is_stop]
        tokens = gensim.utils.simple_preprocess(" ".join(kept_words))
        # Training data carries a tag; inference data is the bare token list.
        yield (
            tokens
            if tokens_only
            else gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        )


In [10]:
# Load the previously saved textacy corpus from disk; "en_core_web_trf" names
# the spaCy language pipeline used to load it (see the log line this cell emits).
corpus = textacy.Corpus.load("en_core_web_trf", "../../models/P76I900/HW4/wikinews_4cls_corpus.bin.gz")

2021-12-07 20:54:01,570 : INFO : loaded 'en_core_web_trf' spaCy language pipeline


In [17]:
# Tokenise the corpus once and keep the tagged documents in memory; Doc2Vec
# iterates over them several times (vocabulary scan plus every training epoch).
train_corpus = list(process_doc(corpus))

# Bare token lists for the *same* documents, taken from the tagged documents
# already built instead of running the whole corpus through process_doc a
# second time. Each list is copied so that editing one collection cannot
# silently change the other.
# NOTE: despite the name this is not a held-out split — it contains exactly the
# training documents, without tags.
test_corpus = [list(tagged_doc.words) for tagged_doc in train_corpus]

In [18]:
# Doc2Vec hyper-parameters: 50-dimensional vectors, min_count=8 (the vocabulary
# log below shows this keeps 4989 of 23306 word types), 40 training epochs.
# All other settings are left at gensim's defaults.
# NOTE(review): the training log reports 3 worker threads; multi-threaded
# training may not be bit-for-bit reproducible — confirm whether workers=1 is
# wanted when exact reproducibility matters.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=8, epochs=40)

2021-12-07 20:59:21,573 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc8,s0.001,t3)', 'datetime': '2021-12-07T20:59:21.573580', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [19]:
# Scan the tagged training documents to collect word counts and document tags,
# then build the vocabulary (min_count filtering is applied here, per the log).
model.build_vocab(train_corpus)

2021-12-07 20:59:23,024 : INFO : collecting all words and their counts
2021-12-07 20:59:23,025 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-12-07 20:59:23,076 : INFO : collected 23306 word types and 741 unique tags from a corpus of 741 examples and 226533 words
2021-12-07 20:59:23,077 : INFO : Creating a fresh vocabulary
2021-12-07 20:59:23,107 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=8 retains 4989 unique words (21.406504762722047%% of original 23306, drops 18317)', 'datetime': '2021-12-07T20:59:23.107126', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-12-07 20:59:23,108 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=8 leaves 187545 word corpus (82.78926249155752%% of original 226533, drops 38988)', 'datetime': '2021-12-07T20:59:23.108175', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 

In [20]:
# Train on the tagged documents, using the example count and epoch count
# already stored on the model rather than repeating the literals.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2021-12-07 20:59:25,454 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 4989 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2021-12-07T20:59:25.454509', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
2021-12-07 20:59:25,655 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-12-07 20:59:25,662 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-12-07 20:59:25,663 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-12-07 20:59:25,664 : INFO : EPOCH - 1 : training on 226533 raw words (184518 effective words) took 0.2s, 895704 effective words/s
2021-12-07 20:59:25,821 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-12-07 20:59:25,829 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-12-07 20

In [63]:
# Write the trained model to the same directory the corpus was loaded from.
# NOTE(review): this cell's execution count (63) is higher than those of the
# query cells that follow it (52, 62), so the notebook was not last run
# top-to-bottom — re-run from a fresh kernel before relying on the saved file.
model.save("../../models/P76I900/HW4/4cls_doc2vec.model")

2021-12-07 21:14:22,828 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '../../models/P76I900/HW4/4cls_doc2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-12-07T21:14:22.828821', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
2021-12-07 21:14:22,830 : INFO : not storing attribute cum_table
2021-12-07 21:14:22,836 : INFO : saved ../../models/P76I900/HW4/4cls_doc2vec.model


In [52]:
# Sanity check: re-infer a vector for the tokens of training document 0 and
# list the five closest stored document vectors with their similarity scores.
t0 = train_corpus[0].words
iv = model.infer_vector(t0)
sims = model.dv.most_similar([iv], topn=5)

# The loop variable is `doc_id`, not `id`: at notebook scope `id` would rebind
# the built-in id() for every cell executed afterwards.
for doc_id, sim in sims:
    print(doc_id, sim)


552 0.9759148359298706
0 0.9750752449035645
189 0.6032386422157288
594 0.6012837290763855
674 0.5979137420654297


In [62]:
# Free-text query: infer a vector for the query and show the three closest
# documents, then print the best match.
input_sent = "covid death"
# Tokenise the query with the same gensim preprocessing that process_doc
# applies to the training text, so query tokens are normalised the same way as
# the vocabulary. (For this particular query the result equals a plain
# whitespace split.)
iv = model.infer_vector(gensim.utils.simple_preprocess(input_sent))
sims = model.dv.most_similar([iv], topn=3)

# `doc_id` rather than `id`, so the built-in id() is not rebound at notebook scope.
for doc_id, sim in sims:
    print(doc_id, sim)

# sims is ordered best-first; element [0][0] is the tag of the top match, which
# process_doc set to the document's position in `corpus`.
print(corpus[sims[0][0]])

369 0.6040047407150269
372 0.5973949432373047
617 0.5838903784751892
Monday, March 16, 2020  COVID-19 Related articles 26 October 2021: UK pay freeze on public sector employees will end next year 12 October 2021: Texas governor bans COVID-19 vaccine mandate by any 'entity' 3 October 2021: Rosemary Cousin, Greens candidate in South Gippsland, Australia, speaks to Wikinews about democracy, transport, forests and other local issues 3 October 2021: Australia: Wikinews interviews Les Harmer, South Gippsland local election candidate 27 September 2021: Australia: Wikinews interviews David Liebelt, South Gippsland local election candidate 2019-nCoV, which causes COVID-19 Collaborate! Pillars of Wikinews writing Writing an article On Friday, United States President Donald Trump declared a national emergency in response to the ongoing coronavirus outbreak. The move came two days after the World Health Organization declared the outbreak, which causes the COVID-19 disease, to be a pandemic. The mo