# Dependency Installation and Repository Cloning

### Run it if you're using this notebook in Google Colab

Run this block only once

In [None]:
!pip install numpy==1.26.*
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

Clone the git repo and install deps

In [None]:
!git clone 'https://github.com/dakopecky/nlp-course-itmo.git'

%cd nlp-course-itmo
!git checkout hw5
%cd hw5

!pip install poetry
!poetry config virtualenvs.create false
!poetry install --no-ansi

# Topic modeling analysis on Lenta.ru dataset

Import deps

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# This code includes software developed by the following open-source projects:
# - Corus (License: MIT, Authors: Denis Emelin)
# - nltk (License: Apache License 2.0, Authors: Steven Bird, Edward Loper, Ewan Klein)
# - gensim (License: LGPL-2.1 License, Authors: Radim Rehurek, Petr Sojka and Gensim Contributors)
# - numpy (License: BSD-3-Clause license, Authors: NumPy Developers)
# - tqdm (License: MIT License, Authors: Noam Yorav-Raphael)
# - bigartm (License: BSD 3-Clause License, Authors: Konstantin Vorontsov and individual contributors)
# - pyLDAvis (License: BSD 3-Clause License, Authors: Ben Mabey and pyLDAvis Contributors)
# - Pymorphy2 (License: MIT, Authors: Mikhail Korobov)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.

# bigartm library is not available on some platforms,
# so poetry install in this project may fail for you,
# you can build bigartm by following this guide:
# https://bigartm.readthedocs.io/en/stable/installation/index.html


import os

import numpy as np
from tqdm import tqdm

from corus import load_lenta

import pymorphy2

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import artm

import gensim
from gensim.models import LdaModel
from gensim.corpora import UciCorpus
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

Download the "Lenta.ru News" dataset

In [None]:
!wget 'https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz' -O 'lenta-ru-news.csv.gz'

Define random state for reproducing

In [6]:
RANDOM_STATE = 42

## Loading dataset, Preprocessing & Vectorization

In [7]:
path = 'lenta-ru-news.csv.gz'
records = load_lenta(path)

N = 10000
texts = [record.text for _, record in zip(range(N), records)]

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [9]:
morph = pymorphy2.MorphAnalyzer()
stop_words = stopwords.words("russian")

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [morph.parse(token)[0].normal_form for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

tokenized_data = [preprocess(text) for text in tqdm(texts, desc="Preprocessing")]

dictionary = Dictionary(tqdm(tokenized_data, desc="Creating Dictionary"))
corpus = [dictionary.doc2bow(text) for text in tqdm(tokenized_data, desc="Creating Corpus")]

  args, varargs, kw, default = inspect.getargspec(cls.__init__)
Preprocessing: 100%|██████████| 10000/10000 [10:24<00:00, 16.00it/s]
Creating Dictionary: 100%|██████████| 10000/10000 [00:01<00:00, 5406.13it/s]
Creating Corpus: 100%|██████████| 10000/10000 [00:01<00:00, 8364.96it/s]


## Training and visualization of the LDA Model

In [10]:
params_list = [
    {'num_topics': 10, 'passes': 10, 'alpha': 'auto', 'iterations': 50},
    {'num_topics': 15, 'passes': 15, 'alpha': 'auto', 'iterations': 50},
    {'num_topics': 20, 'passes': 20, 'alpha': 'asymmetric', 'iterations': 100},
]

results = []

for params in tqdm(params_list, desc="Model Tuning"):
    temp_model = LdaModel(corpus=corpus, id2word=dictionary,
                          num_topics=params['num_topics'],
                          passes=params['passes'],
                          alpha=params['alpha'],
                          iterations=params['iterations'],
                          random_state=RANDOM_STATE)

    perplexity = temp_model.log_perplexity(corpus)

    coherence_model = CoherenceModel(model=temp_model, texts=tokenized_data, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()

    results.append({
        'params': params,
        'perplexity': perplexity,
        'coherence': coherence
    })

for res in results:
    print(f"\nParameters: {res['params']}")
    print(f"Perplexity: {res['perplexity']}")
    print(f"Coherence: {res['coherence']}")

Model Tuning: 100%|██████████| 3/3 [09:17<00:00, 185.81s/it]


Parameters: {'num_topics': 10, 'passes': 10, 'alpha': 'auto', 'iterations': 50}
Perplexity: -8.787862505790637
Coherence: 0.46243153876301124

Parameters: {'num_topics': 15, 'passes': 15, 'alpha': 'auto', 'iterations': 50}
Perplexity: -9.191671866728111
Coherence: 0.5332718064333754

Parameters: {'num_topics': 20, 'passes': 20, 'alpha': 'asymmetric', 'iterations': 100}
Perplexity: -9.49912234975262
Coherence: 0.5267851511637739





In [11]:
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=15, random_state=RANDOM_STATE,
                     passes=15, alpha='auto', iterations=50)

In [12]:
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

## Training and visualization of BigARTM Model

In [13]:
UciCorpus.serialize('corpus_uci.uci', corpus, dictionary)

os.rename('corpus_uci.uci', 'docword.gensim_corpus.txt')
os.rename('corpus_uci.uci.vocab', 'vocab.gensim_corpus.txt')

In [14]:
batch_vectorizer = artm.BatchVectorizer(data_path='./',
                                        data_format='bow_uci',
                                        collection_name='gensim_corpus',
                                        target_folder='my_artm_model_batches')

T = 20
num_collection_passes = 40

model_artm = artm.ARTM(cache_theta=True, num_topics=T, dictionary=batch_vectorizer.dictionary, seed=RANDOM_STATE)

model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.35))
model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.2))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=2))

model_artm.scores.add(artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary))
model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))

model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)

print(f"Perplexity: {model_artm.score_tracker['PerplexityScore'].last_value}")
print(f"Sparsity Phi: {model_artm.score_tracker['SparsityPhiScore'].last_value}")
print(f"Sparsity Theta: {model_artm.score_tracker['SparsityThetaScore'].last_value}")

Perplexity: 2551.0283203125
Sparsity Phi: 0.9247840642929077
Sparsity Theta: 0.6470800042152405


In [15]:
phi_matrix_np = model_artm.get_phi().values
theta_matrix_np = model_artm.transform(batch_vectorizer).values

doc_lengths = np.sum(theta_matrix_np, axis=0).astype(int)
term_frequency = np.zeros(len(dictionary))
for doc in corpus:
    for word_id, freq in doc:
        term_frequency[word_id] += freq

vocab = dictionary.values()

prepared_data = pyLDAvis.prepare(
    topic_term_dists=phi_matrix_np.T,
    doc_topic_dists=theta_matrix_np.T,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency
)

pyLDAvis.display(prepared_data)