In [15]:
import pandas as pd
import re
import string

# Load the processed tweets; point the path below at your copy of the dataset.
df = pd.read_csv('data/processed/en_tweets_processed.csv')
# Keep only rows tagged as English that also have a non-missing text field.
df = df[(df.language == 'en') & df['text'].notna()]
# Plain Python list of tweet texts, in DataFrame row order.
preprocessed_documents = df['text'].tolist()


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import matutils

# Build the TF-IDF representation of the documents. With these settings the
# vectorizer ignores terms whose document frequency is above 0.9 (a proportion)
# or below 4 (an absolute count), and removes the built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=4,
    max_df=0.9,
)
tfidf = tfidf_vectorizer.fit_transform(preprocessed_documents)
# Vocabulary terms; later used to map NMF component indices back to words.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()


In [17]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from multiprocessing import Pool

def preprocess_document(doc, tfidf_feature_names_set):
    """Tokenize ``doc`` and keep only the tokens present in the vocabulary.

    The document is lower-cased and split on whitespace; every resulting
    token that is a member of ``tfidf_feature_names_set`` is kept, in its
    original order and including repeats.

    Parameters
    ----------
    doc : str
        Raw document text.
    tfidf_feature_names_set : set
        Vocabulary to filter against (a set, so membership tests are cheap).

    Returns
    -------
    list of str
        The surviving tokens.
    """
    # NOTE(review): whitespace splitting leaves punctuation attached to tokens,
    # so "war," will not match a vocabulary entry "war" -- confirm the input
    # text has already been stripped of punctuation.
    kept_tokens = []
    for token in doc.lower().split():
        if token in tfidf_feature_names_set:
            kept_tokens.append(token)
    return kept_tokens


def train_NMF(n_components):
    """Fit an NMF topic model on the TF-IDF matrix and score its topics.

    Relies on the notebook-level ``tfidf``, ``tfidf_feature_names`` and
    ``preprocessed_documents`` created in earlier cells.

    Parameters
    ----------
    n_components : int
        Number of topics (NMF components) to extract.

    Returns
    -------
    tuple
        ``(coherence_score, topics)``. ``coherence_score`` is the value
        returned by gensim's ``CoherenceModel.get_coherence()`` with
        ``coherence='c_v'``. ``topics`` is a list with one entry per
        component, each a list of that component's 10 highest-weighted
        feature names, strongest first.
    """
    model = NMF(n_components=n_components, init='random', random_state=0)
    # Only the topic-term matrix is needed below; the document-topic matrix
    # returned by fit_transform was never used, so plain fit() is enough.
    model.fit(tfidf)
    H = model.components_

    # Extract the top words for each topic.
    n_top_words = 10
    topics = [
        # argsort is ascending, so the reversed tail slice yields the indices
        # of the n_top_words largest weights in descending order.
        [tfidf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in H
    ]

    # Convert tfidf_feature_names to a set for faster lookup.
    tfidf_feature_names_set = set(tfidf_feature_names)

    # Tokenize serially. A multiprocessing Pool is unreliable for functions
    # defined in a notebook's __main__ (it fails under the "spawn" start
    # method), and the per-document work is too small to repay the cost of
    # pickling every document to the workers and back.
    texts = [
        preprocess_document(doc, tfidf_feature_names_set)
        for doc in preprocessed_documents
    ]

    # Create a Gensim dictionary from the filtered token lists.
    dictionary = Dictionary(texts)

    # Calculate the coherence score using Gensim. The model is given the
    # tokenized texts and the dictionary; no bag-of-words corpus is passed,
    # so none is built here.
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    print('Coherence Score: ', coherence_score, n_components)
    return coherence_score,topics

In [18]:
# Topic counts to evaluate: 5, 10, ..., 50.
n_components = list(range(5, 51, 5))

# Parallel result lists: position i holds the outcome for n_components[i].
coherences, all_topics = [], []
for _n_components in n_components:
    coherence_score, topics = train_NMF(_n_components)
    all_topics.append(topics)
    coherences.append(coherence_score)
    

Coherence Score:  0.5817666666622392 5
Coherence Score:  0.5731984235089588 10
Coherence Score:  0.5155584780161389 15
Coherence Score:  0.501634068800048 20
Coherence Score:  0.48121169629433086 25
Coherence Score:  0.4499379122372345 30
Coherence Score:  0.4355715944951517 35




Coherence Score:  0.44321204462994324 40
Coherence Score:  0.4457580068424277 45
Coherence Score:  0.442477204365008 50


In [20]:
# Show the topics from the first run, i.e. the first entry of n_components (5 topics).
all_topics[0]

[['nuclear',
  'plant',
  'power',
  'zaporizhzhia',
  'russia',
  'disaster',
  'europe',
  'largest',
  'grid',
  'warns'],
 ['hire',
  'passive',
  'term',
  'awesome',
  'website',
  'create',
  'income',
  'autopilot',
  'developer',
  'clickbank'],
 ['war',
  'russia',
  'putin',
  'like',
  'people',
  'world',
  'just',
  'don',
  'stop',
  'day'],
 ['russian',
  'ukrainian',
  'military',
  'video',
  'forces',
  'air',
  'explosions',
  'defense',
  'region',
  'occupied'],
 ['ukraine',
  'support',
  'help',
  'glory',
  'aid',
  'weapons',
  'new',
  'million',
  'russia',
  'crimea']]