In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from collections import Counter
from itertools import product


import bitermplus as btm
import tmplot as tmp

import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import re

import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel


In [5]:

#LDA==========================================================================================================
#https://radimrehurek.com/gensim/models/ldamodel.html

#read the txt file
with open(r'cleaned_tweets.txt') as file:
    data = file.readlines()

# create a dataframe
df = pd.DataFrame(data, columns=['text'])

# write dataframe to csv file
df.to_csv('cleaned_tweets.csv', index=False)
data = pd.read_csv('cleaned_tweets.csv')

# preprocess text data
stop_words = stopwords.words('english')

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stop_words:
            result.append(token)
    return result

data['processed_text'] = data['text'].apply(preprocess)

# Tokenize the documents
tokenized_docs = [doc.split() for doc in data]
# create dictionary and corpus
dictionary = corpora.Dictionary(data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

# Define the range of hyperparameter values to search over
num_topics_list = [10]
passes_list = [5, 10, 15]
alpha_list = ['symmetric',0.3,0.5,0.7]

# Perform a grid search over the hyperparameter values
results = []
for num_topics, passes , alpha in product(num_topics_list, passes_list,alpha_list):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,alpha=alpha,)
    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    results.append({'num_topics': num_topics,'alpha':alpha, 'passes': passes, 'coherence_score': coherence_score})

# Find the hyperparameters with the highest coherence score
best_params = max(results, key=lambda x: x['coherence_score'])
print('Best parameters:', best_params)


#Best parameters: {'num_topics': 10, 'alpha': 'symmetric', 'passes': 5, 'coherence_score': nan}


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Best parameters: {'num_topics': 10, 'alpha': 'symmetric', 'passes': 5, 'coherence_score': nan}


In [None]:
# train LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, alpha='symmetric',passes=5)

# print top topics and their keywords
for topic in lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False):
    print('Topic {}: {}'.format(topic[0], [word[0] for word in topic[1]]))


In [6]:
#Biterm========================================================================================================
#https://bitermplus.readthedocs.io/en/latest/index.html
#IMPORTING DATA
df = pd.read_csv(
    'cleaned_tweets.csv', header=None, names=['texts'])
texts = df['texts'].str.strip().tolist()
# PREPROCESSING
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

# Define hyperparameters to tune
M_list = [10, 15, 20]
alpha_list = [0.01, 0.1, 1, 10]
beta_list = [0.01, 0.1, 1, 10]

# Initialize variables to hold best results
best_score = 0
best_params = {}

# Grid search
for M in M_list:
    for alpha in alpha_list:
        for beta in beta_list:
            biterm_model = btm.BTM( X, vocabulary, T=10, M=M, alpha=alpha, beta=beta)
            biterms = biterm_model.fit_transform(docs_vec,biterms)
            score = biterm_model.get_word_score(biterms)
            if score > best_score:
                best_score = score
                best_params = {'M': M, 'alpha': alpha, 'beta': beta}

# Print best parameters and score
print('Best parameters: {}'.format(best_params))
print('Best score: {}'.format(best_score))




KeyboardInterrupt: 

In [None]:
# INITIALIZING AND RUNNING MODEL
model = btm.BTM(
    X, vocabulary, T=10, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

#METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)

topics = btm.get_top_topic_words(
    model,
    words_num=10,)
print(topics)

In [None]:
#distilbert===================================================================================================
# load the DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

#get document
def get_tweet_embeddings(tweets):
    tweet_embeddings = []
    for tweet in tweets:
        print (len(tweet_embeddings))
        # Remove URLs and mentions
        tweet = re.sub(r"http\S+|@\S+", "", tweet)
        # Tokenize the tweet
        tokens = tokenizer.encode(tweet, add_special_tokens=True)
        # Convert tokens to tensor
        tokens_tensor = torch.tensor(tokens).unsqueeze(0)
        # Get the embeddings
        with torch.no_grad():
            outputs = model(tokens_tensor)
        tweet_embedding = outputs[0][:, 0, :].numpy().squeeze()
        tweet_embeddings.append(tweet_embedding)
    return tweet_embeddings

#perform LDA in document
def perform_lda(embeddings, num_topics):
    print("performing lda training")
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    dtm = vectorizer.fit_transform(embeddings)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(dtm)
    return lda, dtm, vectorizer
#train
#read the txt file
df = pd.read_csv(
    'cleaned_tweets.csv', header=None, names=['texts'])
tweets = df['texts'].str.strip().tolist()
tweet_embeddings = get_tweet_embeddings(tweets)
lda, dtm, vectorizer = perform_lda(tweet_embeddings, num_topics=2)

for i, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-5-1:-1]
    top_words = [vectorizer.get_feature_names()[i] for i in top_words_idx]
    print(f'Topic {i}: {", ".join(top_words)}')

In [None]:
print("\n\n==========================================COHERENCE=============================================\n\n")

#Reg exp for tokenizing the data set
tokenizer = lambda s: re.findall('\w+', s.lower())
text = [tokenizer(t) for t in data]

# Getting Topics
all_topics = topic_model.get_topics()
top = []
keys = []
for x in range(10):
    keys.append(freq['Topic'].head(10)[x])

#Tokenizing
prefix = 'Getting Topics'
pbar2 = tqdm(total=len(keys), position=0, leave=True)
pbar2.set_description(prefix)
for key in tqdm(keys, desc='Getting Topics', position=0, leave=True):
    values = all_topics[key]
    topic_1 = []
    for value in tqdm(values, desc='Retrieving Values in topic ' + str(key), position=0, leave=True):
        topic_1.append(value[0])
    top.append(topic_1)

# Creating a dictionary with the vocabulary
word2id = Dictionary(text)
vec = CountVectorizer()
X = vec.fit_transform(data).toarray()
vocab = np.array(vec.get_feature_names())
# Coherence model
cm = CoherenceModel(topics=top, texts=text, coherence='u_mass', dictionary=word2id)
coherence_per_topic = cm.get_coherence_per_topic()
#Results
print("\n==========================================COHERENCE RESULTS=============================================\n")
for index, x in enumerate(coherence_per_topic):
    print("topic %2d : %5.2f" % (index + 1, x))

coherence = cm.get_coherence()
print(coherence)



In [None]:
#bertopic===================================================================================================

# Read Stopwords_EN_TL.txt and save it into a pandas DataFrame
stop_words_dataframe = pd.read_csv("Stopwords_EN_TL.txt")
stop_words = set(stop_words_dataframe.iloc[:,0])
# Read csv and save into a pandas DataFrame
docs_dataframe = pd.read_csv("cleaned_tweets.txt")
# Remove stopwords for every comment and clean the dataset
docs = []
index = 0
for w in docs_dataframe.iloc[:,0].items():
    series = hero.remove_stopwords(pd.Series(w[1]),stop_words)
    series = hero.preprocessing.clean(series)
    docs.append(series[0])
# Output the cleaned dataset to an excel file
cleaned_dataset = pd.DataFrame(docs)
cleaned_dataset.to_excel("cleaned_tweets.xlsx")
# Initialize the model and fit it to the data

# Hyperparameters:
# language - "english" or "multilingual"
# top_n_words - the top_n_words in each topic (no effect)
# n_gram_range - the n-gram to be used by the vectorizer in the model (no effect / incoherent)
# min_topic_size - how big a topic should be, adjusted to be similar to LDA
# nr_topics - topic reduction, made topics more incoherent

topic_model = BERTopic(min_topic_size=25, language = "multilingual")
topics, probs = topic_model.fit_transform(docs)
# Print the topics found by the model
topics = topic_model.get_topic_info()
topics.to_excel("output.xlsx")
topics
# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Extract features for Topic Coherence evaluation
tokens = [tokenizer(doc) for doc in docs]
dictionary = corpora.Dictionary(tokens)

topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 dictionary=dictionary, 
                                 coherence='c_v')

# Print Coherence
coherence = coherence_model.get_coherence()
coherence
topic_model.visualize_barchart()