In [None]:
from datetime import datetime, timedelta,timezone
from db import Model, Session, engine
from models import Tweet, Company
from sqlalchemy import select

import pandas as pd
import numpy as np
import emoji

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re #regular expression
import spacy
from matplotlib import pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis, pyLDAvis.lda_model

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
from gensim.models.coherencemodel import CoherenceModel

from bertopic import BERTopic

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline as hf_pipeline

import joblib






In [None]:
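# One-time setup: uncomment and run these once per environment to download the
# NLTK resources used by word_tokenize ('punkt') and WordNetLemmatizer ('wordnet').
# nltk.download('punkt')
# nltk.download('wordnet')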


In [None]:
# Switch pyLDAvis to notebook mode so prepared visualisations render inline in cell output.
pyLDAvis.enable_notebook()

In [None]:
# Display scikit-learn estimators and pipelines as diagrams instead of plain-text reprs.
set_config(display='diagram')

Prepare dataset

In [None]:
# Select every Tweet row whose company_id is 1.
# NOTE(review): id 1 is presumably GTBank, given the later '@gtbank' filter -- confirm.
query = select(Tweet).where(Tweet.company_id == 1)

In [None]:
# Print the statement to inspect the SQL that SQLAlchemy renders for it.
print(query)

In [None]:
# Run the query inside a short-lived session and load every matching Tweet object into `result`.
with Session() as session:
    result = session.scalars(query).all()

In [None]:
# Number of Tweet objects returned by the query.
len(result)

In [None]:
# Inspect the `date` attribute of the first tweet (it is converted to UTC further down).
result[0].date

In [None]:
# define emoji removal helper function
def remove_emojis(text):
    """Return `text` with every emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace="")
    return without_emojis

def decode_emojis(text):
    """Return `text` with each emoji swapped for its textual name.

    Empty delimiters are passed to `emoji.demojize`, so the names are not
    wrapped in the usual surrounding colons.
    """
    no_delimiters = ("", "")
    return emoji.demojize(text, delimiters=no_delimiters)


In [None]:
# Pull id, text and date out of the ORM objects into three parallel lists.
tweet_id, tweet_text, tweet_date = [], [], []
for tweet in result:
    tweet_id.append(tweet.id)
    tweet_text.append(tweet.text)
    # Express every timestamp in UTC.
    # NOTE(review): astimezone() treats a naive datetime as local time -- confirm dates are tz-aware.
    tweet_date.append(tweet.date.astimezone(timezone.utc))

In [None]:
# Assemble the three parallel lists into a single DataFrame, one row per tweet.
tweet_columns = {
    'id': tweet_id,
    'text': tweet_text,
    'date': tweet_date,
}
tweets_df = pd.DataFrame(tweet_columns)

In [None]:
# Preview the raw tweets before any cleaning.
tweets_df.head()

In [None]:
# Strip emojis from every tweet; this overwrites the 'text' column of tweets_df.
tweets_df['text'] = tweets_df['text'].apply(remove_emojis)

In [None]:
# Check column dtypes and non-null counts after emoji removal.
tweets_df.info()

In [None]:
# Preview the rows again now that emojis have been removed from 'text'.
tweets_df.head()

In [None]:
# Keep only tweets that mention the '@gtbank' handle (case-insensitive).
# na=False counts a missing text as "no match" instead of producing a NaN mask that .loc rejects.
mentions_handle = tweets_df['text'].str.contains('@gtbank', case=False, na=False)
# .copy() makes filtered_df independent of tweets_df, so adding columns to it later
# does not trigger SettingWithCopyWarning.
filtered_df = tweets_df.loc[mentions_handle].copy()

In [None]:
# Preview the tweets that mention the handle.
filtered_df.head()

In [None]:
# start preprocessing and pipeline creation
class TextPreprocessor(TransformerMixin, BaseEstimator):
    """Stateless transformer that tokenizes and lemmatizes raw documents.

    Inherits from BaseEstimator (listed after the mixin, as scikit-learn
    recommends) so the step exposes get_params/set_params and the estimator
    tags that Pipeline, clone and the diagram repr rely on.
    """

    def __init__(self):
        # WordNet-based lemmatizer shared by every transform() call.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so the step can be chained."""
        return self

    def transform(self, X):
        """Tokenize and lemmatize each document.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One string per input document, made of its lemmatized tokens
            joined by single spaces.
        """
        processed_texts = []
        for text in X:
            tokens = nltk.word_tokenize(text)
            lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
            processed_texts.append(' '.join(lemmatized_tokens))
        return processed_texts

In [None]:
# Baseline pipeline: custom tokenize + lemmatize step, followed by bag-of-words counts
# with scikit-learn's built-in English stop-word list.
preprocessing_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),  # Custom preprocessing
    ('vect', CountVectorizer(stop_words='english'))])

In [None]:
# Fit the baseline pipeline on the filtered tweets and keep the resulting document-term matrix.
processed_tweets = preprocessing_pipeline.fit_transform(filtered_df['text'])

In [None]:
# Check which container type the pipeline returned.
print(type(processed_tweets))

In [None]:
# Shape of the first document's row once converted to a dense matrix.
processed_tweets[0].todense().shape

In [None]:
# Raw text of the first filtered tweet, for comparison with its vectorized row.
filtered_df['text'].iloc[0]

In [None]:
# Load spaCy's small English pipeline once; normalize_text reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def normalize_text(documents,
                   min_token_len=1,
                   irrelevant_pos=('ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE')):
    """
    Clean, filter and lemmatize each document with the module-level spaCy `nlp`.

    For every document: tokens containing '@' are removed, whitespace is
    collapsed, '*' and '~' characters are dropped, and the remaining text is
    run through spaCy. A token's lemma is kept only if the token is not a
    stop word, is purely alphabetic, is longer than `min_token_len`
    characters and its POS tag is not listed in `irrelevant_pos`.

    Keyword arguments:
    documents -- (iterable[str]) the documents to be preprocessed
    min_token_len -- (int) tokens must be strictly longer than this to be kept
    irrelevant_pos -- (collection[str]) spaCy POS tags to discard; the default
                      is an immutable tuple so it cannot be altered between calls

    Returns: np.array[str] with one space-joined string of lemmas per document
    """
    normalized_documents = []

    for text in documents:
        # Remove every whitespace-delimited token that contains '@'.
        # This drops e-mail addresses and also @mentions such as '@gtbank'.
        text = re.sub(r'\S*@\S*\s?', '', text)

        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)

        # Remove distracting characters ('*' and '~')
        text = re.sub(r'''[\*\~]+''', "", text)

        doc = nlp(text)  # convert text into a spaCy Doc

        lemmas = [
            token.lemma_
            for token in doc
            if (not token.is_stop                       # not a stop word
                and token.is_alpha                      # alphabetic characters only
                and len(token) > min_token_len          # strictly longer than the threshold
                and token.pos_ not in irrelevant_pos)   # POS tag is not excluded
        ]

        # Merge the kept lemmas back into one string for this document
        normalized_documents.append(' '.join(lemmas))

    # Return a numpy array, one entry per input document
    return np.array(normalized_documents)

# Wrap normalize_text in a FunctionTransformer so that it can be used as a Pipeline step.
# No kw_args are given, so normalize_text runs with its default min_token_len and irrelevant_pos.
normalizer = FunctionTransformer(normalize_text)

In [None]:
# Use the first filtered tweet as a quick sanity-check input for the normalizer.
test_str = filtered_df['text'].iloc[0]

In [None]:
# Compare the raw tweet with its normalized form; the string is wrapped in a list
# because normalize_text iterates over a collection of documents.
print(f'Prior to normalization:\n{test_str}')
print(f'After normalization:\n{normalizer.transform([test_str,])}')

In [None]:
#grouped_df.info()
# Number of tweets left after the '@gtbank' filter.
filtered_df.shape[0]

In [None]:
# # Create weekly bins and group by these bins
# filtered_df['datetime'] = pd.to_datetime(filtered_df['date'])
# filtered_df['weekly_bins'] = filtered_df['date'].dt.to_period('W')
# grouped_df = filtered_df.groupby('weekly_bins')['text'].agg(' '.join).reset_index()

In [None]:
# scikit-learn's built-in English stop words, materialised as a list
builtin_stopwords = [*ENGLISH_STOP_WORDS]
# Extra corpus-specific stop words chosen for this dataset
custom_stopwords = [
    'una', 'dey', 'come', 'dm', 'pls', 'guy',
    'hi', 'try', 'hello', 'god', 'gtb', 'gtbank', 'nigeria', 'till', 'gt',
    'send', 'month', 'week', 'day', 'february', 'don', 'useless', 'want',
    'people', 'know', 'abeg',
]
# Built-in words first, then the custom additions
all_stopwords = [*builtin_stopwords, *custom_stopwords]

In [None]:
# Number of custom stop words added on top of the built-in list.
len(custom_stopwords)

In [None]:
n_features = 5000

# Keep at most the 5000 most frequent n-grams (unigrams up to trigrams) that appear in
# at least 50 documents (min_df=50) and in no more than 95% of documents (max_df=0.95).
# Raw term counts are kept because binary=False is the CountVectorizer default.
vectorizer = CountVectorizer(min_df=50, max_df=0.95, 
                             max_features=n_features,
                             ngram_range=(1, 3),
                             stop_words=all_stopwords)

In [None]:
# Text preparation pipeline: raw text -> normalized text -> document-term count matrix.
preprocessor = Pipeline([('normalizer', normalizer),
                         ('vectorizer', vectorizer)])

In [None]:
n_topics = 15

# scikit-learn defaults both priors to 1 / n_components; they are overridden here:
#   alpha = doc_topic_prior  = 0.01
#   eta   = topic_word_prior = 0.91
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=10,
                                doc_topic_prior = 0.01,
                                topic_word_prior = 0.91,
                                learning_method='batch',
                                random_state=27)

# Full model: text preparation followed by the LDA topic model.
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('model', lda)])

In [None]:
# Fit the normalizer, vectorizer and LDA model end-to-end on the filtered tweets.
pipeline.fit(filtered_df['text'])
print("done")

In [None]:
# Look at the first two tweets the model was fitted on.
filtered_df['text'].iloc[0:2]

In [None]:
# Print the dominant topic (index of the largest topic weight) for each of the first 10 tweets.
# A plain loop replaces the side-effect list comprehension, which also displayed a list of None.
doc_topic_sample = pipeline.transform(filtered_df['text'].iloc[0:10])
for dominant_topic in doc_topic_sample.argmax(axis=1):
    print(dominant_topic)

In [None]:
# Number of topics the sklearn LDA model was configured with.
n_topics

In [None]:
# Plain-text summary of the pipeline steps and their parameters.
print(pipeline)

In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the highest-weighted terms per topic.

    The subplot grid is sized from the number of topics (5 panels per row), so
    models with more than 15 topics no longer run past the end of the axes
    array. A 15-topic model still gets the original 3x5 grid and figure size.

    Parameters
    ----------
    model : fitted topic model exposing `components_`, one row of term weights per topic
    feature_names : sequence of str, indexable by column position in `components_`
    n_top_words : int, number of terms shown per topic
    title : str, figure title
    """
    n_cols = 5
    n_model_topics = len(model.components_)
    # Ceiling division; keep at least one row so an empty model still yields a figure.
    n_rows = max(1, -(-n_model_topics // n_cols))

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 5 * n_rows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()  # strongest term at the top
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # Hide panels that have no topic to show (last row of a partially filled grid)
    for spare_ax in axes[n_model_topics:]:
        spare_ax.set_visible(False)

    # Set once, after the loop, rather than on every iteration
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

# Pipeline fits its steps in place, so `lda` and `vectorizer` here are the fitted objects
# from `pipeline`; show the 10 highest-weighted terms of each topic.
plot_top_words(lda, vectorizer.get_feature_names_out(), 10, 'Topics in LDA model')

In [None]:
# Re-create the document-term matrix with the already-fitted preprocessor (needed by pyLDAvis).
data_vectorized = preprocessor.transform(filtered_df['text'])
print("done")

In [None]:
# Build the interactive pyLDAvis view for the sklearn LDA model.
# sort_topics=False is passed so topics are not re-ordered by pyLDAvis.
vis = pyLDAvis.lda_model.prepare(lda, data_vectorized, vectorizer, mds='pcoa',sort_topics=False)
vis

In [None]:
# Normalize the tweets on their own (no vectorizer) to obtain cleaned text.
normalizer_pipeline = Pipeline([('normalizer', normalizer)])
data_normalized = normalizer_pipeline.fit_transform(filtered_df['text'])
# Lower-cased token lists; these are passed as `texts` to gensim's CoherenceModel below.
tokenized_docs = [word_tokenize(doc.lower()) for doc in data_normalized]

In [None]:
n_features = 5000

# Keep at most the 5000 most frequent tokens that appear in at least 2 documents (min_df=2)
# and in no more than 75% of documents (max_df=0.75).
# Raw term counts are kept because binary=False is the CountVectorizer default.
# NOTE: this rebinds `vectorizer` and `preprocessor`; cells above that use those names
# will pick up these new, differently configured objects if they are re-run afterwards.
vectorizer = CountVectorizer(min_df=2, max_df=0.75, max_features=n_features)

preprocessor = Pipeline([('normalizer', normalizer),
                         ('vectorizer', vectorizer)])

In [None]:
# Fit the sklearn `preprocessor` (normalizer + CountVectorizer) to get the document-term matrix
processed_corpus = preprocessor.fit_transform(filtered_df['text'])

# Wrap the matrix as a Gensim corpus; documents are rows, hence documents_columns=False
gensim_corpus = Sparse2Corpus(processed_corpus, documents_columns=False)

# Invert the vectorizer's token -> column mapping so Gensim can look tokens up by id
fitted_vocabulary = preprocessor.named_steps['vectorizer'].vocabulary_
id_to_token = {token_id: token for token, token_id in fitted_vocabulary.items()}

# Create a Gensim dictionary from the corpus and that id -> token mapping
gensim_dictionary = Dictionary.from_corpus(gensim_corpus, id2word=id_to_token)



In [None]:
# Train a Gensim LDA model with 10 topics on the converted corpus and dictionary.
# alpha='asymmetric' and eta=0.91 set the document-topic and topic-word priors;
# random_state is fixed so the run is repeatable.
lda_model = LdaModel(corpus=gensim_corpus, 
                     id2word=gensim_dictionary,
                     random_state=100,
                     alpha = 'asymmetric',
                     eta = 0.91,
                     num_topics=10)

In [None]:
# Create the CoherenceModel using the LDA model, the tokenized documents, and the dictionary.
# The 'c_v' measure is computed from `texts` (token lists), which is why tokenized_docs is passed.
coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=gensim_dictionary, coherence='c_v')

# Get the coherence score
coherence_score = coherence_model.get_coherence()

# Print the coherence score
print('Coherence Score:', coherence_score)

In [None]:
# plot topics

def plot_top_words_gensim(lda_model, gensim_dictionary, n_top_words, title):
    """Draw one horizontal bar chart of the highest-weighted terms per Gensim LDA topic.

    The subplot grid is sized from the number of topics (5 panels per row)
    instead of a fixed 3x5 layout. Models with more than 15 topics no longer
    raise IndexError, and smaller models no longer leave empty panels.

    Parameters
    ----------
    lda_model : model whose `get_topics()` returns one row of term weights per topic
    gensim_dictionary : mapping from term id to term, indexed with `[term_id]`
    n_top_words : int, number of terms shown per topic
    title : str, figure title
    """
    topic_term = lda_model.get_topics()
    n_model_topics = len(topic_term)
    n_cols = 5
    # Ceiling division; keep at least one row so an empty model still yields a figure.
    n_rows = max(1, -(-n_model_topics // n_cols))

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 5 * n_rows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(topic_term):
        # Indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [gensim_dictionary[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}', fontdict={'fontsize': 30})
        ax.invert_yaxis()  # strongest term at the top
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # Hide panels that have no topic to show (last row of a partially filled grid)
    for spare_ax in axes[n_model_topics:]:
        spare_ax.set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    fig.suptitle(title, fontsize=40)
    plt.show()
    
# Plot the 10 highest-weighted words of each topic in the 10-topic Gensim model
plot_top_words_gensim(lda_model, gensim_dictionary, 10, 'Topics in LDA model')

In [None]:
# Build the interactive pyLDAvis view for the Gensim model, using t-SNE (mds='tsne') for the topic map.
LDAvis_prepared = gensimvis.prepare(lda_model, gensim_corpus, gensim_dictionary,mds='tsne')
LDAvis_prepared

In [None]:
# Sensitivity analysis with respect to topics
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained per topic count in range(start, limit, step),
    always on the `corpus` and `dictionary` passed in. Earlier the function
    silently used the notebook globals instead of its own arguments.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of tokenized input texts, used by the c_v coherence measure
    start : Minimum number of topics to test
    limit : Upper bound on the number of topics (exclusive)
    step : Step size for the number of topics

    Returns:
    -------
    model_list : List of LDA topic models, in order of increasing topic count
    coherence_values : Coherence values corresponding to the LDA models with respective number of topics
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        # Same fixed priors and seed for every model so only num_topics varies
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=100,
                         alpha=0.01,
                         eta=0.91,
                         num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

def plot_coherence_sensitivity(start, limit, step, coherence_values):
    """
    Line plot of coherence score versus number of topics.

    Parameters:
    ----------
    start : Minimum number of topics that was tested
    limit : Upper bound on the number of topics (exclusive)
    step : Step size for the number of topics
    coherence_values : One coherence value per topic count in range(start, limit, step)
    """
    topic_counts = range(start, limit, step)
    plt.plot(topic_counts, coherence_values)

    # Axis labels and title, applied in a fixed order
    labelled = (
        (plt.xlabel, "Number of Topics"),
        (plt.ylabel, "Coherence score"),
        (plt.title, "Coherence Score vs Number of Topics"),
    )
    for apply_label, label_text in labelled:
        apply_label(label_text)

    # One tick per tested topic count
    plt.xticks(topic_counts)
    plt.show()

# Inputs: gensim_dictionary, gensim_corpus and tokenized_docs (passed as the `texts` argument).

# Parameters for the sensitivity analysis: topic counts 1 to 10 (`limit` is exclusive)
start = 1
limit = 11
step = 1

# Run the coherence value computation (trains one LDA model per topic count)
model_list, coherence_values = compute_coherence_values(dictionary=gensim_dictionary, corpus=gensim_corpus, texts=tokenized_docs, start=start, limit=limit, step=step)

# Plot the coherence score sensitivity
plot_coherence_sensitivity(start, limit, step, coherence_values)

In [None]:
# model_list[i] was trained with start + i * step topics, so with start=1 and step=1
# index 4 is the 5-topic model.
selected_model = model_list[4]

In [None]:
# Plot the 10 highest-weighted words of each topic in the selected model
plot_top_words_gensim(selected_model, gensim_dictionary, 10, 'Topics in LDA model')

In [None]:
# Interactive pyLDAvis view of the selected model, using t-SNE (mds='tsne') for the topic map.
LDAvis_evaluate = gensimvis.prepare(selected_model, gensim_corpus, gensim_dictionary,mds='tsne')
LDAvis_evaluate

In [None]:
def explore_alpha_eta(dictionary, corpus, texts, num_topics, alpha_values, eta_values):
    """
    Explore LDA models with different values of alpha and eta while fixing the number of topics.

    Every (alpha, eta) combination is trained and scored with c_v coherence.
    The first combination evaluated always becomes the initial best, so a real
    model and parameter pair is returned even when no score is strictly
    positive; a NaN best score is replaced by the first non-NaN score.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of tokenized input texts, used by the c_v coherence measure
    num_topics : Fixed number of topics
    alpha_values : List of alpha values to explore
    eta_values : List of eta values to explore

    Returns:
    -------
    results : Dictionary with keys as tuples of (alpha, eta) and values as lists containing the model and coherence value
    best_model : The model with the highest coherence score (None only if no combination was evaluated)
    best_params : The (alpha, eta) tuple of best_model ((None, None) only if no combination was evaluated)
    """
    results = {}
    best_coherence = float('-inf')
    best_model = None
    best_params = (None, None)

    for alpha in alpha_values:
        for eta in eta_values:
            # Train LDA model (fixed seed so combinations are comparable)
            model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             alpha=alpha,
                             eta=eta,
                             random_state=100)

            # Compute coherence score
            coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_score = coherence_model.get_coherence()

            # Update results
            results[(alpha, eta)] = [model, coherence_score]

            # Update best model: the first candidate always wins, later ones must be
            # strictly better, and any real score beats a NaN placeholder.
            is_first = best_model is None
            replaces_nan = np.isnan(best_coherence) and not np.isnan(coherence_score)
            if is_first or replaces_nan or coherence_score > best_coherence:
                best_coherence = coherence_score
                best_model = model
                best_params = (alpha, eta)

    return results, best_model, best_params

# Grid to explore: ten numeric values 0.01, 0.11, ..., 0.91 for each prior, plus the named
# presets gensim accepts. That is 12 alphas x 11 etas = 132 models to train.
alpha_values = list(np.arange(0.01,1,0.1))
alpha_values.extend(['symmetric', 'asymmetric'])

eta_values = list(np.arange(0.01,1,0.1))
eta_values.extend(['symmetric'])
# NOTE(review): selected_model above is the 5-topic model, yet 10 topics are used here --
# confirm which number of topics is intended.
num_topics = 10  # Set this to the best number of topics found from your previous sensitivity analysis

# Run the function
results, best_model, best_params = explore_alpha_eta(gensim_dictionary, gensim_corpus, tokenized_docs, num_topics, alpha_values, eta_values)

# Display the best model (results[best_params] is [model, coherence_score])
print(f"Best Model's Coherence Score: {results[best_params][1]}")
print(f"Best Model's Alpha: {best_params[0]}")
print(f"Best Model's Eta: {best_params[1]}")

In [None]:
# Inspect the alpha stored on the best model.
best_model.alpha

In [None]:
# Initialize BERTopic model (topics are reduced to 10 after clustering)
# NOTE(review): no random seed is set here, so topics may differ between runs -- confirm
# whether reproducibility matters for this comparison.
topic_model = BERTopic(nr_topics=10)

# Fit the model to the normalized tweets
topics, probabilities = topic_model.fit_transform(data_normalized)

# Explore the topics: print the first rows of the topic summary table.
# Iterating the DataFrame directly yields its column labels, so rows are
# walked explicitly with itertuples().
for topic_row in topic_model.get_topic_info().head().itertuples(index=False):
    print(topic_row)

In [None]:
# Check the container type of the per-document topic assignments.
type(topics)

In [None]:
# Number of topic assignments returned by fit_transform.
len(topics)


In [None]:
# Full topic summary table from BERTopic.
topic_model.get_topic_info()