In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re #regular expression
import spacy
from matplotlib import pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis, pyLDAvis.lda_model

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
from gensim.models.coherencemodel import CoherenceModel

from bertopic import BERTopic

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline as hf_pipeline

import joblib

from custom_package.text_processing import normalize_text, tokenizer_func, remove_emojis
from custom_package.modeling import GensimLdaTransformer, get_topic_assignment
from custom_package.modeling import topic_mapping_sk_lda, topic_mapping_gensim_lda
from custom_package.database import get_raw_tweets, store_processed_tweets,get_training_raw_tweets






In [None]:
# nltk.download('punkt')
# nltk.download('wordnet')


In [None]:
pyLDAvis.enable_notebook()

In [None]:
set_config(display='diagram')

Prepare dataset

In [None]:
# Value handed to get_filtered_tweets() below, which forwards it to
# get_training_raw_tweets(); presumably the maximum number of raw tweets to
# pull for training -- confirm against custom_package.database.
query_limit = 484000


In [None]:
def get_filtered_tweets(query_limit = 100):
    """
    Fetch raw training tweets and return them as a DataFrame with emojis removed.

    Keyword arguments:
    query_limit -- forwarded unchanged to get_training_raw_tweets (presumably the
                   maximum number of tweets to fetch; confirm against
                   custom_package.database)

    Returns: pd.DataFrame with columns 'id', 'text', 'company_id' and 'date',
             one row per fetched tweet; 'text' has been passed through remove_emojis
    """
    # Materialize once: the result is walked once per column below, and a
    # one-shot iterable (generator / DB cursor) would be exhausted after the
    # first column, leaving the remaining columns empty.
    raw_tweets = list(get_training_raw_tweets(query_limit))
    data = {'id' : [tweet.id for tweet in raw_tweets],
        'text' : [remove_emojis(tweet.text) for tweet in raw_tweets],
        'company_id' : [tweet.company_id for tweet in raw_tweets],
        'date' : [tweet.date for tweet in raw_tweets]
        }
    return pd.DataFrame(data)

In [None]:
# get raw tweets for training
filtered_df = get_filtered_tweets(query_limit)

In [None]:
filtered_df.head()

In [None]:
filtered_df.info()

In [None]:
filtered_df['company_id'].value_counts()

In [None]:
nlp = spacy.load("en_core_web_sm")

def normalize_text(documents,
                   min_token_len=1,
                   irrelevant_pos=('ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE')):
    """
    Clean and lemmatize each document with spaCy and return one normalized
    string per input document.

    Relies on the module-level spaCy pipeline `nlp`.
    NOTE: this definition shadows the normalize_text imported from
    custom_package.text_processing at the top of the notebook.

    Keyword arguments:
    documents -- (iterable[str]) the documents to be preprocessed
    min_token_len -- (int) a token is kept only if it is strictly longer than this
    irrelevant_pos -- (sequence[str]) coarse POS tags (token.pos_) to discard

    Returns: np.array[str] the normalized documents, in input order
    """
    def _strip_noise(text):
        # Drop every whitespace-delimited chunk containing '@'. This removes
        # e-mail addresses and, as a consequence, @mentions as well.
        text = re.sub(r'\S*@\S*\s?', '', text)

        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)

        # Remove distracting characters (asterisks and tildes)
        return re.sub(r'''[\*\~]+''', "", text)

    normalized_documents = []

    # Stream the cleaned texts through nlp.pipe so spaCy can batch them,
    # instead of invoking the pipeline once per document.
    for doc in nlp.pipe(_strip_noise(text) for text in documents):
        lemmas = [token.lemma_                      # keep the lemma of the word
                  for token in doc
                  if (not token.is_stop             # not a stopword
                      and token.is_alpha            # alphabetic characters only
                      and len(token) > min_token_len    # longer than the minimum length
                      and token.pos_ not in irrelevant_pos)]    # POS tag is not excluded

        # Merge the surviving lemmas back into a single string
        normalized_documents.append(' '.join(lemmas))

    # Convert the list of normalized documents into a numpy array
    return np.array(normalized_documents)

# Create a Transformer from the function so that we can use it in a Pipeline
normalizer = FunctionTransformer(normalize_text)

In [None]:
# scikit-learn's built-in English stop words, as a list so it can be extended
builtin_stopwords = list(ENGLISH_STOP_WORDS)

# First batch of corpus-specific stop words
custom_stopwords = [
    'una', 'dey', 'come', 'dm', 'pls', 'guy',
    'hi', 'try', 'hello', 'god', 'gtb', 'gtbank', 'nigeria', 'till', 'gt',
    'send', 'month', 'week', 'day', 'february', 'don', 'useless', 'want',
    'people', 'know', 'abeg',
]

# Earlier, shorter variant of new_stopwords (kept for reference):
#new_stopwords = ['ment','uba','access','bad','beg','good','morning',
#                 'yesterday','zenith','firstbank','new','use','youfirst','year']

# Second batch of corpus-specific stop words
new_stopwords = [
    'customer', 'service', 'bank', 'ment',
    'uba', 'access', 'bad', 'beg', 'need', 'good', 'morning',
    'yesterday', 'zenith', 'firstbank', 'new', 'use',
    'youfirst', 'money', 'help', 'dear', 'ur', 'na', 'naira', 'think',
    'thank', 'person', 'tell', 'respond', 'like', 'wait', 'time', 'attend',
    'say', 'treat', 'today',
    'ooo', 'thing', 'life', 'happen', 'happy', 'africa', 'business', 'start',
    'win', 'way', 'year', 'hour', 'ask',
]

# Full stop-word list handed to the vectorizer: built-in words first, then both custom batches
all_stopwords = [*builtin_stopwords, *custom_stopwords, *new_stopwords]

In [None]:
len(all_stopwords)

In [None]:
# Vocabulary cap: at most 10,000 features (5000*2)
n_features = 5000*2

# Keep the n_features most frequent 1- to 3-grams that appear in at least
# 100 documents (min_df=100) and in at most 95% of documents (max_df=0.95),
# after removing everything in all_stopwords.
# binary is left at its default (False), so the matrix holds term counts.
vectorizer = CountVectorizer(min_df=100, max_df=0.95, 
                             max_features=n_features,
                             ngram_range=(1, 3),
                             stop_words=all_stopwords)

In [None]:
# Text -> document-term matrix: normalize_text (wrapped as `normalizer`)
# followed by the CountVectorizer configured above.
preprocessor = Pipeline([('normalizer', normalizer),
                         ('vectorizer', vectorizer)])

In [None]:
# Number of topics the LDA model extracts
n_topics = 10

# Dirichlet priors. scikit-learn would default both to 1 / n_components
# (0.1 for 10 topics); they are set explicitly to 0.01 here instead:
#   alpha = doc_topic_prior  (prior over the topic mix of each document)
#   eta   = topic_word_prior (prior over the word mix of each topic)
# random_state is fixed so repeated fits give the same topics.
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=10,
                                doc_topic_prior = 0.01,
                                topic_word_prior = 0.01,
                                learning_method='batch',
                                random_state=27,
                               verbose = 1,
                               n_jobs=-1)

# End-to-end pipeline: raw text -> normalized text -> n-gram counts -> topic distribution
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('model', lda)])

In [None]:
pipeline.fit(filtered_df['text'])
print("done")

In [None]:
filtered_df['text'].iloc[0:2]

In [None]:
# Index of the highest-weighted topic for each of the first 10 tweets.
# argmax returns the first maximum on ties, like the max(range(...)) form it replaces.
pipeline.transform(filtered_df['text'].iloc[0:10]).argmax(axis=1)

In [None]:
n_topics

In [None]:
print(pipeline)

In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """
    Draw one horizontal bar chart per topic showing its highest-weighted features.

    Keyword arguments:
    model -- fitted topic model exposing components_ (one row of feature weights per topic)
    feature_names -- sequence mapping a column index of components_ to its term
    n_top_words -- (int) number of top features to plot per topic
    title -- (str) figure-level title

    Returns: None (the figure is left for the active matplotlib backend to display)
    """
    n_cols = 5
    n_panels = len(model.components_)
    # Size the grid from the model instead of assuming exactly 10 topics;
    # 10 topics still yields the original 2x5 grid on a 30x15 canvas.
    n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()  # strongest feature at the top
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic to show
    for ax in axes[n_panels:]:
        ax.set_visible(False)

    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

plot_top_words(lda, vectorizer.get_feature_names_out(), 10, 'Topics in LDA model')

In [None]:
data_vectorized = preprocessor.transform(filtered_df['text'])
print("done")

In [None]:
vis = pyLDAvis.lda_model.prepare(lda, data_vectorized, vectorizer, mds='pcoa',sort_topics=False)
vis

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def get_top_words(model, feature_names, n_top_words):
    """
    Collect the highest-weighted terms of every topic.

    Keyword arguments:
    model -- object exposing components_ (one row of term weights per topic)
    feature_names -- sequence mapping a column index of components_ to its term
    n_top_words -- (int) number of terms to keep per topic

    Returns: dict mapping 'Topic 1', 'Topic 2', ... to that topic's terms,
             ordered from highest to lowest weight
    """
    return {
        # argsort is ascending, so reverse it and keep the leading n_top_words
        f'Topic {position}': [feature_names[col]
                              for col in weights.argsort()[::-1][:n_top_words]]
        for position, weights in enumerate(model.components_, start=1)
    }

def create_word_clouds(top_words):
    """
    Show one word-cloud figure per topic.

    Keyword arguments:
    top_words -- dict mapping a topic label to its list of terms
                 (the structure returned by get_top_words)

    Returns: None; every figure is displayed via plt.show()

    NOTE(review): the terms are joined with spaces before being handed to
    WordCloud, so multi-word n-grams are presumably split back into single
    words and no topic weights are carried over -- confirm this is intended.
    """
    for label in top_words:
        cloud_text = ' '.join(top_words[label])
        cloud = WordCloud(width=800, height=400, background_color='white')
        cloud_image = cloud.generate(cloud_text)

        plt.figure(figsize=(10, 5))
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.title(f'Word Cloud for {label}')
        plt.axis('off')
        plt.show()

# Example usage
# lda is your trained LDA model
# feature_names are obtained from your vectorizer, e.g., vectorizer.get_feature_names_out()
top_words = get_top_words(lda, vectorizer.get_feature_names_out(), 20)



In [None]:
#joblib.dump(top_words,'top_words.joblib')

In [None]:
create_word_clouds(top_words)

In [None]:
#joblib.dump(pipeline,'full_lda_pipeline.joblib')