## 5. Topic Modeling with Gensim LdaModel

### Import packages

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from nltk.tokenize import RegexpTokenizer
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import Phrases

### Create a list of stopwords

In [None]:
# Import the NLTK corpus under an alias so that the `stopwords` set built
# below does not shadow the module. Without the alias, re-running this cell
# fails because `stopwords` is already a set and has no `.words` method.
from nltk.corpus import stopwords as nltk_stopwords

# Extra filler words to drop in addition to the two library lists.
custom_stopwords = {'tell','come','lot','took','youre','thats','got','said','im','maybe','mr','he','oh','today','let','amp', 'need', 'know', 'going', 'think', 'want', 'year', 'day', 'time', 'dont', 'thing'}

# Final stopword set: gensim's STOPWORDS + NLTK's English list + the custom words.
stopwords = STOPWORDS.union(nltk_stopwords.words('english'), custom_stopwords)

### LDA modeling and visualization

In [None]:
# For every keyword: load its tweets, preprocess the text, train an LDA topic
# model and save an interactive pyLDAvis page for it.

# Only tweets dated strictly after this day are kept.
startdate = pd.to_datetime('2022-02-24').date()
keywords = ['ukraine', 'russia', 'eu', 'zelenskyy', 'biden', 'putin', 'johnson', 'nato', 'scholz', 'macron']

# Maximum number of tweets sampled per keyword.
sample_size = 10000

# LDA training parameters (identical for every keyword).
num_topics = 5
chunksize = 100
passes = 10
iterations = 50
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# The tokenizer and lemmatizer do not depend on the keyword, so build them once.
from nltk.stem.wordnet import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

for keyword in tqdm(keywords):
    df = pd.read_csv(f'../data/tweets_en/tweets_{keyword}_en.csv',dtype={'date':'str'}, parse_dates = ['date'], lineterminator='\n', encoding='latin-1')
    df['date'] = pd.DatetimeIndex(df['date']).date
    df = df[df['date'] > startdate]

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the sample at the number of available tweets. The fixed seed
    # (same value as the LDA model's) makes the sample reproducible.
    df = df.sample(n=min(sample_size, len(df)), random_state=321)

    # Lowercase each tweet and split it into word tokens.
    docs = [tokenizer.tokenize(text.lower()) for text in df['text'].tolist()]

    # Drop purely numeric tokens (words that merely contain digits are kept),
    # single-character tokens and stopwords, then lemmatize what is left.
    docs = [
        [
            lemmatizer.lemmatize(token)
            for token in doc
            if not token.isnumeric() and len(token) > 1 and token not in stopwords
        ]
        for doc in docs
    ]

    # Add bigrams to the docs (only pairs that appear 5 times or more).
    bigram = Phrases(docs, min_count=5)
    for doc in docs:
        # Build the list of bigram tokens first, then append it, so the
        # document is not modified while it is being read. Tokens containing
        # '_' are treated as bigrams.
        doc.extend([token for token in bigram[doc] if '_' in token])

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Remove rare and common tokens: keep only words that occur in at least
    # 20 documents and in no more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Make an index-to-word dictionary. The item access is only there to
    # "load" the dictionary so that id2token is populated.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train the LDA model.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=321
    )

    # Render the interactive topic visualization and save it as HTML.
    vis = gensimvis.prepare(model, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(vis, f'../shinyapp/www/{keyword}.html')