In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import json
import gensim
from gensim import corpora, models
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [None]:
def load_processed_text(file_name):
    '''
    read a processed-text file back into a list of token lists

    file_name: path of a utf-8 text file holding one document per line,
               tokens separated by whitespace
    returns:   list with one entry per line; each entry is the list of
               tokens on that line (an empty list for a blank line)
    '''
    with open(file_name, 'r', encoding='utf-8') as handle:
        # strip the newline / outer whitespace, then split on whitespace runs
        return [row.strip().split() for row in handle]

In [None]:
def generate_random_seeds(num_seeds=10):
    '''
    draw num_seeds integer seeds with np.random.randint

    num_seeds: how many seeds to draw (default 10)
    returns:   list of python ints, each in [0, 10000); values are drawn
               independently from numpy's global random state, so repeats
               are possible
    '''
    drawn = np.random.randint(0, 10000, num_seeds)
    return drawn.tolist()

In [16]:
def tune_lda_ind(texts, corpus, id2word, num_topics, seed, grid):
    '''
    tune lda for a single num_topics and a single seed using grid search

    one LdaModel is fitted for every (alpha, eta) pair in the grid and scored
    with c_v coherence over the top 50 words of each of its topics; the pair
    with the highest score wins

    texts:      tokenised documents passed to CoherenceModel
    corpus:     corpus passed to LdaModel
    id2word:    dictionary passed to both LdaModel and CoherenceModel
    num_topics: number of topics to fit
    seed:       random_state given to every LdaModel
    grid:       dict with the keys 'alpha' and 'eta', each a sequence of
                candidate values
    returns:    (best_model, best_score, best_alpha, best_eta); when the grid
                is empty this is (None, -inf, None, None)
    '''
    best_model = None
    best_score = float('-inf')
    best_alpha = None
    best_eta = None

    total_iterations = len(grid['alpha']) * len(grid['eta'])

    # the context manager closes the bar even when a fit raises
    with tqdm(total=total_iterations, desc="Tuning LDA") as progress_bar:
        for alpha in grid['alpha']:
            for eta in grid['eta']:
                # num_topics comes from the caller (it used to be fixed at 20)
                lda_model = gensim.models.ldamodel.LdaModel(
                                    corpus=corpus,
                                    id2word=id2word,
                                    num_topics=num_topics,
                                    random_state=seed,
                                    chunksize=1000,
                                    passes=20,
                                    iterations=100,
                                    update_every=1,
                                    alpha=alpha,
                                    eta=eta,
                                    eval_every=None)

                # Evaluate model (using coherence) over ALL fitted topics,
                # not just the first 10
                top_words = [[word for word, _ in lda_model.show_topic(i, topn=50)]
                             for i in range(num_topics)]
                coherence_model = models.CoherenceModel(topics=top_words,
                                                        texts=texts,
                                                        dictionary=id2word,
                                                        coherence='c_v')
                score = coherence_model.get_coherence()

                # Update best model if the current model is strictly better
                if score > best_score:
                    best_score = score
                    best_model = lda_model
                    best_alpha = alpha
                    best_eta = eta

                progress_bar.update(1)

    return best_model, best_score, best_alpha, best_eta

In [None]:
def prepare_gensim_tests(seed, lda_model):
    '''
    generate word-intrusion tests for a lda model

    for every topic: 5 of its top 10 words are sampled, and one "intruder"
    word is taken from another, randomly chosen topic such that the intruder
    is not among the current topic's top 50 words

    every random draw uses its own np.random.RandomState(seed), which yields
    the same values as calling np.random.seed(seed) right before the draw but
    leaves numpy's global random state untouched

    seed:      seed for every draw
    lda_model: fitted model exposing num_topics and show_topic(t, topn=...)
    returns:   dict keyed by topic index; each value is a dict with
               'top 5 sample' (list of 5 words) and 'intruder' (a word, or
               None when no other topic offers a word outside the top 50)
    raises:    ValueError (from numpy) when a topic has fewer than 5 words
    '''
    # get the number of topics
    num_topics = lda_model.num_topics

    # Get the top 50 words for each topic - list of lists
    topic_words = [[word for word, _ in lda_model.show_topic(t, topn=50)]
                   for t in range(num_topics)]

    intrusion_tests = {}

    # Create Intrusion Tests
    for i, words in enumerate(topic_words):
        # Randomly sample 5 words from the top 10 words of the topic
        sample_words = np.random.RandomState(seed).choice(
            words[:10], 5, replace=False).tolist()

        # Prepare list of other topics to explore for intruder word
        other_topics = [t for t in range(num_topics) if t != i]

        # Ensure the intruder word is not in the top 50 words of the current topic
        current_words = set(words)
        intruder_word = None
        while other_topics and intruder_word is None:
            # re-seeded on every attempt, matching the original draw sequence
            intruder_topic = np.random.RandomState(seed).choice(other_topics)
            other_topics.remove(intruder_topic)  # Remove the explored topic

            # first word of the candidate topic that the current topic lacks
            intruder_word = next(
                (word for word in topic_words[intruder_topic]
                 if word not in current_words),
                None)

        # Save the sample and the intruder into a dictionary
        intrusion_tests[i] = {'top 5 sample': sample_words,
                              'intruder': intruder_word}

    return intrusion_tests

In [None]:
def tune_lda_grid(texts, corpus, id2word, num_topics, seeds, grid):
    '''
    grid-search lda over every combination of topic count and seed

    for each (k, seed) pair an LdaModel is fitted for every (alpha, eta) in
    the grid and scored with c_v coherence over the top 50 words of all k
    topics; the highest scoring model is kept and intrusion tests are built
    from it with prepare_gensim_tests

    texts:      tokenised documents passed to CoherenceModel
    corpus:     corpus passed to LdaModel
    id2word:    dictionary passed to both LdaModel and CoherenceModel
    num_topics: sequence of topic counts to try
    seeds:      sequence of random seeds to try
    grid:       dict with the keys 'alpha' and 'eta', each a sequence of
                candidate values
    returns:    DataFrame with one row per (k, seed) and the columns
                num_topics, seed, score, alpha, eta, tests
    '''
    n_fits = len(num_topics) * len(seeds) * len(grid['alpha']) * len(grid['eta'])
    bar = tqdm(total=n_fits, desc="Total progress")

    rows = []
    for k in num_topics:
        for seed in seeds:
            # winner of the alpha/eta search for this (k, seed) pair
            top_model, top_score, top_alpha, top_eta = None, float('-inf'), None, None

            for alpha in grid['alpha']:
                for eta in grid['eta']:
                    candidate = models.ldamodel.LdaModel(
                        corpus=corpus,
                        id2word=id2word,
                        num_topics=k,
                        random_state=seed,
                        chunksize=1000,
                        passes=10,
                        iterations=50,
                        update_every=1,
                        alpha=alpha,
                        eta=eta,
                        eval_every=None)

                    # top 50 words of each of the k topics
                    words_per_topic = []
                    for topic_id in range(k):
                        terms = candidate.show_topic(topic_id, topn=50)
                        words_per_topic.append([term for term, _ in terms])

                    coherence = models.CoherenceModel(
                        topics=words_per_topic,
                        texts=texts,
                        dictionary=id2word,
                        coherence='c_v').get_coherence()

                    # strictly better only, so the first of tied models wins
                    if coherence > top_score:
                        top_model, top_score = candidate, coherence
                        top_alpha, top_eta = alpha, eta

                    bar.update(1)

            # make tests from the winning model
            winner_tests = prepare_gensim_tests(seed, top_model)

            rows.append({
                'num_topics': k,
                'seed': seed,
                'score': top_score,
                'alpha': top_alpha,
                'eta': top_eta,
                'tests': winner_tests})

    bar.close()

    return pd.DataFrame(rows)

In [None]:
def df_to_csv(df, file_path):
    '''
    write a dataframe to a local csv file, leaving the index column out

    df:        dataframe to save
    file_path: destination path of the csv file
    '''
    csv_options = {'index': False}
    df.to_csv(file_path, **csv_options)