In [None]:
import pandas as pd
import string
from octis.preprocessing.preprocessing import Preprocessing
from octis.models.LDA import LDA
import os

In [None]:
print("reading data...")

# every crawler export is expected in ./forum-crawler-data, relative to the working directory
data_folder_path = os.path.join(os.getcwd(), "forum-crawler-data")

# read one DataFrame per forum export; the names on the left are bound in the
# same order as the file names listed on the right
(
    data_say_hello,
    data_recently_diagnosed,
    data_memory_concerns,
    data_i_have_dementia,
    data_i_have_partner,
    data_i_care,
) = (
    pd.read_csv(os.path.join(data_folder_path, csv_name))
    for csv_name in (
        'Say hello and introduce yourself.csv',
        'Recently diagnosed and early stages of dementia.csv',
        'Memory concerns and seeking a diagnosis.csv',
        'I have dementia.csv',
        'I have a partner with dementia.csv',
        'I care for a person with dementia.csv',
    )
)

print("read data")

In [None]:
# combine data into single dataframe
dfs = [data_say_hello, data_recently_diagnosed, data_memory_concerns, data_i_have_dementia, data_i_have_partner, data_i_care]
forum_data_union = pd.concat(dfs, ignore_index=True)


SAMPLE_SIZE = 50
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

# cap the sample size at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement)
sample_data = forum_data_union.sample(
    n=min(SAMPLE_SIZE, len(forum_data_union)),
    random_state=SAMPLE_SEED,
)

print("sampled")

In [None]:
# Save the sample to a TSV file.
# The destination is resolved relative to the working directory (the same
# anchor used for the input CSVs) instead of a machine-specific absolute path,
# so the notebook runs on any checkout.
sample_output_dir = os.path.join(os.getcwd(), "dataset")
os.makedirs(sample_output_dir, exist_ok=True)  # create ./dataset on first run

sample_data.to_csv(path_or_buf=os.path.join(sample_output_dir, 'sample_data_final_fixed.tsv'), index=False, sep='\t')

In [None]:
# group posts from the same forum/thread into one document and remove any line breaks
# Posts are joined with a single space: joining with '' would fuse the last word
# of one post with the first word of the next one. Missing posts are dropped and
# values are cast to str so a NaN message cannot break the join.
data_union_grouped = forum_data_union.groupby(['forum', 'thread_title'], as_index=False).agg(
    {'post_message': lambda posts: ' '.join(posts.dropna().astype(str))}
)
data_union_grouped['post_message'] = data_union_grouped['post_message'].str.strip().replace(r'\n', ' ', regex=True)

In [None]:
# (rows, columns) of the grouped frame - one row per (forum, thread_title) group
data_union_grouped.shape

In [None]:
# length, in characters, of the longest aggregated 'document'
data_union_grouped['post_message'].str.len().max()

In [None]:
# brief look at the first 10 rows of the aggregated data
data_union_grouped.head(10)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install textblob

In [None]:
import pandas as pd
import spacy
from textblob import TextBlob  # or use autocorrect

# Load the spaCy pipeline named 'en_core_web_sm'; the resulting `nlp` object is
# called by clean_message() to tokenize each message
nlp = spacy.load('en_core_web_sm')

# Function to remove non-English words and correct typos
def clean_message(message):
    """
    Return `message` with non-'en' tokens dropped and typos corrected.

    The text is tokenized with the module-level spaCy pipeline `nlp`, tokens
    whose `lang_` attribute equals 'en' are kept and re-joined with single
    spaces, and the result is passed through TextBlob's `correct()`.

    NOTE(review): `token.lang_` appears to report the language of the loaded
    pipeline rather than a per-token detection, so with an English pipeline
    this filter may keep every token - confirm against the spaCy Token docs.

    Parameters
    ----------
    message : str
        Raw text of one document.

    Returns
    -------
    str
        The filtered, spell-corrected text.
    """
    english_tokens = []
    for tok in nlp(message):
        # keep only tokens tagged as English
        if tok.lang_ == 'en':
            english_tokens.append(tok.text)

    # re-assemble the text and let TextBlob fix typos
    return str(TextBlob(' '.join(english_tokens)).correct())


In [None]:
# sample up to 1000 documents from the data - makes for a more manageable dataset to work with
TRIMMED_SAMPLE_SIZE = 1000
TRIMMED_SAMPLE_SEED = 42  # fixed seed so the same documents are drawn on every run

# cap at the number of available documents: DataFrame.sample raises when asked
# for more rows than exist (sampling without replacement)
trimmed = data_union_grouped.sample(
    n=min(TRIMMED_SAMPLE_SIZE, len(data_union_grouped)),
    random_state=TRIMMED_SAMPLE_SEED,
)

In [None]:
# clean the sampled data by fixing typos and removing non-english words
# (clean_message runs the spaCy pipeline and TextBlob's correct() once per document)
trimmed['cleaned_message'] = trimmed['post_message'].apply(clean_message)

In [None]:
# save this cleaned data to a text file where documents are separated by a line break
# mode='w' replaces the file on every run; with mode='a' each re-run of this cell
# appended another copy of the sample to the corpus
trimmed['cleaned_message'].to_csv(r'./trimmed_cleaned.txt', header=None, index=None, sep='\n', mode='w')

In [None]:
# minor cleaning: strip surrounding whitespace and replace carriage returns with spaces
data_union_grouped['post_message'] = data_union_grouped['post_message'].str.strip().replace(r'\r', ' ', regex=True) # minor cleaning
# keep only the text column (a Series) for writing the corpus file
data_review_grouped = data_union_grouped['post_message'] # holds only the text from the dataset

In [None]:
# save posts to a txt file, documents separated by linebreaks
# mode='w' replaces the file on every run; with mode='a' each re-run of this cell
# appended the whole corpus again, duplicating every document
data_review_grouped.to_csv(r'./corpus_threads_combined.txt', header=None, index=None, sep='\n', mode='w')

In [None]:
# First pass at preprocessing the entire dataset

print("preprocessing...")

# preprocessing options set here: strip punctuation (string.punctuation),
# lemmatize, use the 'english' stop word list; min_chars=1 and min_words_docs=0
# leave the length filters at their most permissive values
preprocessor = Preprocessing(vocabulary=None, max_features=None,
                             remove_punctuation=True, punctuation=string.punctuation,
                             lemmatize=True, stopword_list='english',
                             min_chars=1, min_words_docs=0)

# NOTE(review): this reads ./corpus.txt, but the cells above only write
# ./corpus_threads_combined.txt and ./trimmed_cleaned.txt - confirm that
# corpus.txt exists and is the intended input for this pass
dataset = preprocessor.preprocess_dataset(documents_path=r'./corpus.txt')
print("done preprocessing")

In [None]:
# persist the first-pass preprocessed dataset under ./processed_dataset/
print("saving...")
dataset.save(path='./processed_dataset/')
print("done saving")

In [None]:
# Topic modeling with OCTIS LDA

# random_state pins the model's random initialisation so the extracted topics
# are reproducible across runs
model = LDA(num_topics=9, random_state=42)


print("training lda...")

# Train the model using default partitioning choice
output = model.train_model(dataset)

print("done training")

print(*list(output.keys()), sep="\n") # Print the output identifiers

# show the top words of the first five topics, one topic per line
for t in output['topics'][:5]:
  print(" ".join(t))

In [None]:
# print results again: top words of up to the first 20 topics, one topic per line

for t in output['topics'][:20]:
  print(" ".join(t))

In [None]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
# Initialize the coherence metric (measure 'c_npmi', topk=10) on the corpus of
# the first-pass dataset
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
# Initialize the topic diversity metric (topk=10)
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Score the most recent model output with both metrics and report the values
topic_diversity_score = topic_diversity.score(output)
print(f"Topic diversity: {topic_diversity_score!s}")

npmi_score = npmi.score(output)
print(f"Coherence: {npmi_score!s}")

In [None]:
# second preprocessor, changes in max doc freq and min doc freq
print("preprocessing...")

# same options as the first pass (strip punctuation, lemmatize, 'english' stop
# words) plus document-frequency bounds min_df=0.0001 / max_df=0.7 and
# num_processes=10
preprocessor2 = Preprocessing(vocabulary=None, max_features=None,
                             remove_punctuation=True, punctuation=string.punctuation,
                             lemmatize=True, stopword_list='english',
                             min_chars=1, min_words_docs=0, num_processes=10, min_df=0.0001, max_df=0.7)

In [None]:
# increase the max_length of the preprocessor's spaCy model
# (presumably so the very long thread-level documents are not rejected)
preprocessor2.spacy_model.max_length = 100000000

In [None]:
# second pass: preprocess the thread-level corpus written to ./corpus_threads_combined.txt
dataset2 = preprocessor2.preprocess_dataset(documents_path=r'./corpus_threads_combined.txt')
print("done preprocessing")

In [None]:
# persist the second-pass dataset; the original saved `dataset` (the first
# pass) here, so ./processed_dataset_2/ was a copy of ./processed_dataset/
print("saving...")
dataset2.save(path='./processed_dataset_2/')
print("done saving")

In [None]:
# Build the stop word list: spaCy's English STOP_WORDS followed by extra words
# that should also be dropped. The spaCy module is imported under an alias so
# the name `stop_words` only ever refers to the final list.
from spacy.lang.en import stop_words as spacy_stop_words

custom_stop_words = ['with', 'my', 'your', 'she', 'this', 'was', 'her', 'have', 'as', 'he', 'him', 'but', 'not', 'so', 'are', 'at', 'be', 'has', 'do', 'got', 'how', 'on', 'or', 'would', 'will', 'what', 'they', 'if', 'or', 'get', 'can', 'we', 'me', 'can', 'has', 'his', 'there', 'them', 'just', 'am', 'by', 'that', 'from', 'it', 'is', 'in', 'you', 'also', 'very', 'had', 'a', 'an', 'for']

# spaCy's words first, then the custom additions (duplicates are kept as-is)
stop_words = [*spacy_stop_words.STOP_WORDS, *custom_stop_words]

In [None]:
# third pass at preprocessing, changes in max df, min df, and using custom stop words
# `stop_words` is the combined list (spaCy's English stop words + the custom
# additions) built in the previous cell. Passing only `custom_stop_words` left
# that combined list unused, so spaCy's own stop words were never supplied.
preprocessor3 = Preprocessing(
    vocabulary=None,
    max_features=None,
    remove_punctuation=True,
    punctuation=string.punctuation,
    lemmatize=True,
    stopword_list=stop_words,
    min_chars=1,
    min_words_docs=20,
    num_processes=10,
    min_df=0.01,
    max_df=0.5,
)

In [None]:
# increase the max_length of the third preprocessor's spaCy model
# (presumably so long documents are not rejected)
preprocessor3.spacy_model.max_length = 100000000

In [None]:
# check max doc length (in characters) amongst the sampled, cleaned datapoints
trimmed['cleaned_message'].str.len().max()

In [None]:
# preprocess the sampled data (the cleaned sample written to ./trimmed_cleaned.txt)
# with the third preprocessor
dataset_trimmed_cleaned = preprocessor3.preprocess_dataset(documents_path=r'./trimmed_cleaned.txt')
print("done preprocessing")

In [None]:
# save the preprocessed sample under ./processed_dataset_trimmed/
print("saving...")
dataset_trimmed_cleaned.save(path='./processed_dataset_trimmed/')
print("done saving")

In [None]:
# NOTE THIS IS A CUSTOM LDA MODEL THAT BUILDS OFF THE OCTIS LDA - THIS ONE USES TF-IDF INSTEAD OF BAG OF WORDS

from octis.models.model import AbstractModel
import numpy as np
from gensim.models import ldamodel
import gensim.corpora as corpora
import octis.configuration.citations as citations
import octis.configuration.defaults as defaults
from gensim.models import TfidfModel


class LDA(AbstractModel):
    """
    LDA topic model backed by gensim and trained on a TF-IDF weighted corpus.

    Documents are converted to bag-of-words vectors with a gensim
    ``Dictionary``, re-weighted with a gensim ``TfidfModel`` and passed to
    ``gensim.models.ldamodel.LdaModel``. The same TF-IDF transformation is
    applied whenever documents are fed back into the trained model (the
    topic-document matrices and the test-set update/inference), so the model
    always sees the representation it was trained on.
    """

    # gensim Dictionary (token id -> word); built lazily by train_model
    id2word = None
    # bag-of-words vectors of the training documents; built lazily by train_model
    id_corpus = None
    # TfidfModel fitted on id_corpus; set by train_model
    tfidf_model = None
    # whether train_model uses the dataset's train/test partitions
    use_partitions = True
    # whether the trained model is additionally updated with the test documents
    update_with_test = False

    def __init__(
        self, num_topics=100, distributed=False, chunksize=2000,
        passes=1, update_every=1, alpha="symmetric", eta=None, decay=0.5,
        offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
            random_state=None):
        """
        Initialize LDA model

        All arguments are stored in ``self.hyperparameters`` and later
        forwarded as keyword arguments to gensim's ``LdaModel``.

        Parameters
        ----------
        num_topics (int, optional) - The number of requested latent topics to
        be extracted from the training corpus.

        distributed (bool, optional) - Whether distributed computing should be
        used to accelerate training.

        chunksize (int, optional) - Number of documents to be used in each
        training chunk.

        passes (int, optional) - Number of passes through the corpus during
        training.

        update_every (int, optional) - Number of documents to be iterated
        through for each update. Set to 0 for batch learning, > 1 for
        online iterative learning.

        alpha ({numpy.ndarray, str}, optional) - Can be set to an 1D array of
        length equal to the number of expected topics that expresses our
        a-priori belief for each topic's probability. Alternatively
        default prior selecting strategies can be employed by supplying
        a string:

            'asymmetric': Uses a fixed normalized asymmetric prior of
            1.0 / topicno.

            'auto': Learns an asymmetric prior from the corpus
            (not available if distributed==True).

        eta ({float, np.array, str}, optional) - A-priori belief on word
        probability, this can be:

            scalar for a symmetric prior over topic/word probability,

            vector of length num_words to denote an asymmetric user defined
            probability for each word,

            matrix of shape (num_topics, num_words) to assign a probability
            for each word-topic combination,

            the string 'auto' to learn the asymmetric prior from the data.

        decay (float, optional) - A number between (0.5, 1] to weight what
        percentage of the previous lambda value is forgotten when each new
        document is examined.

        offset (float, optional) - Hyper-parameter that controls how much
        we will slow down the first steps the first few iterations.

        eval_every (int, optional) - Log perplexity is estimated every
        that many updates. Setting this to one slows down training by ~2x.

        iterations (int, optional) - Maximum number of iterations through the
        corpus when inferring the topic distribution of a corpus.

        gamma_threshold (float, optional) - Minimum change in the value of the
        gamma parameters to continue iterating.

        random_state ({np.random.RandomState, int}, optional) - Either a
        RandomState object or a seed to generate one.
        Useful for reproducibility.
        """
        super().__init__()
        self.hyperparameters = dict()
        self.hyperparameters["num_topics"] = num_topics
        self.hyperparameters["distributed"] = distributed
        self.hyperparameters["chunksize"] = chunksize
        self.hyperparameters["passes"] = passes
        self.hyperparameters["update_every"] = update_every
        self.hyperparameters["alpha"] = alpha
        self.hyperparameters["eta"] = eta
        self.hyperparameters["decay"] = decay
        self.hyperparameters["offset"] = offset
        self.hyperparameters["eval_every"] = eval_every
        self.hyperparameters["iterations"] = iterations
        self.hyperparameters["gamma_threshold"] = gamma_threshold
        self.hyperparameters["random_state"] = random_state

    def info(self):
        """
        Returns model information (citation and display name)
        """
        return {
            "citation": citations.models_LDA,
            "name": "LDA, Latent Dirichlet Allocation"
        }

    def hyperparameters_info(self):
        """
        Returns hyperparameters information
        """
        return defaults.LDA_hyperparameters_info

    def set_hyperparameters(self, **kwargs):
        """
        Set model hyperparameters
        """
        super().set_hyperparameters(**kwargs)
        # Allow alpha to be a float in case of symmetric alpha: expand it to
        # one value per topic
        if "alpha" in kwargs:
            if isinstance(kwargs["alpha"], float):
                self.hyperparameters["alpha"] = [
                    kwargs["alpha"]
                ] * self.hyperparameters["num_topics"]

    def partitioning(self, use_partitions, update_with_test=False):
        """
        Handle the partitioning system to use and reset the model to perform
        new evaluations

        Parameters
        ----------
        use_partitions: True if train/test partitioning is needed, False
                        otherwise
        update_with_test: True if the model should be updated with the test set,
                          False otherwise
        """
        self.use_partitions = use_partitions
        self.update_with_test = update_with_test
        # drop everything derived from the previous corpus so the next
        # train_model call rebuilds it
        self.id2word = None
        self.id_corpus = None
        self.tfidf_model = None

    def train_model(self, dataset, hyperparams=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparams : optional dict of hyperparameters overriding the ones
                      stored on the model; the dict itself is not modified
        top_words : if greater than 0 returns the most significant words for
                    each topic in the output (default 10)
        Returns
        -------
        result : dictionary with the entries 'topic-word-matrix',
                 'topic-document-matrix' and, if top_words > 0, 'topics'.
                 When partitions are used it also holds
                 'test-topic-document-matrix' and, if the model is updated
                 with the test set, 'test-topic-word-matrix' and
                 'test-topics'.
        """
        # Work on a copy: 'corpus', 'id2word', ... are added below and must
        # not leak into the dict owned by the caller.
        hyperparams = {} if hyperparams is None else dict(hyperparams)

        if self.use_partitions:
            train_corpus, test_corpus = dataset.get_partitioned_corpus(
                use_validation=False)
        else:
            train_corpus = dataset.get_corpus()

        # the dictionary covers the whole corpus so test documents can be
        # mapped to ids as well
        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())

        if self.id_corpus is None:
            self.id_corpus = [self.id2word.doc2bow(document)
                              for document in train_corpus]

        if "num_topics" not in hyperparams:
            hyperparams["num_topics"] = self.hyperparameters["num_topics"]

        # Allow alpha to be a float in case of symmetric alpha
        if "alpha" in hyperparams:
            if isinstance(hyperparams["alpha"], float):
                hyperparams["alpha"] = [
                    hyperparams["alpha"]
                ] * hyperparams["num_topics"]

        # Train on TF-IDF weights instead of raw counts. The fitted TF-IDF
        # model is kept on the instance so later inference uses the same
        # weighting.
        print("using tf-idf")
        self.tfidf_model = TfidfModel(self.id_corpus)
        hyperparams["corpus"] = self.tfidf_model[self.id_corpus]

        hyperparams["id2word"] = self.id2word
        self.hyperparameters.update(hyperparams)

        self.trained_model = ldamodel.LdaModel(**self.hyperparameters)

        result = {}

        result["topic-word-matrix"] = self.trained_model.get_topics()

        if top_words > 0:
            result["topics"] = self._top_words_per_topic(
                result["topic-word-matrix"], top_words)

        result["topic-document-matrix"] = self._get_topic_document_matrix()

        if self.use_partitions:
            test_bow = [self.id2word.doc2bow(
                document) for document in test_corpus]
            if self.update_with_test:
                # update with TF-IDF weighted documents, matching training
                self.trained_model.update(
                    [self.tfidf_model[document] for document in test_bow])
                # id_corpus stays bag-of-words; it is re-weighted on use
                self.id_corpus.extend(test_bow)

                result["test-topic-word-matrix"] = (
                    self.trained_model.get_topics())

                if top_words > 0:
                    result["test-topics"] = self._top_words_per_topic(
                        result["test-topic-word-matrix"], top_words)

                result["test-topic-document-matrix"] = (
                    self._get_topic_document_matrix())

            else:
                test_document_topic_matrix = []
                for document in test_bow:
                    # infer on the TF-IDF representation used for training
                    document_topics_tuples = self.trained_model[
                        self.tfidf_model[document]]
                    document_topics = np.zeros(
                        self.hyperparameters["num_topics"])
                    for single_tuple in document_topics_tuples:
                        document_topics[single_tuple[0]] = single_tuple[1]

                    test_document_topic_matrix.append(document_topics)
                result["test-topic-document-matrix"] = np.array(
                    test_document_topic_matrix).transpose()
        return result

    def _top_words_per_topic(self, topic_word_matrix, top_words):
        """
        Return, for each row of the topic-word matrix, the `top_words` words
        with the largest weights, ordered from largest to smallest weight.
        """
        topics_output = []
        for topic in topic_word_matrix:
            # argsort is ascending: take the last `top_words` ids and reverse
            top_k = np.argsort(topic)[-top_words:]
            top_k_words = list(reversed([self.id2word[i] for i in top_k]))
            topics_output.append(top_k_words)
        return topics_output

    def _get_topics_words(self, topk):
        """
        Return the `topk` most significant words for each topic.
        """
        topic_terms = []
        for i in range(self.hyperparameters["num_topics"]):
            topic_words_list = []
            for word_tuple in self.trained_model.get_topic_terms(i, topk):
                topic_words_list.append(self.id2word[word_tuple[0]])
            topic_terms.append(topic_words_list)
        return topic_terms

    def _get_topic_document_matrix(self):
        """
        Return the topic representation of the corpus as an array of shape
        (num_topics, number of documents in id_corpus).
        """
        # feed the model the TF-IDF weighted documents it was trained on; fall
        # back to the bag-of-words corpus if no TF-IDF model has been fitted
        if self.tfidf_model is None:
            corpus = self.id_corpus
        else:
            corpus = self.tfidf_model[self.id_corpus]

        doc_topic_tuples = []
        for document in corpus:
            doc_topic_tuples.append(
                self.trained_model.get_document_topics(document,
                                                       minimum_probability=0))

        topic_document = np.zeros((
            self.hyperparameters["num_topics"],
            len(doc_topic_tuples)))

        for ndoc in range(len(doc_topic_tuples)):
            document = doc_topic_tuples[ndoc]
            for topic_tuple in document:
                topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
        return topic_document

In [None]:
# train custom model on sampled data

# random_state pins the model's random initialisation so the extracted topics
# are reproducible across runs
model_trim = LDA(num_topics=6, random_state=42)


print("training lda...")

# Train the model using default partitioning choice; keep the 50 top words per topic
output = model_trim.train_model(dataset_trimmed_cleaned, top_words=50)

print("done training")

print(*list(output.keys()), sep="\n") # Print the output identifiers

# show the top words of up to the first 20 topics, one topic per line
for t in output['topics'][:20]:
  print(" ".join(t))

In [None]:
# print results: the top words of every topic, topics separated by blank lines
for t in output['topics']:
  print(" ".join(t))
  print("\n")

In [None]:
# TWO FUNCTIONS FOR POST-PROCESSING

from typing import List

# take the outputted lists and return only words that are unique to their respective lists
def unique_words(lists: List[List[str]]) -> List[List[str]]:
    """
    Keep, for every list, only the words that appear in no other list.

    Parameters
    ----------
    lists : one list of words per topic.

    Returns
    -------
    One list per input list, containing the words found in that list only.
    Words keep the order of their first occurrence in the input list (so a
    ranked topic stays ranked) and repeated words are reported once.
    """
    from collections import Counter

    # number of lists each word occurs in; set() makes a word repeated inside
    # one list count once for that list
    list_counts = Counter(word for sublist in lists for word in set(sublist))

    # dict.fromkeys removes repeats while preserving first-occurrence order,
    # which makes the output deterministic (iterating a set is not)
    return [
        [word for word in dict.fromkeys(sublist) if list_counts[word] == 1]
        for sublist in lists
    ]


# take outputted words and keep only words that show up in at most int(threshold * number of topics) OTHER topics
def unique_words_threshold(lists: List[List[str]], threshold: float) -> List[List[str]]:
    """
    Keep, for every list, the words that are shared with only a few other lists.

    A word of list i is kept when the number of OTHER lists containing it is
    at most ``int(threshold * len(lists))`` (the product is truncated, not
    rounded). With ``threshold=0`` this returns the words unique to each list.

    Parameters
    ----------
    lists : one list of words per topic.
    threshold : fraction of the number of lists used to derive the maximum
                number of other lists a kept word may appear in.

    Returns
    -------
    One list per input list with the kept words. Words keep the order of
    their first occurrence in the input list and repeated words are reported
    once.
    """
    from collections import Counter

    # maximum number of other lists a word may occur in and still be kept
    max_allowed_count = int(threshold * len(lists))

    # number of lists each word occurs in; set() makes a word repeated inside
    # one list count once for that list
    list_counts = Counter(word for sublist in lists for word in set(sublist))

    # list_counts[word] includes the current list, hence the "- 1".
    # dict.fromkeys removes repeats while preserving first-occurrence order,
    # which makes the output deterministic (iterating a set is not)
    return [
        [
            word
            for word in dict.fromkeys(sublist)
            if list_counts[word] - 1 <= max_allowed_count
        ]
        for sublist in lists
    ]

In [None]:
# show, per topic, the words that occur in no other topic (blank lines between topics)
for topic_only_words in unique_words(output['topics']):
    print(" ".join(topic_only_words))
    print("\n")

In [None]:
# filter the topics with a threshold of 0.2 and show the surviving words per topic
unique_threshold_20 = unique_words_threshold(output['topics'], 0.2)

for kept_words in unique_threshold_20:
    print(" ".join(kept_words))
    print("\n")