# BERTopic
Using BERTopic to identify topics in dementia forum text

## Data Setup
Read data into a list where each document is an item in the list

In [None]:
# Read documents from the file
# corpus_threads_combined.txt contains all dementia forum data
# Each thread in the forum is represented as a document and separated by a new line

with open('../data/corpus_threads_combined.txt', 'r', encoding='utf-8') as file:
    # Iterate line by line instead of read().split('\n'): a file that ends with a
    # newline would otherwise produce a trailing '' entry. Blank lines are skipped
    # so that no empty documents are handed to the topic model.
    documents = [line.rstrip('\n') for line in file if line.strip()]

In [None]:
# install bertopic

!pip install bertopic

## Approach 1: 
- **Embedding Model:** [all-MiniLM-L6-v2 Sentence Transformer](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
- **Dimensionality Reduction:** UMAP
- **Clustering:** HDBSCAN
- **Tokenizer:** *default* (BERTopic falls back to its built-in `CountVectorizer` when no vectorizer is passed)
- **Weighting Scheme:** c-TF-IDF
- **Representation Tuning:** *none*

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence-transformer checkpoint used to embed each document
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# BERTopic pipeline: only the embedding model and verbosity are set here,
# every other argument is left at its default
topic_model = BERTopic(
    embedding_model=embedding_model,
    verbose=True,
)

# Fit on the corpus and keep both values returned by fit_transform
topics, probs = topic_model.fit_transform(documents)

In [None]:
# Show results and intertopic distance map visualization
# display() uses the notebook's rich rendering for the topic table, whereas
# print() would fall back to a plain-text dump.
display(topic_model.get_topic_info())
topic_model.visualize_topics()

### Clean up data
Build a stop-word list that combines spaCy's English stop words with custom additions.

In [None]:
# Build the stop-word list: spaCy's English stop words plus custom additions
# that should also be removed.
# STOP_WORDS is imported directly so the name `stop_words` is only ever bound
# to the final list, never to the spaCy module.
from spacy.lang.en.stop_words import STOP_WORDS

# Custom additions (each word listed once)
custom_stop_words = [
    'with', 'my', 'your', 'she', 'this', 'was', 'her', 'have', 'as', 'he',
    'him', 'but', 'not', 'so', 'are', 'at', 'be', 'has', 'do', 'got',
    'how', 'on', 'or', 'would', 'will', 'what', 'they', 'if', 'get', 'can',
    'we', 'me', 'his', 'there', 'them', 'just', 'am', 'by', 'that', 'from',
    'it', 'is', 'in', 'you', 'also', 'very', 'had', 'a', 'an', 'for',
]

# The set union drops words that appear in both sources; sorted() gives the
# resulting list a stable order from run to run.
stop_words = sorted(set(STOP_WORDS) | set(custom_stop_words))

## Approach 2: 
- **Embedding Model:** [all-MiniLM-L6-v2 Sentence Transformer](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
- **Dimensionality Reduction:** UMAP
- **Clustering:** HDBSCAN
- **Tokenizer:** CountVectorizer
- **Weighting Scheme:** c-TF-IDF
- **Representation Tuning:** *none*

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Pass the combined stop-word list (spaCy + custom) built in the clean-up cell.
# Passing only custom_stop_words would leave the rest of spaCy's stop words
# in the vocabulary used for the topic representations.
vectorizer_model = CountVectorizer(stop_words=stop_words)

# Same embedding model as Approach 1; only the vectorizer differs
topic_model_2 = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
    verbose=True,
)