##### README
*Code as written works for the dataframe called alltrails_sample.csv from https://github.com/danidagan/trail-reports-supplemental*
- In this script, a topic model is run using the BERTopic architecture from https://maartengr.github.io/BERTopic
- We run the topic model without reducing topics first, then reduce topics automatically and manually, checking model output after each reduction
- This script saves a topic model prior to topic reduction, a CSV with top terms for each topic, and a CSV with topic-document probabilities and metadata for each review
- We imported CSVs into R for remaining analysis

**IMPORT DATA**

In [None]:
import pandas as pd

In [None]:
# Load the sample of AllTrails reviews (file from github)
data = pd.read_csv(r'alltrails_sample.csv')

# Remove excess columns: the unnamed CSV index column becomes the review id, 'X' is dropped
data.rename(columns={'Unnamed: 0': 'review_id'}, inplace=True)
data = data.drop('X', axis=1)

# Remove AllTrails review tags (rows whose text is one of these site tags rather than a review)
review_tags = ['Advertisement', 'First to review']
data = data[~data['review_string'].isin(review_tags)]

# Convert numeric values: strip thousands separators, then parse as numbers.
# 'trail_elevchange' must stay numeric from here on; never assign review text to it.
data['trail_elevchange'] = data['trail_elevchange'].replace(',', '', regex=True)
data['trail_elevchange'] = pd.to_numeric(data['trail_elevchange'])

# Replace "trial" typo
data['review_string'] = data['review_string'].replace('trial', 'trail', regex=True)

# Check Dtypes (DataFrame.info() prints its report itself and returns None)
data.info()

# Set seed
# NOTE(review): no seed is actually set anywhere in this cell, so model runs are
# presumably not reproducible exactly — confirm whether a seed was intended.

**TOPIC MODEL**

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic

*Submodels*

In [None]:
# --- Embedding submodel ---
# Pretrained sentence-transformers model; more on the pretrained embedding models:
# https://www.sbert.net/docs/pretrained_models.html
# (There's a tip in the BERTopic documentation that Gensim works best for short documents)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Vectorizer submodel ---
# * ngram_range=(1, 2): two-token terms are allowed, to account for concepts
#   captured by two words, e.g., parking lot
# * English stop words are removed
# * Default min_df/max_features because dataset is relatively small
# * Default tokenizer works well for western languages according to BERTopic documentation:
#   https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)

# --- Representation submodel ---
# By default BERTopic uses class-based TF-IDF for representation;
# this model uses MMR (maximal marginal relevance) to diversify terms
representation_model = MaximalMarginalRelevance(diversity=0.3)

# --- Submodels left at their defaults ---
# UMAP dimensionality reduction: BERTopic's default parameters
# HDBSCAN clustering: BERTopic's default parameters

*Model*

In [None]:
# Assemble the topic model from the three submodels configured in the "Submodels" cell.
# calculate_probabilities and verbose are both switched on.
topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the review texts and keep the two values fit_transform returns.
# NOTE(review): `topics` reflects the model before any topic reduction/merging done later — confirm before reusing it.
topics, probs = topic_model.fit_transform(data['review_string'])

In [None]:
# Persist the fitted model (before any topic reduction) in safetensors format,
# together with its c-TF-IDF data.
# NOTE: `embedding_model` is rebound here from the submodel object to its identifier string,
# which is what gets passed as `save_embedding_model`.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
savelocation = ""  # add file path
topic_model.save(
    savelocation,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=embedding_model,
)

In [None]:
# Reload the model saved at `savelocation`; `topic_model` refers to the loaded copy from here on
topic_model = BERTopic.load(savelocation)

**OUTPUTS**

*View outputs*
<br> - rerun this section after every iteration of topic reduction to see how output changes

In [None]:
# Retrieve the topic info table (top words per topic) and save it as a CSV
saveto = "" # add file path .csv
freq = topic_model.get_topic_info()
freq.to_csv(saveto)

# Preview the first 30 rows; this must be the last expression in the cell,
# otherwise the notebook discards the result instead of displaying it
freq.head(30)

In [None]:
# View intertopic distance map
# Left as the last expression of the cell so the notebook displays its result
topic_model.visualize_topics()

In [None]:
# Return number of topics
# This is the row count of `freq`, so rerun the cell that builds `freq` first if the model has changed.
# NOTE(review): if `freq` contains a row for an outlier topic (-1), that row is counted too — confirm.
len(freq)

In [None]:
# View hierarchical clustering (tree diagram)
# Left as the last expression of the cell so the notebook displays its result
topic_model.visualize_hierarchy()

In [None]:
# Top word scores (bar graph)
# Left as the last expression of the cell so the notebook displays its result
topic_model.visualize_barchart()

**TOPIC REDUCTION**

In [None]:
from bertopic import BERTopic

In [None]:
# Reduce the number of topics automatically, uses intertopic distance
# The return value is not kept: later cells rely on `topic_model` itself reflecting the reduction.
topic_model.reduce_topics(data['review_string'], nr_topics='auto')

In [None]:
# Manually reduce after reviewing topics for similarity.
# Each inner list is one group of topic ids to be merged into a single topic; these
# combinations were identified by viewing results after automatic topic reduction.
# NOTE(review): the ids refer to the topics of one particular model run — re-check them
# against the current output before merging.
topics_to_merge = [
    [47, 31],
    [40, 25],
    [33, 41],
    [24, 9, 5],
    [23, 1],
]

# Pass the same review texts the model was fitted on
topic_model.merge_topics(data['review_string'], topics_to_merge)

**CONSTRUCT REVIEW-LEVEL CSV**

*Extracting TD probabilities*

In [None]:
# Approximate the topic distribution of every review, processed in batches of 500.
# Only element [0] of the returned value is used further down (as the topic-document array).
docs_dist = topic_model.approximate_distribution(data['review_string'],batch_size=500)

In [None]:
# View the number of reviews to check for issues
# (compare with the number of rows of the topic-distribution dataframe built next)
print(len(data['review_string']))

In [None]:
# Create df holding the topic-document values
array_list = docs_dist[0] # extract array (first element of the approximate_distribution result)
df = pd.DataFrame(array_list) # Create a DataFrame from the array; it gets a fresh 0..n-1 index
print(df) # View (check for issues) — expected: one row per review, TODO confirm

In [None]:
# Reset indexing to combine dataframes
# Rows were filtered out of `data` after loading, so its index can have gaps; `df` has a
# fresh 0..n-1 index. The old index is dropped so both line up row by row for the concat.
data.reset_index(drop=True, inplace=True)

In [None]:
# Check for indexing (info() prints its report itself and returns None, so no print() is needed)
data.info()

In [None]:
# Check for indexing (info() prints its report itself and returns None, so no print() is needed)
df.info()

*Merge dataframes*

In [None]:
# Place the topic-document columns (`df`) side by side with the review metadata (`data`).
# axis=1 matches rows by index label, which is why `data` had its index reset beforehand.
merged_df = pd.concat([df, data], axis=1)

In [None]:
# Save dataframe as csv
# NOTE: this rebinds `saveto`, which is also used for the topic-info CSV path earlier in the notebook
saveto = "" # add file path
merged_df.to_csv(saveto)

**Check coherence**

In [None]:
import gensim.corpora as corpora

# Use the model's current topic assignments. The `topics` variable returned by
# fit_transform describes the model before topic reduction/merging and would be stale here.
coherence_topics = topic_model.topics_

# One row per review with its assigned topic
documents = pd.DataFrame({"Document": data['review_string'],
                          "ID": range(len(data['review_string'])),
                          "Topic": coherence_topics})

# Join all reviews of a topic into one long document per topic, then preprocess the text
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation:
# tokenise each per-topic document with the model's analyzer and build the gensim inputs
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Top words of every topic except the outlier topic (-1), whether or not outliers exist
topic_ids = sorted(set(coherence_topics) - {-1})
topic_words = [[word for word, _ in topic_model.get_topic(topic)]
               for topic in topic_ids]

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the topics' top words against the tokenised per-topic documents
# using gensim's 'c_v' coherence measure
coherence_model = CoherenceModel(
    coherence="c_v",
    topics=topic_words,
    texts=tokens,
    dictionary=dictionary,
    corpus=corpus,
)

# Single coherence value for the whole model
coherence = coherence_model.get_coherence()

In [None]:
# Show the coherence value computed by the CoherenceModel cell
print(coherence)