## Import Libraries, Data, and Filter by Response Length

In [1]:
import pandas as pd
import string
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

file_name = "file"

data = pd.read_excel( file_name + ".xlsx")
data['text_column'] = data['text_column'].astype(str)
data = data[data['text_column'].str.len() > 2].reset_index()

## Run BERTopic Model on Responses, Print Categories

In [2]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    verbose=True
)
topics, probs = model.fit_transform(data['text_column'])

freq = model.get_topic_info()
freq

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

2024-03-14 09:37:14,256 - BERTopic - Transformed documents to Embeddings
2024-03-14 09:37:29,597 - BERTopic - Reduced dimensionality
2024-03-14 09:37:29,709 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11,-1_good_good good_great presentation_review good,"[good, good good, great presentation, review g...","[GOOD, Good, Good]"
1,0,305,0_obesity_weight_patients_medications,"[obesity, weight, patients, medications, loss,...","[More on obesity, Obesity and contraception/fa..."
2,1,32,1_xxx_ty nash_ty_nash xxx,"[xxx, ty nash, ty, nash xxx, nash, , , , , ]","[NASH, xxx, any ty]"
3,2,31,2_nan_nan nan_nan na_na,"[nan, nan nan, nan na, na, , , , , , ]","[nan, nan, nan]"
4,3,19,3_nil_nil nil_nill nil_nill,"[nil, nil nil, nill nil, nill, eee, eee nill, ...","[nil, NIL, NIL]"
5,4,19,4_time_time time_time think_think,"[time, time time, time think, think, time dont...","[None at this time, None at this time., none a..."
6,5,16,5_yes_sure_yes yes_say dont,"[yes, sure, yes yes, say dont, hard say, say, ...","[yes, Yes, yes]"
7,6,12,6_comment_comment comment_practice_great content,"[comment, comment comment, practice, great con...","[no comment, No comment, no comment]"


## Filter Responses by Topic "0" and rerun BERTopic Categorization, Print Categories

In [3]:
data['Topic'] = model.topics_
topic_0_df = data.loc[data['Topic'] == 0]

In [4]:
model0 = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    verbose=True
)
topics0, probs0 = model0.fit_transform(topic_0_df['text_column'])

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-03-14 09:37:33,644 - BERTopic - Transformed documents to Embeddings
2024-03-14 09:37:37,673 - BERTopic - Reduced dimensionality
2024-03-14 09:37:37,720 - BERTopic - Clustered reduced embeddings


In [5]:
freq0 = model0.get_topic_info()
freq0

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,54,-1_na_covid_insurance_complications,"[na, covid, insurance, complications, covid re...","[COVID RELATED COMPLICATIONS., I would like to..."
1,0,73,0_cardiology_patient_sleep apnea_apnea,"[cardiology, patient, sleep apnea, apnea, orth...","[Orthopedics, cardiology, diabetes, Conversati..."
2,1,65,1_obesity_patients_weight_obesity obesity,"[obesity, patients, weight, obesity obesity, s...","[obesity, More on obesity, Obesity and contrac..."
3,2,32,2_new_medications_medication_new medications,"[new, medications, medication, new medications...","[continue updates n new meds, updates on new m..."
4,3,26,3_diabetes_insulin_medications diabetes_medica...,"[diabetes, insulin, medications diabetes, medi...","[The Educational Impact of Web-Based, Faculty-..."
5,4,20,4_diet_nutrition_exercise_education,"[diet, nutrition, exercise, education, diets, ...",[Diet and exercise plan for mid to late age pa...
6,5,18,5_weight_loss_weight loss_weight management,"[weight, loss, weight loss, weight management,...",[I would love to learn more about weight loss ...
7,6,17,6_bariatric_bariatric surgery_surgery_complica...,"[bariatric, bariatric surgery, surgery, compli...","[complications of bariatric surgery, Bariatric..."


## Overwrite Topics in Filtered Responses with New Topic Tags, Names (To Do)

In [6]:
topic_0_df['Topic'] = model0.topics_

## Export Report as Excel File

In [7]:
with pd.ExcelWriter( file_name + "_analysis.xlsx") as writer:
    topic_0_df.to_excel(writer, sheet_name="Filtered_Responses", index=True)
    freq0.to_excel(writer, sheet_name="Categories_Overview", index=False)