# Summarizing BERTopic Results

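### Building the per-issue comments file

Load the combined responses, then join all comments that share the same `Issue` into a single text per issue.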

In [3]:
import pandas as pd

# Load the raw responses. Header names are stripped of surrounding whitespace
# because the lookups below use the exact names "Issue" and "Comment".
df = pd.read_csv("combined_responses.csv")
df.columns = df.columns.str.strip()

# Concatenate every comment of an issue into one space-separated string.
# Missing comments are dropped first: astype(str) would otherwise turn NaN into
# the literal text "nan" and mix it into the text that is summarized later.
grouped_comments = df.groupby("Issue")["Comment"].apply(
    lambda comments: " ".join(comments.dropna().astype(str))
)
print(grouped_comments.shape)

# Keep the index so every row of the file stays labelled with its issue;
# with index=False the file is an anonymous list of texts.
grouped_comments.to_csv("grouped_comments_test.csv", index=True)

# Last expression of the cell so the preview is actually rendered.
grouped_comments.head()

(28,)


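### Summarizing topics

Summarize the joined comments of each issue with `facebook/bart-large-cnn` and save one row per issue (issue, number of comments, summary), sorted by comment count.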

In [6]:
from transformers import pipeline
from tqdm import tqdm

# Cheap character-level cap applied before tokenization. It only bounds the
# amount of text handed to the tokenizer; it does not bound the token count.
MAX_INPUT_CHARS = 4000

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# (issue, number of comments, summary) records, one per issue.
summaries = []
counts = df["Issue"].value_counts().to_dict()

# Series.items() is a generator without a length, so pass `total` explicitly
# to get a percentage and ETA in the progress bar.
for issue_id, all_comments in tqdm(
    grouped_comments.items(),
    total=len(grouped_comments),
    desc="Summarizing issues",
):
    try:
        input_text = all_comments[:MAX_INPUT_CHARS]
        if not input_text.strip():
            raise ValueError("no comment text to summarize")
        # truncation=True asks the tokenizer to cut inputs that are still
        # longer than the model's maximum input length after the character cap.
        result = summarizer(input_text, do_sample=False, truncation=True)
        summary = result[0]["summary_text"]
    except Exception as exc:
        # Best effort: keep processing the remaining issues, but report the
        # cause instead of hiding it. Catching Exception (not a bare except)
        # lets KeyboardInterrupt stop a long run.
        tqdm.write(f"Summary failed for issue {issue_id!r}: {exc!r}")
        summary = "[Summary Failed]"
    summaries.append((issue_id, counts.get(issue_id, 0), summary))

# Most frequently raised issues first.
summary_df = pd.DataFrame(summaries, columns=["Issue", "Count", "Summary"])
summary_df = summary_df.sort_values(by="Count", ascending=False)
summary_df.to_csv("issue_summaries.csv", index=False)


Device set to use cpu
Summarizing issues: 6it [00:45,  7.57s/it]Your max_length is set to 142, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)
Summarizing issues: 16it [01:56,  7.02s/it]Your max_length is set to 142, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Summarizing issues: 28it [03:26,  7.38s/it]


In [7]:
import os

# Make sure the output directory exists before writing, so the export also
# works on a fresh checkout where "results/" has not been created yet.
os.makedirs("results", exist_ok=True)
summary_df.to_csv("results/issues_summarized.csv", index=False)