# Summarizing BERTopic Results

### Getting topics file

In [2]:
import pandas as pd
import ast

df = pd.read_csv("results/bertopic_topic_info.csv")

df['Representation'] = df['Representation'].apply(ast.literal_eval)
df['Representative_Docs'] = df['Representative_Docs'].apply(ast.literal_eval)

print(df.head())
print(df.dtypes)

   Topic  Count                            Name  \
0     -1   1708                -1_the_and_to_of   
1      0    306  0_flow_bedrock_groundwater_the   
2      1    178    1_land_exchange_forest_lands   
3      2    119         2_eis_final_project_the   
4      3     98          3_copper_nickel_we_our   

                                      Representation  \
0    [the, and, to, of, in, that, is, for, be, this]   
1  [flow, bedrock, groundwater, the, model, pit, ...   
2  [land, exchange, forest, lands, service, natio...   
3  [eis, final, project, the, and, environmental,...   
4  [copper, nickel, we, our, mining, and, mine, t...   

                                 Representative_Docs  
0  [I strongly oppose the PolyMet mine project. T...  
1  [Not only had the MDNR allowed the Northshore ...  
2  [WE object to the U.S. Forest Service proposal...  
3  [This EIS is the culmination of 10 years of st...  
4  [I oppose the current PolyMet NorthMet copper-...  
Topic                   in

### Summarizing topics

In [6]:
from transformers import pipeline
from tqdm import tqdm

tqdm.pandas()

print("Starting importing")
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
print("Finished importing")

def summarize_docs(docs):
    text = " ".join(docs)[:1024] 
    return summarizer(text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']

df['Summary'] = df['Representative_Docs'].progress_apply(summarize_docs)

Starting importing


Device set to use cpu


Finished importing


 56%|█████▌    | 38/68 [02:02<01:41,  3.40s/it]Your max_length is set to 60, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
100%|██████████| 68/68 [03:46<00:00,  3.33s/it]


In [7]:
df.to_csv("results/bertopic_summarized.csv", index=False)