# Processing public comments

### Formatting / setting up the data

In [3]:
import tabula
import pandas as pd

pdf_path = "Public comments - Northmet.pdf"

# Extract every ruled ("lattice") table from every page of the PDF.
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True)

# Skip the first row of each extracted table and ignore tables with at most one row.
# NOTE(review): presumably that first row is a repeated header row — confirm against the PDF.
tables_no_headers = [df.iloc[1:].copy() for df in tables if len(df) > 1]
df_all = pd.concat(tables_no_headers, ignore_index=True)

# Flatten in-cell line breaks (PDF text wrapping) to single spaces; leave missing values alone.
for col in df_all.select_dtypes(include='object').columns:
    df_all[col] = df_all[col].map(
        lambda x: str(x).replace('\r', ' ').replace('\n', ' ').strip() if pd.notnull(x) else x
    )

# Assign the expected headers, truncated to however many columns were actually extracted.
df_all.columns = [
    "Name of Sender",
    "Comment",
    "Issue",
    "Substantive / Non-Substantive",
    "Old / New",
    "Response ID",
    "RGU Consideration"
][:len(df_all.columns)]


def _is_blank(value):
    """Return True when `value` is missing or contains only whitespace."""
    return pd.isna(value) or str(value).strip() == ''


# A row with no sender name is treated as the continuation of the comment above it.
# Fragments are appended to the most recent row that is being KEPT, so a run of
# several consecutive continuation rows is merged in full. (Appending to row i - 1
# loses text whenever row i - 1 is itself a continuation row that gets dropped.)
rows_to_drop = []
last_kept = 0  # row 0 is never treated as a continuation
for i in range(1, len(df_all)):
    if _is_blank(df_all.iloc[i, 0]):
        fragment = df_all.at[i, "Comment"]
        if not _is_blank(fragment):
            base = df_all.at[last_kept, "Comment"]
            # Treat a missing base comment as empty so the text "nan" is never injected.
            base_text = "" if pd.isna(base) else str(base).strip()
            df_all.at[last_kept, "Comment"] = (base_text + ' ' + str(fragment).strip()).strip()
        rows_to_drop.append(i)
    else:
        last_kept = i

df_all = df_all.drop(index=rows_to_drop).reset_index(drop=True)

# Normalise comments: strip whitespace and mark missing/empty ones as NA *before*
# any string conversion, so they are really removed by dropna instead of surviving
# as the literal string "nan".
df_all["Comment"] = df_all["Comment"].map(
    lambda x: pd.NA if _is_blank(x) else str(x).strip()
)
df_all = df_all.dropna(subset=["Comment"]).reset_index(drop=True)

df_all.to_csv("combined_cleaned_responses.csv", index=False)
display(df_all.head(20))

Unnamed: 0,Name of Sender,Comment,Issue,Substantive / Non-Substantive,Old / New,Response ID,RGU Consideration
0,Kathleen Whitson,PLEASE do NOT approve the mining. It will prof...,GEN,NS,X,1,
1,Mark,The environment will eventually be polluted by...,FIN,NS,X,1,
2,Bob Woodbury,Have there been other projects of this nature ...,PER,NS,X,1,
3,Bob Woodbury,"I could go on in this vein, but my point is th...",PER,NS,X,1,
4,Bob Woodbury,"We need to make a decision on what we know, no...",NEPA,NS,X,1,
5,Bob Woodbury,This is not just a Northern Minnesota concern....,NEPA,NS,X,1,
6,Bob Woodbury,What happens if the project is denied. There w...,ALT,NS,X,1,
7,Bob Woodbury,Our lands are vast. This isn't the only place ...,ALT,NS,X,1,
8,John-Marilyn Rossi,It is my understanding that returning the area...,ALT,S,O,8,"SDEIS Themes ALT 03, ALT 06, ALT 13"
9,John-Marilyn Rossi,Why is the state of Minnesota not requiring fu...,ALT,S,O,8,"SDEIS Themes ALT 03, ALT 06, ALT 13"


### Topic modeling the comments with BERTopic

In [4]:
import os

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# Make sure the output directory exists up front, so a missing folder is not
# discovered only after the embedding/clustering work has finished.
os.makedirs("results", exist_ok=True)

# One document per cleaned comment row.
comments = df_all["Comment"].tolist()

# Embed the comments with a sentence-transformer model and let BERTopic run its
# default pipeline on top of those embeddings.
# NOTE(review): no random seed is set here; topic assignments may differ between runs.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model, verbose=True)
topics, probs = topic_model.fit_transform(comments)

# Persist the per-topic summary table for inspection outside the notebook.
topic_info_df = topic_model.get_topic_info()
topic_info_df.to_csv("results/bertopic_topic_info.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm





2025-05-28 09:12:03,921 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 140/140 [00:29<00:00,  4.74it/s]
2025-05-28 09:12:33,590 - BERTopic - Embedding - Completed ✓
2025-05-28 09:12:33,590 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-28 09:12:48,803 - BERTopic - Dimensionality - Completed ✓
2025-05-28 09:12:48,806 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-28 09:12:48,929 - BERTopic - Cluster - Completed ✓
2025-05-28 09:12:48,929 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-28 09:12:49,142 - BERTopic - Representation - Completed ✓


### Visualization

In [6]:
import os

# Static PNG export, currently disabled; the interactive HTML export below is used instead.
# import plotly.io as pio

# bar_chart = topic_model.visualize_barchart(top_n_topics=10)
# pio.write_image(bar_chart, "results/bertopic_barchart.png", format="png")

# topic_vis = topic_model.visualize_topics()
# pio.write_image(topic_vis, "results/bertopic_topics.png", format="png")

# hierarchy = topic_model.visualize_hierarchy()
# pio.write_image(hierarchy, "results/bertopic_hierarchy.png", format="png")

# Create the output directory if needed so this cell also works on a fresh checkout.
os.makedirs("results", exist_ok=True)

# Write each BERTopic figure as a standalone HTML file.
topic_model.visualize_barchart(top_n_topics=10).write_html("results/bar_chart.html")
topic_model.visualize_topics().write_html("results/topics.html")
topic_model.visualize_hierarchy().write_html("results/hierarchy.html")