# Processing public comments

### Formatting / setting up the data

In [3]:
import tabula
import pandas as pd
import string

pdf_path = "Public comments - Northmet.pdf"
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True)
tables_no_headers = [df.iloc[1:].copy() for df in tables if len(df) > 1]
df_all = pd.concat(tables_no_headers, ignore_index=True)

for col in df_all.select_dtypes(include='object').columns:
    df_all[col] = df_all[col].map(lambda x: str(x).replace('\r', ' ').replace('\n', ' ').strip() if pd.notnull(x) else x)

df_all.columns = [
    "Name of Sender",
    "Comment",
    "Issue",
    "Substantive / Non-Substantive",
    "Old / New",
    "Response ID",
    "RGU Consideration"
][:len(df_all.columns)]

rows_to_drop = []
for i in range(1, len(df_all)):
    name = df_all.iloc[i, 0]
    if pd.isna(name) or str(name).strip() == '':
        prev_comment = str(df_all.at[i - 1, "Comment"]).strip()
        curr_comment = str(df_all.at[i, "Comment"]).strip()
        df_all.at[i - 1, "Comment"] = prev_comment + ' ' + curr_comment
        rows_to_drop.append(i)

df_all.drop(index=rows_to_drop, inplace=True)
df_all.reset_index(drop=True, inplace=True)

df_all["Comment"] = df_all["Comment"].astype(str).map(lambda x: x.strip()).replace("", pd.NA)
df_all = df_all.dropna(subset=["Comment"])

df_all.to_csv("combined_responses.csv", index=False)

filler_words = set([
    "the", "and", "but", "or", "so", "because", "like", "just", "really", "very",
    "actually", "basically", "literally", "you", "know", "i", "me", "my", "we", "us",
    "our", "he", "she", "they", "them", "their", "it", "its", "a", "an", "to", "of",
    "in", "on", "at", "with", "for", "from", "by", "that", "this", "is", "was", "are",
    "were", "be", "been", "do", "does", "did", "have", "has", "had", "if", "as", "am"
])

def clean_comment(comment):
    comment = comment.lower()
    comment = comment.translate(str.maketrans('', '', string.punctuation))
    words = comment.split()
    cleaned_words = [word for word in words if word not in filler_words]

    return ' '.join(cleaned_words)

df_all["Comment"] = df_all["Comment"].apply(clean_comment)

df_all.to_csv("combined_cleaned_responses.csv", index=False)

display(df_all.head(20))

Unnamed: 0,Name of Sender,Comment,Issue,Substantive / Non-Substantive,Old / New,Response ID,RGU Consideration
0,Kathleen Whitson,please not approve mining will profit owners w...,GEN,NS,X,1,
1,Mark,environment will eventually polluted northmet ...,FIN,NS,X,1,
2,Bob Woodbury,there other projects nature successful what de...,PER,NS,X,1,
3,Bob Woodbury,could go vein point need rely what “with techn...,PER,NS,X,1,
4,Bob Woodbury,need make decision what not what think,NEPA,NS,X,1,
5,Bob Woodbury,not northern minnesota concern not minnesota c...,NEPA,NS,X,1,
6,Bob Woodbury,what happens project denied there will other l...,ALT,NS,X,1,
7,Bob Woodbury,lands vast isnt only place country where these...,ALT,NS,X,1,
8,John-Marilyn Rossi,understanding returning area affected mining o...,ALT,S,O,8,"SDEIS Themes ALT 03, ALT 06, ALT 13"
9,John-Marilyn Rossi,why state minnesota not requiring full reclama...,ALT,S,O,8,"SDEIS Themes ALT 03, ALT 06, ALT 13"


### Using BERTopic to process it

In [4]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

comments = df_all["Comment"].tolist()

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model, verbose=True)
topics, probs = topic_model.fit_transform(comments)
topic_info_df = topic_model.get_topic_info()
topic_info_df.to_csv("results/bertopic_topic_info.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm





2025-06-02 15:13:56,435 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 140/140 [00:21<00:00,  6.42it/s]
2025-06-02 15:14:18,590 - BERTopic - Embedding - Completed ✓
2025-06-02 15:14:18,591 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-02 15:14:34,578 - BERTopic - Dimensionality - Completed ✓
2025-06-02 15:14:34,581 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-02 15:14:34,717 - BERTopic - Cluster - Completed ✓
2025-06-02 15:14:34,720 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-02 15:14:34,885 - BERTopic - Representation - Completed ✓


### Visualization

In [5]:
# import plotly.io as pio

# print("0/3")
# bar_chart = topic_model.visualize_barchart(top_n_topics=10)
# pio.write_image(bar_chart, "results/bertopic_barchart.png", format="png")
# print("1/3")
# topic_vis = topic_model.visualize_topics()
# pio.write_image(topic_vis, "results/bertopic_topics.png", format="png")
# print("2/3")
# hierarchy = topic_model.visualize_hierarchy()
# pio.write_image(hierarchy, "results/bertopic_hierarchy.png", format="png")
# print("3/3")
topic_model.visualize_barchart(top_n_topics=10).write_html("results/bar_chart.html")
topic_model.visualize_topics().write_html("results/topics.html")
topic_model.visualize_hierarchy().write_html("results/hierarchy.html")