In [1]:
import sys

# Make modules under ../src importable (presumably where config.py lives — verify).
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate entries in sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
import pandas as pd
import spacy
from collections import Counter

from config import CLEANED_COMPLAINTS_FILE  # <- path comes from your config.py

# Load the German spaCy pipeline with the NER component disabled.
# If the model is missing, install it with: python -m spacy download de_core_news_sm
try:
    nlp = spacy.load("de_core_news_sm", disable=["ner"])
except OSError as exc:
    # Same exception type as before, but the message now carries the install command.
    raise OSError(
        "spaCy model 'de_core_news_sm' could not be loaded. "
        "If it is not installed, run: python -m spacy download de_core_news_sm"
    ) from exc

#### Load cleaned texts without noun-only filtering (LDA + BERTopic columns)

In [3]:
# Read the cleaned complaints; the file path is imported from config.
df = pd.read_csv(CLEANED_COMPLAINTS_FILE)

lda_col = "lda_description"
bertopic_col = "bertopic_description"

# Fail early with a clear message if either cleaned text column is absent.
missing = [c for c in [lda_col, bertopic_col] if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected cleaned text columns in CSV: {missing}")

# Missing cells become empty strings so every entry handed to spaCy is a str.
lda_texts = df[lda_col].fillna("").astype(str).tolist()
bertopic_texts = df[bertopic_col].fillna("").astype(str).tolist()

print("Rows:", len(df))
# Preview the first row of each variant. The guard avoids an IndexError on [0]
# when the CSV contains a header but no data rows.
if lda_texts:
    print("Example LDA cleaned:", lda_texts[0][:120])
    print("Example BERTopic cleaned:", bertopic_texts[0][:120])
else:
    print("No rows found in", CLEANED_COMPLAINTS_FILE)

Rows: 930
Example LDA cleaned: bänke fehlen rückenlehn bruchstelle scharfkantig
Example BERTopic cleaned: bei 4 von 5 bänken fehlen die rückenlehnen bruchstellen sind scharfkantig


#### POS-tag tokens from full texts (context-aware)

In [4]:
# Purpose: Extract token frequencies AND POS tags using sentence context

def extract_token_pos_counts(texts, nlp, exclude_stop=True, batch_size=256):
    """Count lower-cased tokens and POS tags across a collection of texts.

    Whole texts are streamed through ``nlp.pipe``, so each token is tagged as
    part of its document rather than as an isolated word.

    Args:
        texts: Iterable of texts to analyse.
        nlp: Pipeline object exposing ``pipe(texts, batch_size=...)`` that yields
            iterables of tokens with ``text``, ``pos_``, ``is_space``,
            ``is_punct`` and ``is_stop`` attributes (e.g. a loaded spaCy model).
        exclude_stop: If True, tokens flagged ``is_stop`` are not counted.
        batch_size: Number of texts passed to ``nlp.pipe`` per batch
            (defaults to 256).

    Returns:
        A ``(token_counter, pos_counter)`` tuple of ``collections.Counter``
        objects: occurrences per lower-cased token text, and occurrences per
        POS tag over the same counted tokens.
    """
    token_counter = Counter()
    pos_counter = Counter()

    for doc in nlp.pipe(texts, batch_size=batch_size):
        for t in doc:
            # Whitespace and punctuation carry no lexical content.
            if t.is_space or t.is_punct:
                continue
            if exclude_stop and t.is_stop:
                continue

            tok = t.text.lower().strip()
            if not tok:
                continue

            # Both counters are updated together so their totals always match.
            token_counter[tok] += 1
            pos_counter[t.pos_] += 1

    return token_counter, pos_counter

#### Run POS evidence for LDA-cleaned texts

In [5]:
# Output:
# 1) Top-N frequent tokens + POS tag (context-aware)
# 2) POS distribution table (counts + %)

# How many of the most frequent tokens to report (e.g. 30, 50 or 100, depending on how much evidence is wanted).
TOP_N = 50

# One pass over the LDA-cleaned texts: token frequencies plus overall POS tag counts.
lda_token_counts, lda_pos_counts = extract_token_pos_counts(
    texts=lda_texts,
    nlp=nlp,
    exclude_stop=True,
)

# Build the top-token table with one POS label per token. spaCy tags every occurrence in its own context,
# so the same token can receive different tags; we count the tags per token and report the most frequent one.
def dominant_pos_per_token(texts, nlp, tokens_of_interest, exclude_stop=True, batch_size=256):
    """Return the most frequent POS tag for each requested token.

    The texts are streamed through ``nlp.pipe`` and, for every counted token
    whose lower-cased text is in ``tokens_of_interest``, the assigned POS tag
    is tallied. Token filtering mirrors ``extract_token_pos_counts``:
    whitespace and punctuation are skipped, and stop words too when
    ``exclude_stop`` is True.

    Args:
        texts: Iterable of texts to analyse.
        nlp: Pipeline object exposing ``pipe(texts, batch_size=...)`` that yields
            iterables of tokens with ``text``, ``pos_``, ``is_space``,
            ``is_punct`` and ``is_stop`` attributes (e.g. a loaded spaCy model).
        tokens_of_interest: Iterable of lower-cased token strings to look up.
        exclude_stop: If True, tokens flagged ``is_stop`` are ignored.
        batch_size: Number of texts passed to ``nlp.pipe`` per batch
            (defaults to 256).

    Returns:
        Dict mapping each token of interest to its most frequent POS tag, or
        to ``"NA"`` when the token was never counted.
    """
    token_pos = {tok: Counter() for tok in tokens_of_interest}

    for doc in nlp.pipe(texts, batch_size=batch_size):
        for t in doc:
            if t.is_space or t.is_punct:
                continue
            if exclude_stop and t.is_stop:
                continue

            tok = t.text.lower().strip()
            if tok in token_pos:
                token_pos[tok][t.pos_] += 1

    # most_common(1) yields the top (pos, count) pair; empty counters fall back to "NA".
    return {tok: cnts.most_common(1)[0][0] if cnts else "NA" for tok, cnts in token_pos.items()}

# Rank tokens by corpus frequency and look up the most frequent POS tag for each of them.
top_lda_tokens = [w for w, _ in lda_token_counts.most_common(TOP_N)]
lda_dom_pos = dominant_pos_per_token(lda_texts, nlp, top_lda_tokens, exclude_stop=True)

# Explicit column names keep the table schema stable even when no tokens were counted.
lda_top_df = pd.DataFrame(
    [{"token": w, "freq": lda_token_counts[w], "dominant_pos": lda_dom_pos.get(w, "NA")} for w in top_lda_tokens],
    columns=["token", "freq", "dominant_pos"],
)

# POS distribution: absolute counts plus each tag's share of all counted tokens (in %).
# Naming the columns up front keeps the "count" column present for an empty counter,
# so the percentage step cannot raise KeyError; the int64 cast keeps the division numeric.
lda_pos_df = (
    pd.DataFrame(lda_pos_counts.most_common(), columns=["pos", "count"])
    .astype({"count": "int64"})
    .assign(percentage=lambda d: (d["count"] / d["count"].sum() * 100).round(2))
)

display(lda_top_df)
display(lda_pos_df)

Unnamed: 0,token,freq,dominant_pos
0,ausfallen,192,VERB
1,defekt,132,NOUN
2,leuchte,126,VERB
3,mast,92,ADV
4,brunnen,90,NOUN
5,lampe,89,NOUN
6,leuchten,64,ADJ
7,komplett,60,ADV
8,laterne,52,ADV
9,beleuchtung,50,NOUN


Unnamed: 0,pos,count,percentage
0,NOUN,2222,33.95
1,ADV,1345,20.55
2,VERB,1224,18.7
3,PROPN,901,13.77
4,ADJ,609,9.31
5,ADP,78,1.19
6,DET,69,1.05
7,X,42,0.64
8,AUX,18,0.28
9,PRON,13,0.2


#### Run POS evidence for BERTopic-cleaned texts

In [6]:
# Output:
# 1) Top-N frequent tokens + POS tag (context-aware)
# 2) POS distribution table (counts + %)

# One pass over the BERTopic-cleaned texts: token frequencies plus overall POS tag counts.
ber_token_counts, ber_pos_counts = extract_token_pos_counts(bertopic_texts, nlp, exclude_stop=True)

# Rank tokens by corpus frequency and look up the most frequent POS tag for each of them.
top_ber_tokens = [w for w, _ in ber_token_counts.most_common(TOP_N)]
ber_dom_pos = dominant_pos_per_token(bertopic_texts, nlp, top_ber_tokens, exclude_stop=True)

# Explicit column names keep the table schema stable even when no tokens were counted.
ber_top_df = pd.DataFrame(
    [{"token": w, "freq": ber_token_counts[w], "dominant_pos": ber_dom_pos.get(w, "NA")} for w in top_ber_tokens],
    columns=["token", "freq", "dominant_pos"],
)

# POS distribution: absolute counts plus each tag's share of all counted tokens (in %).
# Naming the columns up front keeps the "count" column present for an empty counter,
# so the percentage step cannot raise KeyError; the int64 cast keeps the division numeric.
ber_pos_df = (
    pd.DataFrame(ber_pos_counts.most_common(), columns=["pos", "count"])
    .astype({"count": "int64"})
    .assign(percentage=lambda d: (d["count"] / d["count"].sum() * 100).round(2))
)

display(ber_top_df)
display(ber_pos_df)

Unnamed: 0,token,freq,dominant_pos
0,ausgefallen,210,VERB
1,leuchte,124,VERB
2,defekt,123,NOUN
3,brunnen,88,NOUN
4,mast,83,NOUN
5,straße,75,NOUN
6,lampe,72,NOUN
7,bitte,63,ADV
8,leuchtet,56,VERB
9,str,54,X


Unnamed: 0,pos,count,percentage
0,NOUN,3084,41.38
1,VERB,1358,18.22
2,ADV,1099,14.75
3,ADJ,660,8.86
4,PROPN,594,7.97
5,NUM,458,6.15
6,X,63,0.85
7,ADP,52,0.7
8,DET,41,0.55
9,AUX,22,0.3


## Interpretation: Why noun-only filtering is not applied to the cleaning pipeline

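Although nouns form the largest share of tokens in both the LDA-cleaned and the BERTopic-cleaned texts (about 34% and 41%, respectively), the results show that **verbs and adverbs also play a substantial role** in describing the complaints: verbs account for roughly 18–19% of the counted tokens and adverbs for roughly 15–21%.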

Many of the most frequent and informative words are tagged as **verbs**, such as *ausfallen, fehlen, funktionieren, liegen*, or as **adverbs**, such as *komplett, kaputt, dunkel*. These terms describe **what happened**, **whether something is working**, and **how severe the issue is**. Removing them would eliminate essential meaning from the texts.

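While some POS tags are imperfect (for example, *mast* and *laterne* are tagged ADV in the LDA table, probably because the cleaned texts are lowercased and stripped of much of their sentence context, which removes cues the tagger relies on), the overall pattern is consistent across both text variants (LDA and BERTopic): **important semantic information is not limited to nouns**.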

For this reason, applying noun-only filtering during cleaning would discard relevant signal and likely harm topic quality. The current cleaning pipeline therefore deliberately retains multiple parts of speech to preserve the full meaning of citizen complaints.