In [3]:
import os
import re
import unicodedata
import pandas as pd
from collections import Counter
from rapidfuzz import fuzz, process
from rapidfuzz.distance import Levenshtein
from tqdm import tqdm

# ---------- CONFIG ----------
csv_file = "20250903_Extrait_Constatations_F2.csv"
comment_col = "Commentaire"
type_col = "Type de constatation Texte"

comment_min_occurrences = 7
threshold_similarity = 90
max_typo_chars = 4
fuzzy_match_threshold = 85
checkpoint_every = 50

# ---------- HELPERS ----------
def normalize_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s)
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\xa0", " ").strip()
    return s

def clean_text_block(text: str) -> str:
    if not text:
        return ""
    text = normalize_text(text).lower()
    text = re.sub(r"[^a-zà-öø-ÿœæçß0-9\s-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    words = [w for w in text.split() if len(w) > 1]
    return " ".join(words)

def join_variations_with_counts(variants, counter, delim=" | "):
    return delim.join([f"{v} ({counter.get(v,0)})" for v in variants])

# ---------- LOAD CSV ----------
df = pd.read_csv(csv_file, delimiter=";", dtype=str)
df.columns = [normalize_text(c) for c in df.columns]

if comment_col not in df.columns or type_col not in df.columns:
    raise KeyError(f"Expected columns '{comment_col}' and '{type_col}' not found. Available: {df.columns.tolist()}")

df[comment_col] = df[comment_col].astype(str)
df[type_col] = df[type_col].astype(str)

# ---------- EXTRACT COMMENTAIRE CLUSTERS ----------
print("\nExtracting full-text Commentaire clusters...")
cleaned_comments = [clean_text_block(t) for t in tqdm(df[comment_col])]
cluster_counter = Counter(cleaned_comments)
total_cluster_instances = sum(cluster_counter.values())
candidates = {k: v for k, v in cluster_counter.items() if v >= comment_min_occurrences}
print(f"Found {len(candidates)} unique Commentaire full-text candidates (≥{comment_min_occurrences} occurrences)")

# ---------- FUZZY CLUSTERING ----------
cluster_groups = {}
for cluster in tqdm(candidates.keys(), desc="Fuzzy clustering Commentaire (full text)"):
    found = False
    for rep in list(cluster_groups.keys()):
        sim = fuzz.ratio(cluster, rep)
        edit_d = Levenshtein.distance(cluster, rep)
        if sim >= threshold_similarity or edit_d <= max_typo_chars:
            cluster_groups[rep].append(cluster)
            found = True
            break
    if not found:
        cluster_groups[cluster] = [cluster]

# ---------- AGGREGATE COMMENTAIRE CLUSTERS ----------
comment_summary_rows = []
for rep, variants in cluster_groups.items():
    cnt = sum(cluster_counter.get(v, 0) for v in variants)
    per_mille = round(cnt / total_cluster_instances * 1000, 2) if total_cluster_instances > 0 else 0
    variations_text = join_variations_with_counts(variants, cluster_counter)
    comment_summary_rows.append({
        "Cluster_Representative": rep,
        "Count": cnt,
        "Variations": variations_text,
        "PerMille": per_mille,
    })

comment_summary_df = pd.DataFrame(comment_summary_rows).sort_values(by="Count", ascending=False)
comment_summary_df.to_csv("commentaire_clusters_summary_fulltext.csv", sep=";", index=False)
print("Saved 'commentaire_clusters_summary_fulltext.csv' with", len(comment_summary_df), "rows.")

# ---------- CORRELATE COMMENTAIRES ↔ TYPE ----------
print("\nCorrelating Type de constatation Texte to Commentaire clusters (vectorized)...")
comment_texts = [c.lower() for c in cleaned_comments]
type_values = df[type_col].tolist()
corr_rows = []

for i, (rep, variants) in enumerate(tqdm(cluster_groups.items(), desc="Vectorized correlation")):
    matches = process.cdist([rep], comment_texts, scorer=fuzz.partial_ratio)
    matched_indices = [j for j, score in enumerate(matches[0]) if score >= fuzzy_match_threshold]
    if not matched_indices:
        continue
    matched_types = [type_values[j] for j in matched_indices]
    type_counts = Counter(matched_types)
    total_type_count = sum(type_counts.values())
    for t_val, t_cnt in type_counts.items():
        t_per_mille = round(t_cnt / total_type_count * 1000, 2)
        corr_rows.append({
            "Commentaire_Cluster": rep,
            "Commentaire_Variations": join_variations_with_counts(variants, cluster_counter),
            "Commentaire_Count": sum(cluster_counter.get(v, 0) for v in variants),
            "Type_Value": t_val,
            "Type_Count": t_cnt,
            "Type_PerMille": t_per_mille,
        })

    if (i + 1) % checkpoint_every == 0:
        partial_df = pd.DataFrame(corr_rows)
        partial_df.to_csv(f"partial_corr_checkpoint_{i+1}_fulltext.csv", sep=";", index=False)
        print(f"Checkpoint {i+1}: saved {len(partial_df)} rows.")

corr_df = pd.DataFrame(corr_rows)
corr_df.to_csv("commentaire_type_correlations_fulltext.csv", sep=";", index=False)
print("Saved 'commentaire_type_correlations_fulltext.csv' with", len(corr_df), "rows.")



Extracting full-text Commentaire clusters...


100%|████████████████████████████████████████████████████████████████████████| 48227/48227 [00:00<00:00, 117880.37it/s]


Found 347 unique Commentaire full-text candidates (≥7 occurrences)


Fuzzy clustering Commentaire (full text): 100%|█████████████████████████████████████| 347/347 [00:01<00:00, 181.99it/s]


Saved 'commentaire_clusters_summary_fulltext.csv' with 204 rows.

Correlating Type de constatation Texte to Commentaire clusters (vectorized)...


Vectorized correlation:  25%|█████████████▉                                           | 50/204 [06:54<13:28,  5.25s/it]

Checkpoint 50: saved 1586 rows.


Vectorized correlation:  49%|███████████████████████████▍                            | 100/204 [13:41<09:28,  5.47s/it]

Checkpoint 100: saved 2772 rows.


Vectorized correlation:  74%|█████████████████████████████████████████▏              | 150/204 [21:57<09:35, 10.66s/it]

Checkpoint 150: saved 3856 rows.


Vectorized correlation:  98%|██████████████████████████████████████████████████████▉ | 200/204 [31:07<01:11, 17.91s/it]

Checkpoint 200: saved 4764 rows.


Vectorized correlation: 100%|████████████████████████████████████████████████████████| 204/204 [33:55<00:00,  9.98s/it]

Saved 'commentaire_type_correlations_fulltext.csv' with 4823 rows.



