# 🎭 Pun Translator (Scalable: Lexique3 + ConceptNet + WordNet)

**Verbose, reproducible pun translation without a hand-written pun dictionary.**

This version removes the hard-coded `known_puns`, `ipa_map`, and `relatedness` tables and replaces them with:

- **WordNet (English)** to propose candidate pun words + two distinct senses automatically
- **Lexique 3 (French)** to find homophones by shared phonological form (offline, fast)
- **ConceptNet (free API)** to score semantic relatedness between the homophone and the target meaning

⚠️ One-time setup: download **Lexique 3** from lexique.org and point the notebook to the TSV/CSV file path.


In [None]:
# Notebook shell magic: quietly install the third-party packages imported below.
!pip install -q deep-translator requests pandas nltk
print('‚úÖ Installed dependencies')

In [None]:
'''
Summary: imports + core classes + translator engine

Loads the libraries, ensures WordNet is available, and defines the main translation components:
an English→French translator with caching, a Lexique-based French homophone index, a
ConceptNet-based semantic similarity scorer, and a verbose “polygon” solver that tries
progressively smarter ways to preserve a pun in French.
'''

# Imports + WordNet bootstrap / dependencies

from deep_translator import GoogleTranslator
from typing import List, Optional, Tuple, Dict
from dataclasses import dataclass
import time
import re
import os
import json
import difflib

import pandas as pd
import requests
import nltk
from nltk.corpus import wordnet as wn

# --- NLTK data (WordNet) ---
# Free and only needs to be downloaded once per runtime.
# Probe WordNet with a throwaway lookup. A LookupError is treated as
# "corpus not downloaded yet": fetch the data, then probe again so a
# failed download surfaces here instead of deep inside the pipeline.
try:
    _ = wn.synsets("bank")
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")  # companion resource fetched alongside the base corpus
    _ = wn.synsets("bank")

In [None]:
# Data models (dataclasses) - structured return objects

@dataclass
class TranslationCandidate:
    """A successful pun translation found by one of the polygon attempts."""
    pun_word: str  # French word proposed to carry the pun
    polygon_level: int  # number of hops in the path that found it (4 = square ... 8 = octagon)
    path: List[str]  # ordered chain of words linking meaning 1 to meaning 2
    explanation: str  # short human-readable description of the strategy used
    confidence: float  # score derived from the semantic similarity of the final hop

@dataclass
class FallbackTranslation:
    """Result returned when no polygon attempt produced a pun."""
    strategy: str  # name of the fallback strategy that was applied
    translation: str  # the fallback French text
    explanation: str  # human-readable reason the fallback was used

In [None]:
# Translation wrapper - cached EN‚ÜíFR translation.

class RealBilingualDict:
    """Cached word translator built on deep-translator's GoogleTranslator.

    Each distinct word (after lower-casing and trimming) is sent to the
    service at most once per successful lookup; later requests are served
    from an in-memory cache.
    """

    def __init__(self, source_lang='en', target_lang='fr'):
        self.translator = GoogleTranslator(source=source_lang, target=target_lang)
        # normalised source word -> one-element list holding its translation
        self.cache: Dict[str, List[str]] = {}

    def translate(self, word: str) -> List[str]:
        """Return a one-element list with the translation of ``word``.

        Blank input, and any failure of the translation service, yield the
        original ``word`` unchanged (also wrapped in a list). Failures are
        not cached, so a later call will try the service again.
        """
        key = word.lower().strip()
        if key == "":
            return [word]

        try:
            return self.cache[key]
        except KeyError:
            pass  # first time we see this word: ask the service below

        try:
            french = self.translator.translate(key)
            self.cache[key] = [french]
            time.sleep(0.05)  # be polite to the service
        except Exception:
            # Best effort by design: degrade to echoing the input.
            return [word]
        return [french]

In [None]:
# Lexique homophone index - builds the offline phonetic lookup.

class LexiquePhoneticIndex:
    """
    Offline homophone lookup for French using Lexique 3.

    Provide the path to a Lexique 3 file (TSV/CSV). We build:
      phon_form -> [orthographic forms]
      orthographic form -> phon_form

    Lexique columns vary slightly by file; we auto-detect likely columns.

    Attributes:
        ortho_col, phon_col: names of the detected spelling / phonetic columns.
        phon_to_words: phonetic form -> distinct lower-cased spellings, in
            first-seen file order.
        word_to_phon: lower-cased spelling -> phonetic form. When a spelling
            is listed with several pronunciations, the last row wins.
    """

    def __init__(self, lexique_path: str, encoding: str = "utf-8"):
        """Load the Lexique table and build both lookup dictionaries.

        Args:
            lexique_path: path to a tab- or comma-separated Lexique export.
            encoding: text encoding of the file.

        Raises:
            FileNotFoundError: if ``lexique_path`` does not exist.
            ValueError: if no phonetic column can be identified.
        """
        if not os.path.exists(lexique_path):
            raise FileNotFoundError(
                f"Lexique file not found: {lexique_path}\n"
                "Download Lexique 3 from lexique.org and update the path."
            )

        df = self._read_table(lexique_path, encoding)

        # Case-insensitive column lookup that still returns the real header.
        cols = {str(c).lower(): c for c in df.columns}
        ortho_col = (
            cols.get("ortho") or cols.get("orth") or cols.get("word")
            or cols.get("lemme") or list(df.columns)[0]
        )
        phon_col = (
            cols.get("phon") or cols.get("phonology") or cols.get("phon_ortho")
            or cols.get("ipa") or cols.get("phono")
        )
        if phon_col is None:
            raise ValueError(
                "Could not find a phonetic column in the Lexique file.\n"
                "Expected a column like 'phon' (common in Lexique 3).\n"
                f"Columns present: {list(df.columns)[:50]}"
            )

        self.ortho_col = ortho_col
        self.phon_col = phon_col

        self.phon_to_words: Dict[str, List[str]] = {}
        self.word_to_phon: Dict[str, str] = {}

        # Dict keys double as an insertion-ordered set, so duplicates are
        # dropped while the first-seen order of spellings is preserved.
        grouped: Dict[str, Dict[str, None]] = {}
        rows = df[[ortho_col, phon_col]].dropna()
        # itertuples is far cheaper than iterrows for a table of this size.
        for raw_word, raw_phon in rows.itertuples(index=False, name=None):
            w = str(raw_word).strip().lower()
            p = str(raw_phon).strip()
            if not w or not p:
                continue
            self.word_to_phon[w] = p
            grouped.setdefault(p, {})[w] = None

        self.phon_to_words = {p: list(words) for p, words in grouped.items()}

    @staticmethod
    def _read_table(path: str, encoding: str):
        """Read the file as TSV, falling back to CSV when that yields no columns to split."""
        options = dict(
            encoding=encoding,
            dtype=str,  # we only need text; avoids mixed-dtype inference
            # Keep cells such as "nan" / "null" / "NA" as literal text instead
            # of letting pandas turn them into missing values.
            keep_default_na=False,
        )
        try:
            df = pd.read_csv(path, sep="\t", **options)
        except ValueError:  # includes pandas ParserError / EmptyDataError
            df = None
        # A comma-separated file parses "successfully" as TSV but collapses
        # into a single column, so the column count is the real signal.
        if df is None or df.shape[1] < 2:
            df = pd.read_csv(path, **options)
        return df

    def find_homophones(self, french_word: str, limit: int = 30) -> List[str]:
        """Return up to ``limit`` other spellings that share ``french_word``'s phonetic form.

        Unknown words yield an empty list; the word itself is never included.
        """
        w = french_word.lower().strip()
        p = self.word_to_phon.get(w)
        if not p:
            return []
        cands = [x for x in self.phon_to_words.get(p, []) if x != w]
        return cands[:limit]

In [None]:
# ConceptNet semantics - does semantic relatedness via a free API + disk cache.

class ConceptNetSemantic:
    """
    Free semantic relatedness scoring using ConceptNet's public API.

    We approximate similarity by:
    - getting a weighted related-term list for word1,
    - seeing whether word2 appears in that list (and vice versa),
    - using a symmetric score.

    Successful lookups are cached in memory and persisted to ``cache_path``
    as JSON. Failed lookups are only remembered for the current session so a
    transient network problem is never written to disk as "no relations".
    """

    def __init__(self, lang: str = "fr", cache_path: str = "conceptnet_cache.json"):
        self.lang = lang
        self.cache_path = cache_path
        # term -> {related term: weight}; mirrors the on-disk JSON cache
        self._cache: Dict[str, Dict[str, float]] = {}
        # terms whose lookup failed during this session (never persisted)
        self._failed = set()
        self._load_cache()

    def _load_cache(self):
        """Load the JSON cache from disk; unreadable or malformed files are ignored."""
        if not os.path.exists(self.cache_path):
            return
        try:
            with open(self.cache_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            loaded = None
        # Only a JSON object is a usable cache; anything else would break lookups.
        self._cache = loaded if isinstance(loaded, dict) else {}

    def _save_cache(self):
        """Persist the cache. Best effort: a failed write must not abort scoring."""
        try:
            with open(self.cache_path, "w", encoding="utf-8") as f:
                json.dump(self._cache, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError):
            pass

    @staticmethod
    def _parse_related(data, lang: str) -> Dict[str, float]:
        """Extract ``{term: weight}`` for ``lang`` from a ``/related`` API payload.

        Entries in other languages or without a usable ``@id`` are skipped.
        Only the term segment of the URI is kept (any trailing ``/pos``
        suffix is dropped) and underscores become spaces.
        """
        if not isinstance(data, dict):
            return {}
        prefix = f"/c/{lang}/"
        rels: Dict[str, float] = {}
        for item in data.get("related", []):
            if not isinstance(item, dict):
                continue
            cid = item.get("@id", "")
            if not isinstance(cid, str) or not cid.startswith(prefix):
                continue
            term = cid[len(prefix):].split("/", 1)[0].replace("_", " ").lower()
            if not term:
                continue
            weight = float(item.get("weight", 0.0))
            # Several URIs can collapse onto one term; keep the strongest link.
            rels[term] = max(weight, rels.get(term, weight))
        return rels

    def _related_map(self, word: str, limit: int = 50) -> Dict[str, float]:
        """Return ``{related term: weight}`` for ``word`` (cached).

        Returns an empty dict for blank input or when the lookup fails.
        """
        w = word.lower().strip()
        if not w:
            return {}
        if w in self._cache:
            return self._cache[w]
        if w in self._failed:
            # Already failed this session; don't pay the timeout again.
            return {}

        from urllib.parse import quote

        # ConceptNet URIs join multi-word terms with underscores; quoting
        # keeps characters like '/', '?' or '#' from corrupting the URL.
        node = quote(w.replace(" ", "_"), safe="")
        url = (
            f"https://api.conceptnet.io/related/c/{self.lang}/{node}"
            f"?filter=/c/{self.lang}&limit={limit}"
        )

        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            rels = self._parse_related(r.json(), self.lang)
        except (requests.RequestException, ValueError, TypeError):
            self._failed.add(w)
            return {}

        self._cache[w] = rels
        self._save_cache()
        time.sleep(0.05)  # be polite to the public API
        return rels

    def semantic_similarity(self, word1: str, word2: str) -> float:
        """Return a symmetric relatedness score in ``[0.0, 1.0]``.

        Identical words score 1.0; blank input scores 0.0. Otherwise the
        score is the larger of the two directed ConceptNet weights.
        """
        w1 = word1.lower().strip()
        w2 = word2.lower().strip()
        if not w1 or not w2:
            return 0.0
        if w1 == w2:
            return 1.0

        m1 = self._related_map(w1)
        m2 = self._related_map(w2)

        s = max(m1.get(w2, 0.0), m2.get(w1, 0.0))
        # ConceptNet's /related weights are already similarity scores capped
        # at 1.0, so they are only clamped, not rescaled.
        return max(0.0, min(1.0, s))

In [None]:
# Polygon solver - composes the three engines into square/pentagon/hexagon attempts.

class VerboseLowPolygonalTranslator:
    """VERBOSE pun translator with scalable phonetics + semantics.

    Composes three engines -- a cached EN->FR word translator, a Lexique
    homophone index and a ConceptNet relatedness scorer -- and tries
    progressively longer "polygon" paths (4 to 8 hops) between two English
    meanings, printing every step as it goes.
    """

    # Display name for each supported polygon level.
    POLYGON_NAMES = {4: "SQUARE", 5: "PENTAGON", 6: "HEXAGON", 7: "HEPTAGON", 8: "OCTAGON"}

    # Minimum ConceptNet relatedness weight for a term to be used as a
    # "synonym" hop. ConceptNet's /related weights are similarity scores that
    # top out at 1.0, so the thresholds live on that 0-1 scale (thresholds
    # above 1 can never be met and silently disable these levels).
    HEPTAGON_MIN_WEIGHT = 0.3
    OCTAGON_MIN_WEIGHT = 0.4

    def __init__(self, lexique_path: str):
        self.bilingual_dict = RealBilingualDict('en', 'fr')
        self.phonetic_dict = LexiquePhoneticIndex(lexique_path)
        self.semantic_dict = ConceptNetSemantic(lang="fr")
        self.MIN_SEMANTIC_SIM = 0.25  # ConceptNet-normalized scale

    def translate_pun_verbose(self, meaning1: str, meaning2: str, max_polygon: int = 8):
        """Try each polygon level in turn and stop at the first success.

        Args:
            meaning1, meaning2: the two English meanings the pun plays on.
            max_polygon: highest polygon level to attempt (capped at 8).

        Returns:
            ``(candidate, None)`` on success, or ``(None, fallback)`` with a
            literal translation of ``meaning1`` when every level fails.
        """
        attempts = {
            4: self._attempt_square_verbose,
            5: self._attempt_pentagon_verbose,
            6: self._attempt_hexagon_verbose,
            7: self._attempt_heptagon_verbose,
            8: self._attempt_octagon_verbose,
        }
        last_level = min(max_polygon, max(attempts))

        print(f"\n{'‚ñ¨'*70}")
        print("üîç POLYGON TRANSLATION ATTEMPTS")
        print(f"{'‚ñ¨'*70}")
        print(f"   Meanings: '{meaning1}' ‚Üî '{meaning2}'")
        print(f"   Will try polygons 4 through {last_level}")
        print(f"{'‚ñ¨'*70}\n")

        for level in range(4, last_level + 1):
            polygon_name = self.POLYGON_NAMES[level]

            print(f"\nüî∏ Attempting {polygon_name} ({level}-gon)...")
            print(f"   {'‚îÄ'*66}")

            result = attempts[level](meaning1, meaning2)

            if result:
                print(f"\n   ‚úÖ SUCCESS at {polygon_name}!")
                print(f"   {'‚îÄ'*66}\n")
                return result, None

            print(f"   ‚ùå {polygon_name} failed - no solution found")
            print(f"   {'‚îÄ'*66}")

        print(f"\n{'='*70}")
        # Report the range that was actually attempted, not a fixed "4-8".
        print(f"‚ö†Ô∏è  ALL POLYGONS FAILED (4-{last_level})")
        print(f"{'='*70}")
        print("   Using fallback: LITERAL TRANSLATION")
        print(f"{'='*70}\n")

        t1 = self.bilingual_dict.translate(meaning1)[0]
        fallback = FallbackTranslation(
            strategy="Literal Translation",
            translation=f"{t1}",
            explanation=f"No pun solution found. Translated '{meaning1}' literally to '{t1}'"
        )
        return None, fallback

    def _first_semantic_match(self, candidates, targets, log: bool = False):
        """Return the first ``(candidate, target, sim)`` reaching MIN_SEMANTIC_SIM, else None.

        Candidates are scanned in order, each against every target in order.
        With ``log=True`` every similarity that is computed is printed.
        """
        for cand in candidates:
            for target in targets:
                sim = self.semantic_dict.semantic_similarity(cand, target)
                if log:
                    print(f"   Similarity('{cand}', '{target}'): {sim:.2f}")
                if sim >= self.MIN_SEMANTIC_SIM:
                    return cand, target, sim
        return None

    def _related_terms(self, word: str, min_weight: float, limit: int) -> List[str]:
        """Return up to ``limit`` ConceptNet-related terms whose weight exceeds ``min_weight``."""
        # NOTE: relies on the scorer's internal related-term map; the scorer
        # exposes no public accessor for the raw weights.
        related = self.semantic_dict._related_map(word)
        return [term for term, weight in related.items() if weight > min_weight][:limit]

    def _attempt_square_verbose(self, m1: str, m2: str):
        """Square (4-gon): succeed when both meanings translate to the same French word."""
        print("   Method: Direct translation of both meanings")
        t1s = self.bilingual_dict.translate(m1)
        t2s = self.bilingual_dict.translate(m2)
        print(f"   '{m1}' ‚Üí {t1s}")
        print(f"   '{m2}' ‚Üí {t2s}")

        for t1 in t1s:
            for t2 in t2s:
                if t1 == t2:
                    print(f"   Found: '{t1}' = '{t2}' (same word!)")
                    return TranslationCandidate(
                        pun_word=t1, polygon_level=4,
                        path=[m1, t1, t2, m2],
                        explanation="Square: direct translation match",
                        confidence=1.0
                    )
        print("   No direct match found")
        return None

    def _attempt_pentagon_verbose(self, m1: str, m2: str):
        """Pentagon (5-gon): a homophone of m1's translation that is related to m2's translation."""
        print("   Method: Translate ‚Üí homophone (Lexique) ‚Üí semantic check (ConceptNet)")
        t1s = self.bilingual_dict.translate(m1)
        t2s = self.bilingual_dict.translate(m2)
        print(f"   '{m1}' ‚Üí {t1s}")
        print(f"   '{m2}' ‚Üí {t2s}")

        for t1 in t1s:
            homophones = self.phonetic_dict.find_homophones(t1)
            print(f"   Homophones of '{t1}' (Lexique): {homophones[:12]}{'...' if len(homophones) > 12 else ''}")

            match = self._first_semantic_match(homophones, t2s, log=True)
            if match:
                homophone, t2, sim = match
                print(f"   ‚úì Semantic match! {sim:.2f} ‚â• {self.MIN_SEMANTIC_SIM}")
                return TranslationCandidate(
                    pun_word=homophone, polygon_level=5,
                    path=[m1, t1, homophone, t2, m2],
                    explanation="Pentagon: translation ‚Üí homophone ‚Üí semantic match",
                    confidence=sim
                )

        print("   No homophone passed semantic threshold")
        return None

    def _attempt_hexagon_verbose(self, m1: str, m2: str):
        """Hexagon (6-gon): like the pentagon, but starting from English synonyms of m1."""
        print("   Method: Translate ‚Üí synonym (WordNet EN) ‚Üí translate ‚Üí homophone (Lexique) ‚Üí semantic check (ConceptNet)")
        # Step 1: translate target meaning (B) once
        t2s = self.bilingual_dict.translate(m2)
        print(f"   Target meaning '{m2}' ‚Üí {t2s}")

        # Step 2: generate English synonym candidates for m1 (plus m1 itself)
        base = m1.lower().strip()
        syns = set()
        for ss in wn.synsets(base):
            for lem in ss.lemmas():
                name = lem.name().replace("_", " ").lower().strip()
                # keep simple single-token synonyms to reduce drift
                if name.isalpha() and 3 <= len(name) <= 20:
                    syns.add(name)
        syn_list = [base] + sorted(syns - {base})
        syn_list = syn_list[:12]  # keep bounded & fast
        print(f"   Synonym candidates for '{m1}': {syn_list}")

        # Step 3: for each synonym, translate to French, then search homophones and validate semantically
        for syn_en in syn_list:
            fr_syns = self.bilingual_dict.translate(syn_en)
            print(f"   '{syn_en}' ‚Üí {fr_syns}")

            for fr in fr_syns:
                homophones = self.phonetic_dict.find_homophones(fr)
                if not homophones:
                    print(f"   Homophones of '{fr}' (Lexique): []")
                    continue
                preview = homophones[:12]
                print(f"   Homophones of '{fr}' (Lexique): {preview}{'...' if len(homophones) > 12 else ''}")

                match = self._first_semantic_match(homophones, t2s, log=True)
                if match:
                    homophone, t2, sim = match
                    print(f"   ‚úì Semantic match! {sim:.2f} ‚â• {self.MIN_SEMANTIC_SIM}")
                    return TranslationCandidate(
                        pun_word=homophone, polygon_level=6,
                        path=[m1, syn_en, fr, homophone, t2, m2],
                        explanation="Hexagon: synonym ‚Üí translation ‚Üí homophone ‚Üí semantic match",
                        confidence=sim
                    )

        print("   No synonym/homophone passed semantic threshold")
        return None

    def _attempt_heptagon_verbose(self, m1: str, m2: str):
        """
        Heptagon (7-gon): 3 semantic/phonetic leaps
        Path: m1 -> t1 -> FR_synonym -> homophone -> FR_synonym -> t2 -> m2

        Confidence is the final similarity scaled by 0.8.
        """
        print("   Method: Translate ‚Üí FR synonym ‚Üí homophone ‚Üí FR synonym ‚Üí check")

        t1s = self.bilingual_dict.translate(m1)
        t2s = self.bilingual_dict.translate(m2)
        print(f"   '{m1}' ‚Üí {t1s}")
        print(f"   '{m2}' ‚Üí {t2s}")

        for t1 in t1s[:3]:
            fr_syns1 = self._related_terms(t1, self.HEPTAGON_MIN_WEIGHT, 10)

            if fr_syns1:
                print(f"   FR synonyms of '{t1}': {fr_syns1[:5]}...")

            for fr_syn in fr_syns1:
                homos = self.phonetic_dict.find_homophones(fr_syn)

                if homos:
                    print(f"   Homophones of '{fr_syn}': {homos[:5]}...")

                for homo in homos[:10]:
                    fr_syns_h = self._related_terms(homo, self.HEPTAGON_MIN_WEIGHT, 10)

                    match = self._first_semantic_match(fr_syns_h, t2s)
                    if match:
                        syn_h, t2, sim = match
                        print(f"   ‚úì 7-gon match! {sim:.2f} ‚â• {self.MIN_SEMANTIC_SIM}")
                        return TranslationCandidate(
                            pun_word=homo,
                            polygon_level=7,
                            path=[m1, t1, fr_syn, homo, syn_h, t2, m2],
                            explanation="Heptagon: 3 leaps (synonym‚Üíhomophone‚Üísynonym)",
                            confidence=sim * 0.8
                        )

        print("   No 7-gon path found")
        return None

    def _attempt_octagon_verbose(self, m1: str, m2: str):
        """
        Octagon (8-gon): 4 semantic/phonetic leaps
        Path: m1 -> t1 -> syn1 -> homo1 -> syn2 -> homo2 -> t2 -> m2

        The search is deliberately narrow (few candidates per hop) and the
        confidence is the final similarity scaled by 0.6.
        """
        print("   Method: Translate ‚Üí syn ‚Üí homo ‚Üí syn ‚Üí homo ‚Üí check")
        print("   ‚ö†Ô∏è  8-gon paths are very creative (potentially tenuous)")

        t1s = self.bilingual_dict.translate(m1)
        t2s = self.bilingual_dict.translate(m2)
        print(f"   '{m1}' ‚Üí {t1s}")
        print(f"   '{m2}' ‚Üí {t2s}")

        # Very limited search
        for t1 in t1s[:2]:
            syns1 = self._related_terms(t1, self.OCTAGON_MIN_WEIGHT, 5)

            if syns1:
                print(f"   Synonyms of '{t1}': {syns1[:3]}...")

            for syn1 in syns1:
                homos1 = self.phonetic_dict.find_homophones(syn1)[:5]

                for homo1 in homos1:
                    syns2 = self._related_terms(homo1, self.OCTAGON_MIN_WEIGHT, 5)

                    for syn2 in syns2:
                        homos2 = self.phonetic_dict.find_homophones(syn2)[:5]

                        match = self._first_semantic_match(homos2, t2s)
                        if match:
                            homo2, t2, sim = match
                            print(f"   ‚úì 8-gon match! {sim:.2f} ‚â• {self.MIN_SEMANTIC_SIM}")
                            return TranslationCandidate(
                                pun_word=homo2,
                                polygon_level=8,
                                path=[m1, t1, syn1, homo1, syn2, homo2, t2, m2],
                                explanation="Octagon: 4 leaps (very creative path)",
                                confidence=sim * 0.6
                            )

        print("   No 8-gon path found")
        return None

    def translate_sentence(self, sentence: str, pun_word_original: str, pun_word_french: str) -> str:
        """Translate ``sentence`` word by word, substituting the French pun word.

        Each whitespace-separated token is lower-cased and stripped of
        punctuation before lookup. Tokens equal to ``pun_word_original``
        become ``pun_word_french``; tokens that are nothing but punctuation
        are kept as they are.
        """
        target = pun_word_original.lower()
        french_words = []

        for word in sentence.split():
            clean_word = re.sub(r"[^\w']", "", word.lower())
            if not clean_word:
                # Nothing to translate (e.g. a lone dash): keep the token
                # rather than emitting an empty string and a double space.
                french_words.append(word)
            elif clean_word == target:
                french_words.append(pun_word_french)
            else:
                translations = self.bilingual_dict.translate(clean_word)
                french_words.append(translations[0] if translations else word)

        return " ".join(french_words)

# Cell-level confirmation that the classes above were defined without error.
print("‚úÖ Scalable translator loaded (Lexique + ConceptNet + WordNet)")

In [None]:
'''
Summary: AutoPunDetector

This cell defines the pun-word detector that tokenizes the sentence, filters stopwords, then
either flags an out-of-vocabulary “made-up” token as the pun (and guesses what it’s blending),
or falls back to WordNet to pick a real word with two far-apart senses, boosted if it repeats
in the sentence.
'''

class AutoPunDetector:
    """Automatically proposes a pun word + two distinct meanings.

    Strategy:
    1) **Made-up / OOV word handling (portmanteau detector)**:
       - If a token has *no* WordNet synsets, treat it as a high-probability pun candidate.
       - Try to infer two meanings by:
         a) finding a close real-word match (string similarity over WordNet lemmas),
         b) finding an in-word substring that is itself a real word (e.g., 'impasta' contains 'pasta').

       This fixes cases like: "A fake noodle is an impasta" -> pun word should be "impasta".

    2) **Surface repetition** remains a strong signal (e.g. "bank ... bank").

    3) For normal in-vocab words, uses WordNet sense distance to propose two distinct sense labels.

    Contractions and quoted words are normalised during tokenisation so that
    ordinary tokens such as "don't" or "'bank'" are not mistaken for made-up
    words merely because WordNet has no entry for that exact string.
    """

    def __init__(self):
        self.stop = {
            "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
            "is","are","was","were","be","been","being","it","this","that","these","those",
            "i","you","he","she","we","they","me","him","her","us","them","my","your","his","their",
            "as","by","from","not","no","so"
        }

        # Build a lightweight WordNet lemma vocabulary for fuzzy matching.
        # (One-time per runtime; cached on the detector instance.)
        # Only single-token, purely alphabetic lemmas of 3+ letters are kept.
        lemmas = set()
        for name in wn.all_lemma_names():
            n = name.lower()
            if "_" in n or len(n) < 3 or not n.isalpha():
                continue
            lemmas.add(n)
        self._wn_vocab = sorted(lemmas)  # ordered sequence for difflib
        self._wn_vocab_set = frozenset(lemmas)  # O(1) membership tests

    def _tokenize(self, sentence: str) -> List[str]:
        """Split into lower-case word tokens; apostrophes are kept only *inside* a word."""
        text = sentence.lower().replace("\u2019", "'")  # typographic apostrophe
        return re.findall(r"[a-z]+(?:'[a-z]+)*", text)

    def _clean_tokens(self, sentence: str) -> List[str]:
        """Tokenise and drop stopwords and contractions.

        A trailing ``'s`` is stripped so possessives map to their base word
        ("dog's" -> "dog"); any other apostrophe form ("don't", "we're") is
        discarded as function-word noise.
        """
        kept = []
        for tok in self._tokenize(sentence):
            if "'" in tok:
                stem, _, clitic = tok.rpartition("'")
                if clitic != "s" or "'" in stem:
                    continue
                tok = stem
            if tok and tok not in self.stop:
                kept.append(tok)
        return kept

    def _is_oov(self, word: str) -> bool:
        """True when WordNet has no synset at all for ``word``."""
        return not wn.synsets(word)

    def _best_close_match(self, word: str) -> Optional[str]:
        """Return the closest WordNet lemma by string similarity, or None below the cutoff."""
        # difflib is fast enough for our lemma list sizes at n=1.
        matches = difflib.get_close_matches(word.lower(), self._wn_vocab, n=1, cutoff=0.78)
        return matches[0] if matches else None

    def _best_subword(self, word: str) -> Optional[str]:
        """Return the longest substring (>= 4 chars) that is itself a WordNet word.

        Ties on length go to the leftmost substring. Returns None when no
        substring qualifies.
        """
        w = word.lower()
        # Longest first, so the first hit is the answer and shorter
        # substrings never trigger a WordNet lookup.
        for size in range(len(w), 3, -1):
            for start in range(len(w) - size + 1):
                sub = w[start:start + size]
                if sub in self._wn_vocab_set or wn.synsets(sub):
                    return sub
        return None

    def _best_synset_pair(self, synsets):
        """Return ``(synset_a, synset_b, distance)`` for the most distant connected pair, or None.

        Pairs with no path between them (distance ``None``) are ignored; on
        equal distance the first pair encountered is kept.
        """
        best = None
        best_d = -1
        for i, first in enumerate(synsets):
            for second in synsets[i + 1:]:
                d = first.shortest_path_distance(second)
                if d is not None and d > best_d:
                    best_d = d
                    best = (first, second, d)
        return best

    def _sense_label(self, synset, surface: str) -> str:
        """Pick a readable label for a synset that's *not* just the surface word.

        Falls back to the first six words of the synset definition when every
        lemma equals the surface form.
        """
        surface = surface.lower().replace(" ", "_")
        for lem in synset.lemmas():
            name = lem.name().lower()
            if name != surface:
                return name.replace("_", " ")
        return " ".join(synset.definition().split()[:6])

    def _detect_oov_pun(self, tokens: List[str], counts: Dict[str, int]) -> Optional[Tuple[str, str, str, str]]:
        """Score out-of-vocabulary tokens as pun candidates and return the best.

        Args:
            tokens: cleaned tokens of the sentence (not used by the scoring).
            counts: token -> number of occurrences.

        Returns:
            ``(word, meaning_a, meaning_b, explanation)`` or None when every
            token is known to WordNet.
        """
        best = None  # (score, word, m1, m2, explanation)

        # Most frequent first, then alphabetical, so ties resolve deterministically.
        for w, cnt in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])):
            if not self._is_oov(w):
                continue

            close = self._best_close_match(w)
            sub = self._best_subword(w)

            # Build two meanings if possible.
            meanings = []
            if close:
                meanings.append(close)
            if sub and sub != close:
                meanings.append(sub)

            # Suspicion score: OOV is a huge signal, then repetition, then evidence of portmanteau structure.
            score = 50.0
            score += 8.0 * max(0, cnt - 1)  # repetition still matters
            score += min(6.0, max(0, len(w) - 5)) * 0.8  # longer weird words are more suspicious
            if close:
                score += 10.0
            if sub:
                score += 10.0
            if len(meanings) < 2:
                score -= 12.0  # only one (or no) recoverable meaning: weaker evidence

            # Prepare output meanings; with a single meaning both slots share it.
            m1 = meanings[0] if meanings else w
            m2 = meanings[1] if len(meanings) > 1 else (close or sub or w)

            explanation = (
                f"Auto-detected '{w}' as a likely pun word because it has no WordNet senses (OOV).\n"
                f"Evidence: close real-word match ‚Üí {close!r}; in-word real-word substring ‚Üí {sub!r}; count {cnt}.\n"
                f"Using meanings: A='{m1}', B='{m2}'."
            )

            if best is None or score > best[0]:
                best = (score, w, m1, m2, explanation)

        if not best:
            return None
        _, w, m1, m2, explanation = best
        return (w, m1, m2, explanation)

    def detect(self, sentence: str) -> Optional[Tuple[str, str, str, str]]:
        """Propose the pun word of ``sentence`` and the two meanings it plays on.

        Returns:
            ``(word, meaning_a, meaning_b, explanation)``, or None when the
            sentence has no usable tokens or no candidate is found.
        """
        tokens = self._clean_tokens(sentence)
        if not tokens:
            return None

        counts: Dict[str, int] = {}
        for t in tokens:
            counts[t] = counts.get(t, 0) + 1

        # 1) Prefer made-up / OOV pun candidates (portmanteau-style).
        oov = self._detect_oov_pun(tokens, counts)
        if oov:
            return oov

        # 2) Otherwise, fall back to WordNet polysemy + repetition scoring.
        best = None  # (score, repeat_count, dist, word, meaning1, meaning2, explanation)

        for w, cnt in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])):
            synsets = wn.synsets(w)
            if len(synsets) < 2:
                continue

            # Only the first ten senses are compared, bounding the pairwise search.
            pair = self._best_synset_pair(synsets[:10])
            if not pair:
                continue

            s1, s2, dist = pair

            m1 = self._sense_label(s1, w)
            m2 = self._sense_label(s2, w)
            if m1 == m2:
                # Identical labels are useless downstream; use the synset head words instead.
                m1 = s1.name().split(".")[0].replace("_", " ")
                m2 = s2.name().split(".")[0].replace("_", " ")

            repetition_bonus = 15 * max(0, cnt - 1)
            polysemy_bonus = min(3.0, 0.25 * len(synsets))
            score = dist + repetition_bonus + polysemy_bonus

            explanation = (
                f"Auto-detected '{w}' as a possible pun word (score {score:.2f}; "
                f"distance {dist}; count {cnt}).\n"
                f"Sense A: {s1.name()} ‚Äî {s1.definition()}\n"
                f"Sense B: {s2.name()} ‚Äî {s2.definition()}"
            )

            cand = (score, cnt, dist, w, m1, m2, explanation)
            # Compare on (score, count, distance) only; later fields are payload.
            if (best is None) or (cand[:3] > best[:3]):
                best = cand

        if not best:
            return None

        _, _, _, w, m1, m2, explanation = best
        return (w, m1, m2, explanation)

# Cell-level confirmation that the detector class was defined without error.
print("‚úÖ AutoPunDetector loaded (OOV + WordNet-based)")


In [None]:
'''
Summary: translate_pun_complete orchestrator

This cell is the end-to-end pipeline: it prints the input, calls the auto pun detector, runs 
the polygon translator on the two meanings if a pun is found, and otherwise falls back to a 
literal word-by-word translation.
'''

def translate_pun_complete(sentence: str, lexique_path: str, show_details: bool = True):
    """Translate a sentence, attempting an automatically detected pun first.

    Pipeline: detect a pun word and its two meanings, run the polygon solver
    on those meanings, and rebuild the sentence around the French pun word.
    When no pun is detected, or no polygon succeeds, the sentence is
    translated literally word by word.

    Args:
        sentence: English input sentence.
        lexique_path: path to the Lexique 3 file used for homophone lookup.
        show_details: when False, the step-by-step traces (detector
            explanation, polygon attempts, word-by-word listing) are
            suppressed and only the summary banners are printed.

    Returns:
        The French sentence as a single string.
    """
    import contextlib
    import io

    def detail_output():
        # A fresh context manager per use: pass-through when details are
        # wanted, otherwise swallow whatever the wrapped code prints.
        if show_details:
            return contextlib.nullcontext()
        return contextlib.redirect_stdout(io.StringIO())

    def literal_translation(word_translator, trace: bool) -> str:
        # Word-by-word translation of the whole sentence.
        french_words = []
        for word in sentence.split():
            clean = re.sub(r"[^\w']", "", word.lower())
            if not clean:
                # Punctuation-only token: keep it instead of emitting "".
                french_words.append(word)
                continue
            trans = word_translator.translate(clean)[0]
            french_words.append(trans)
            if trace:
                print(f"   {clean} ‚Üí {trans}")
        return " ".join(french_words)

    print(f"\n{'='*70}")
    print("üìù ENGLISH INPUT")
    print(f"{'='*70}")
    print(f"   {sentence}")
    print(f"{'='*70}")

    detector = AutoPunDetector()
    result = detector.detect(sentence)

    if not result:
        print(f"\n{'='*70}")
        print("‚ö†Ô∏è  NO PUN CANDIDATE DETECTED - DOING LITERAL TRANSLATION")
        print(f"{'='*70}")
        if show_details:
            print("\nWord-by-word translation:")
        return literal_translation(RealBilingualDict('en', 'fr'), trace=show_details)

    pun_word, meaning1, meaning2, explain = result

    print(f"\n{'='*70}")
    print("üéØ PUN DETECTED (AUTO)")
    print(f"{'='*70}")
    print(f"   Pun word: {pun_word}")
    print(f"   Meaning A: {meaning1}")
    print(f"   Meaning B: {meaning2}")
    if show_details:
        print(f"\n   Details:\n{explain}")
    print(f"{'='*70}")

    translator = VerboseLowPolygonalTranslator(lexique_path=lexique_path)
    with detail_output():
        candidate, fallback = translator.translate_pun_verbose(meaning1, meaning2)

    if candidate:
        print(f"\n{'='*70}")
        print("‚úÖ FINAL PUN TRANSLATION")
        print(f"{'='*70}")
        print(f"   French pun word: {candidate.pun_word}")
        print(f"   Confidence: {candidate.confidence:.2f}")
        print(f"   Path: {' ‚Üí '.join(candidate.path)}")
        print(f"   Explanation: {candidate.explanation}")
        print(f"{'='*70}")

        french_sentence = translator.translate_sentence(sentence, pun_word, candidate.pun_word)
        print(f"\nüìå Full sentence: {french_sentence}")
        return french_sentence

    print(f"\n{'='*70}")
    print("‚ö†Ô∏è  FALLBACK (NO PUN FOUND)")
    print(f"{'='*70}")
    print(f"   Strategy: {fallback.strategy}")
    print(f"   Explanation: {fallback.explanation}")
    print(f"{'='*70}")

    # Reuse the solver's translator so words it already looked up come from
    # its cache instead of triggering new requests.
    return literal_translation(translator.bilingual_dict, trace=False)

## 🚀 Demo

1) Download Lexique 3 from lexique.org (free).
2) Upload the TSV/CSV into your environment.
3) Set `LEXIQUE_PATH` below to the uploaded file path.


In [None]:
'''
Summary: example run #1

This cell sets the Lexique file path and runs the full pipeline on example #1
'''


# --- Set this to your local Lexique 3 file path (TSV/CSV) ---
LEXIQUE_PATH = "/content/Lexique383.tsv"  # <-- update this path

# Run the end-to-end pipeline on the first example sentence.
translate_pun_complete("Time flies like an arrow.", lexique_path=LEXIQUE_PATH)

In [None]:
'''
Summary: example run #2

This cell reuses the Lexique file path set in the previous cell and runs the full pipeline on example #2
'''

# Repeated-word example; reads LEXIQUE_PATH, which this cell does not define itself.
translate_pun_complete("I went to the bank to watch the river bank.", lexique_path=LEXIQUE_PATH)

## 🎯 Interactive Form

Set `LEXIQUE_PATH` once, then try different sentences.


In [None]:
'''
Summary: Colab-style UI form

This cell creates a simple form input so you can paste any sentence and run 
the same pipeline interactively with optional detailed logs.

If you want, I can also give you one sentence that explains the “polygon”
idea in plain English (because that’s usually what people ask about first).
'''


# @title üé≠ Enter Your Pun (Scalable) { display-mode: "form" }

# Form fields: the trailing "# @param" markers are what the notebook UI
# reads to render each assignment as an editable input.
LEXIQUE_PATH = "/content/Lexique383.tsv"  # @param {type:"string"}
pun_sentence = "I went to the bank to watch the river bank." # @param {type:"string"}
show_detailed_output = True # @param {type:"boolean"}

# Run the full pipeline on the form values; the translated sentence is kept in `result`.
result = translate_pun_complete(pun_sentence, lexique_path=LEXIQUE_PATH, show_details=show_detailed_output)