# Semantop


In [None]:
# Copyright 2025 Colin de Seroux alias Phenix333 (https://colindeseroux.fr)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## <span style="color:#10c0ff">Dependencies installation</span>


In [None]:
# Install the packages listed in the project's requirements file into the
# environment used by this notebook's kernel.
%pip install -r ../requirements.txt

## <span style="color:#10c0ff">Dependencies importation</span>


In [None]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
import nltk
from nltk.corpus import stopwords
import os
from os import path as osp
import random
import requests
import zipfile

## <span style="color:#10c0ff">Environment</span>


In [None]:

# Base name (without the ".bin" extension) of the pre-trained embedding file.
MODEL = "frWac_no_postag_no_phrase_700_skip_cut50"

# Zip archive that contains the "dela-fr-public.dic" dictionary.
DICO_DELA = "http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/dela-fr-public.zip?B1=T%E9l%E9charger"

# Base URL under which the adj/adv/noun/verb CSV files are fetched.
DICO_HBENBEL = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/refs/heads/master/dictionary"

# Hugging Face repository id used for "custom.dic" and the custom model.
HUGGINGFACE = "colindeseroux/semantop"

# Evaluated once, here: when the final word list already exists, every
# download / preprocessing cell guarded by this flag is skipped.
DICO_EXIST = osp.exists("../models/dico.dic")

## <span style="color:#10c0ff">Download models</span>


In [None]:
# Create the directory that receives every downloaded and generated file;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("../models", exist_ok=True)

### <span style="color:#10ff41">Download embedding model</span>


In [None]:
# Download the pre-trained embedding model unless it is already on disk.
model_path = f"../models/{MODEL}.bin"

if osp.exists(model_path):
    print(f"File {model_path} already exists.")
else:
    # Write to a temporary name and rename only on success: otherwise a failed
    # or interrupted download would leave an empty/truncated ".bin" that the
    # existence check above would accept on the next run.
    tmp_path = f"{model_path}.part"

    # stream=True avoids holding the whole file in memory.
    with requests.get(f"https://embeddings.net/embeddings/{MODEL}.bin", stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as the model.
        response.raise_for_status()

        with open(tmp_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    os.replace(tmp_path, model_path)

    print(f"File {model_path} downloaded successfully.")

### <span style="color:#10ff41">Download correct word files</span>


In [None]:
# Download the DELA archive and extract the dictionary file it contains.
if not DICO_EXIST:
    response = requests.get(DICO_DELA, timeout=60)
    # Stop here on an HTTP error: otherwise the error page would be saved as
    # "dico.zip" and only fail later with an unrelated-looking BadZipFile.
    response.raise_for_status()

    with open("../models/dico.zip", "wb") as f:
        f.write(response.content)

    # Copy the single archive member we need next to the other model files.
    with zipfile.ZipFile("../models/dico.zip", "r") as zip_ref:
        with zip_ref.open("dela-fr-public.dic") as source, open("../models/dela-dico.dic", "wb") as target:
            target.write(source.read())

    print("File ../models/dela-dico.dic downloaded successfully.")

In [None]:
# Base names of the CSV word lists; also reused by the preprocessing cell.
files = ["adj", "adv", "noun", "verb"]

if not DICO_EXIST:
    # Use a loop variable distinct from the file handle: the original reused
    # "f" for both, so the name was rebound to the handle inside the loop body.
    for file_name in files:
        response = requests.get(f"{DICO_HBENBEL}/{file_name}.csv", timeout=60)
        # Do not save an HTTP error page as a word list.
        response.raise_for_status()
        # The files are written and later read back as UTF-8, so decode them
        # as UTF-8 regardless of what the response headers advertise.
        # NOTE(review): assumes the upstream CSVs are UTF-8 encoded - confirm.
        response.encoding = "utf-8"

        with open(f"../models/{file_name}.csv", "w", encoding="utf-8") as f:
            f.write(response.text)

In [None]:
# Fetch the hand-maintained word list, unless the final dictionary or the
# list itself is already present locally.
# NOTE(review): a failure is only printed; the later cell that opens
# "../models/custom.dic" will then fail - confirm this is intended.
if not (DICO_EXIST or osp.exists("../models/custom.dic")):
    custom_dic_request = {
        "repo_id": HUGGINGFACE,
        "filename": "custom.dic",
        "local_dir": "../models",
        # NOTE(review): recent huggingface_hub releases deprecate this
        # argument - verify against the version pinned in requirements.txt.
        "local_dir_use_symlinks": False,
    }

    try:
        hf_hub_download(**custom_dic_request)
    except Exception as e:
        print(f"Error downloading custom.dic: {e}")

## <span style="color:#10c0ff">Preprocessing the dico</span>


### <span style="color:#10ff41">Load correct_words if `dico.dic` exists</span>


In [None]:
# Word list shared by all following cells; stays empty here when the
# dictionary still has to be built.
correct_words = []

if DICO_EXIST:
    print("File ../models/dico.dic already exists.")

    # One word per line: drop the newline of each line as it is read.
    with open("../models/dico.dic", "r", encoding="utf-8") as dico_file:
        for raw_line in dico_file:
            correct_words.append(raw_line.replace("\n", ""))

    print(f"Loaded {len(correct_words)} words from dico.dic")

### <span style="color:#10ff41">Preprocessing `dela-dico.dic` to keep only the base of words</span>


In [None]:
# Extract nouns, verbs and adjectives from the DELA dictionary file.
if not DICO_EXIST:
    nouns = []
    verbs = []
    adjectives = []

    with open("../models/dela-dico.dic", "r", encoding="utf-16") as f:
        for line in f:
            # Unescape hyphens written as "\-". str.replace returns a new
            # string, so the result must be assigned back (the original call
            # discarded it and left the escape in place).
            line = line.strip().replace("\\-", "-")

            # Only singular words or base forms
            if not (":ms" in line or ":fs" in line or line.endswith(":W")):
                continue

            # The word is the text before the first comma.
            word = line.split(",")[0]

            # Classify on the category tag; proper nouns are excluded.
            if ",.N" in line and "NPropre" not in line:
                nouns.append(word)
            elif ",.V" in line:
                verbs.append(word)
            elif ",.A" in line:
                adjectives.append(word)

    correct_words = nouns + verbs + adjectives

    print(f"Found {len(adjectives)} adjectives, {len(nouns)} nouns and {len(verbs)} verbs ")
    print(f"Total correct words in dico.dic: {len(correct_words)}")

### <span style="color:#10ff41">Preprocessing `adj.csv`, `adv.csv`, `noun.csv` and `verb.csv` to keep only the base of words</span>


In [None]:
# Extend correct_words with the entries kept from the four CSV word lists.
if not DICO_EXIST:
    INFINITIVE_SUFFIX = ",['infinitive']"

    # Number of entries kept per CSV file.
    words = {"adj": 0, "adv": 0, "noun": 0, "verb": 0}

    for f_n in files:
        with open(f"../models/{f_n}.csv", "r", encoding="utf-8") as f:
            stripped_lines = [raw_line.strip() for raw_line in f]

        if f_n == "verb":
            # Verbs: keep only lines tagged as infinitive, without the tag.
            kept = [
                entry.replace(INFINITIVE_SUFFIX, "")
                for entry in stripped_lines
                if entry.endswith(INFINITIVE_SUFFIX)
            ]
        else:
            # Other files: keep lines ending with a comma, commas removed.
            kept = [
                entry.replace(",", "")
                for entry in stripped_lines
                if entry.endswith(",")
            ]

        words[f_n] = len(kept)
        correct_words.extend(kept)

    csv_total = sum(words[key] for key in ("adj", "adv", "noun", "verb"))

    print(f"Found {words['adj']} adjectives, {words['adv']} adverbs, {words['noun']} nouns and {words['verb']} verbs")
    print(f"Total correct words in adj.csv + adv.csv + noun.csv + verb.csv: {csv_total}")
    print(f"Total correct words in dico.dic: {len(correct_words)}")

### <span style="color:#10ff41">Add custom words</span>


In [None]:
# Append every line of the custom word list (newline removed) to correct_words.
if not DICO_EXIST:
    with open("../models/custom.dic", "r", encoding="utf-8") as custom_file:
        for entry in custom_file:
            correct_words.append(entry.replace("\n", ""))

    print(f"Total correct words in dico.dic: {len(correct_words)}")

### <span style="color:#10ff41">Delete short words (not useful in my use case)</span>


In [None]:
if not DICO_EXIST:
    # A single set comprehension both drops words shorter than three
    # characters and removes duplicates; the result is turned back into a list.
    long_unique_words = {word for word in correct_words if len(word) >= 3}
    correct_words = list(long_unique_words)
    print(f"Total correct words in dico.dic: {len(correct_words)}")

## <span style="color:#10c0ff">Create the custom model</span>


### <span style="color:#10ff41">Load the model</span>


In [None]:
# Load the downloaded embedding file (binary word2vec format). Characters that
# cannot be decoded in the vocabulary are ignored rather than raising.
model = KeyedVectors.load_word2vec_format(f"../models/{MODEL}.bin", binary=True, unicode_errors="ignore")

### <span style="color:#10ff41">Download / load french stopwords from nltk</span>


In [None]:
# Only needed while building the dictionary.
if not DICO_EXIST:
    # Fetch the NLTK stopword corpus, then keep the French list as a set so
    # the membership tests in the next cell are fast.
    nltk.download("stopwords")
    french_stopwords = set(stopwords.words("french"))

### <span style="color:#10ff41">Delete stopwords</span>


In [None]:
if not DICO_EXIST:
    # Walk the model vocabulary in order and skip every French stopword.
    filtered_keys_without_stopwords = []

    for key in model.index_to_key:
        if key in french_stopwords:
            continue

        filtered_keys_without_stopwords.append(key)

    # "été" is explicitly added back at the end of the list.
    filtered_keys_without_stopwords.append("été")
    print(f"Filtered keys without stopwords: {len(filtered_keys_without_stopwords)}")

### <span style="color:#10ff41">Keep only correct words if they exist in the model</span>


In [None]:
# Keep only the vocabulary entries that are also dictionary words.
if not DICO_EXIST:
    # Build the lookup set once: testing membership against the list made the
    # filter quadratic in the vocabulary and dictionary sizes.
    correct_word_set = set(correct_words)

    # Iterating the model keys (not the dictionary) preserves the model's
    # key order in the result.
    correct_filtered_keys = [word for word in filtered_keys_without_stopwords if word in correct_word_set]
    print(f"Correct filtered keys: {len(correct_filtered_keys)}")

    correct_words = correct_filtered_keys

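Save the word list to `dico.dic`, just in case you re-launch the creation of a new model without paying attention: when this file exists, the download and preprocessing cells above are skipped.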


In [None]:
# Persist the final word list, one word per line.
if not DICO_EXIST:
    with open("../models/dico.dic", "w", encoding="utf-8") as dico_file:
        dico_file.writelines(f"{word}\n" for word in correct_filtered_keys)

### <span style="color:#10ff41">Save the model</span>


In [None]:
# Build and save the reduced model only when it is not already on disk.
custom_model_path = f"../models/{MODEL}_custom.bin"

if not osp.exists(custom_model_path):
    # New, empty model with the same dimensionality as the source model.
    custom_model = KeyedVectors(vector_size=model.vector_size)

    # Copy the vector of each retained word from the source model.
    retained_vectors = [model[word] for word in correct_words]
    custom_model.add_vectors(correct_words, retained_vectors)

    custom_model.save_word2vec_format(custom_model_path, binary=True)

## <span style="color:#10c0ff">Test the custom model</span>


You can download the custom model directly from [HuggingFace](https://huggingface.co/datasets/colindeseroux/semantop).


In [None]:
# Fall back to downloading the prebuilt custom model when it was not created
# locally. A failure is reported but does not stop the notebook.
custom_model_file = f"{MODEL}_custom.bin"

if not osp.exists(f"../models/{custom_model_file}"):
    custom_model_request = {
        "repo_id": HUGGINGFACE,
        "filename": custom_model_file,
        "local_dir": "../models",
        # NOTE(review): recent huggingface_hub releases deprecate this
        # argument - verify against the version pinned in requirements.txt.
        "local_dir_use_symlinks": False,
    }

    try:
        hf_hub_download(**custom_model_request)
    except Exception as e:
        print(f"Error downloading {MODEL}_custom.bin: {e}")

### <span style="color:#10ff41">Load the model</span>


In [None]:
# Reload the custom model from disk (binary word2vec format) so the tests
# below run against the saved or downloaded file.
custom_model = KeyedVectors.load_word2vec_format(f"../models/{MODEL}_custom.bin", binary=True, unicode_errors="ignore")

### <span style="color:#10ff41">Get random word</span>


In [None]:
# Draw one word uniformly from a copy of the custom model's vocabulary.
vocabulary = list(custom_model.index_to_key)
random_word = random.choice(vocabulary)
print(f"Random word from the model: {random_word}")

### <span style="color:#10ff41">Get similar words</span>


In [None]:
# Ask the model for the ten entries most similar to the random word.
similar_words = custom_model.similar_by_word(random_word, topn=10)

print("Most similar words:")

# Each result is a (word, similarity) pair.
for neighbour, neighbour_similarity in similar_words:
    print(f"{neighbour}: {neighbour_similarity:.4f}")

### <span style="color:#10ff41">Test word</span>


In [None]:
# Similarity between the random word and the fixed word "chat", as computed
# by the custom model.
similarity = custom_model.similarity(random_word, "chat")
print(f"Similarity between '{random_word}' and 'chat': {similarity:.4f}")

### <span style="color:#10ff41">Bot</span>


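Test the model in reverse with an assert: given target similarity values for a few reference words, find the word whose similarities are closest to those targets and check that it is the expected one.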


In [None]:
# Target similarity per reference word; a different pair of values is used
# when the model name contains "cbow".
TARGETS = (
    {
        "chat": -0.0195,
        "minou": 0.0167,
    }
    if "cbow" in MODEL
    else {
        "chat": 0.0374,
        "minou": 0.0350,
    }
)


def distance_scores(word: str, vectors: list, targets: list, model: KeyedVectors) -> float:
    """
    Measure how far a word's similarities are from a set of expected values.

    Each reference word is paired with its target value; the squared difference
    between ``model.similarity(word, reference)`` and the target is accumulated.

    :param word: Candidate word to score.
    :type word: str
    :param vectors: Reference words the candidate is compared with.
    :type vectors: list
    :param targets: Expected similarity for each reference word, in the same order.
    :type targets: list
    :param model: Model providing ``in`` membership and ``similarity``.
    :type model: KeyedVectors

    :return: Sum of the squared differences, or ``float("inf")`` when ``word``
        is not in ``model``.
    :rtype: float
    """

    # A word unknown to the model gets the worst possible score.
    if word not in model:
        return float("inf")

    squared_errors = [
        (model.similarity(word, reference) - expected) ** 2
        for reference, expected in zip(vectors, targets)
    ]

    # Accumulate from 0.0, in order, so an empty reference list yields 0.0.
    total = 0.0

    for squared_error in squared_errors:
        total += squared_error

    return total

# Reference words and their target values, kept in matching order.
ref_words = list(TARGETS)
target_values = [TARGETS[ref_word] for ref_word in ref_words]

# Score every vocabulary entry except the reference words themselves.
scores = [
    (candidate, distance_scores(candidate, ref_words, target_values, custom_model))
    for candidate in custom_model.index_to_key
    if candidate not in ref_words
]

# Smallest cumulative error first.
scores.sort(key=lambda pair: pair[1])

assert scores[0][0] == "colin" # If not True, the model is not working as expected

print("Words closest to the combination :")

for candidate, error in scores[:10]:
    print(f"{candidate} (cumulative squared error: {error:.4f})")