In [2]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from nltk import word_tokenize
from nltk.corpus import stopwords


from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Fetch the NLTK resources used by the preprocessing cell: the stop-word
# lists and the Punkt sentence/word tokenizer data. Recent NLTK releases
# (3.9+) make word_tokenize load "punkt_tab" instead of the pickled "punkt"
# models, so both are requested; on an NLTK version that does not know one
# of them, nltk.download just reports it and returns False.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# Seed every random number generator the notebook relies on.
SEED = 42
random.seed(SEED)
# NOTE: hash randomisation is decided at interpreter start-up, so this only
# affects child processes launched from here, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Toy corpus of eight raw English sentences about kings, queens, men and
# women. It is the input to the preprocessing, clustering and word-vector
# cells that follow.
corpus = [
    "The king rules the kingdom with strength.",
    "The king is a powerful and courageous man.",
    "The queen leads the palace with wisdom.",
    "The queen is an elegant and intelligent woman.",
    "The man works hard and stands by his ideas.",
    "The woman brings gentleness and perseverance.",
    "In the past, kings and queens were respected.",
    "The king loves war, but peace reigns thanks to the queen."
]

def clean_text(text):
    """Lower-case ``text`` and replace punctuation with single spaces.

    Every character of ``string.punctuation`` becomes a space, runs of
    whitespace are collapsed to one space, and leading/trailing whitespace
    is removed so that a sentence ending in "." does not keep a dangling
    space.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; empty if ``text`` held only punctuation
        and/or whitespace.
    """
    # re.escape keeps "]", "\", "^" and "-" literal inside the character
    # class instead of relying on the order they appear in string.punctuation.
    punctuation_class = "[{}]".format(re.escape(string.punctuation))
    text = re.sub(punctuation_class, " ", text.lower())
    return re.sub(r"\s+", " ", text).strip()

def tokenize(text):
    """Break ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Args:
        tokens: Iterable of token strings; they are compared as-is against
            NLTK's English stop-word list.

    Returns:
        A new list with the non-stop-word tokens in their original order.
    """
    # Load the list once per call and use a set: the original re-read the
    # stop-word list for every single token and scanned it linearly.
    english_stopwords = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stopwords]
    
def preprocess(text):
    """Run one document through the whole preprocessing pipeline.

    The text is passed through ``clean_text``, ``tokenize`` and
    ``remove_stopwords`` in that order, and the remaining tokens are joined
    back into a single space-separated string.
    """
    kept_tokens = remove_stopwords(tokenize(clean_text(text)))
    return " ".join(kept_tokens)

# Replace each raw sentence with its preprocessed form; `corpus` is rebound
# to the result and then displayed as the cell output.
corpus = list(map(preprocess, corpus))

corpus


['king rules kingdom strength',
 'king powerful courageous man',
 'queen leads palace wisdom',
 'queen elegant intelligent woman',
 'man works hard stands ideas',
 'woman brings gentleness perseverance',
 'past kings queens respected',
 'king loves war peace reigns thanks queen']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TextEmbeddingKMeans:
    """TF-IDF vectorizer and MiniBatchKMeans bundled behind one interface.

    Documents are converted to TF-IDF vectors by ``self.vectorizer`` and the
    vectors are clustered by ``self.kmeans``. The clusterer is seeded with
    the notebook-level ``SEED``.
    """

    def __init__(self, n_clusters=8, max_iter=1000, n_init=100, tol=1e-3):
        """Store the hyper-parameters and build the (unfitted) estimators.

        Args:
            n_clusters: Forwarded to ``MiniBatchKMeans(n_clusters=...)``.
            max_iter: Forwarded to ``MiniBatchKMeans(max_iter=...)``.
            n_init: Forwarded to ``MiniBatchKMeans(n_init=...)``.
            tol: Forwarded to ``MiniBatchKMeans(tol=...)``.
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.tol = tol
        self.vectorizer = TfidfVectorizer()
        self.kmeans = MiniBatchKMeans(
            n_clusters=self.n_clusters,
            max_iter=self.max_iter,
            n_init=self.n_init,
            tol=self.tol,
            random_state=SEED
        )

    def _vectorize_and_assign(self, corpus):
        """Return ``(X, labels)`` for ``corpus`` using the fitted estimators.

        ``X`` is the TF-IDF matrix of ``corpus`` and ``labels`` holds the
        cluster predicted for each of its rows, so both always have one
        entry per document.
        """
        X = self.vectorizer.transform(corpus)
        return X, self.kmeans.predict(X)

    def fit(self, corpus):
        """Fit the vectorizer on ``corpus``, then k-means on its vectors.

        Returns:
            ``self``, so calls can be chained.
        """
        X = self.vectorizer.fit_transform(corpus)
        self.kmeans.fit(X)
        return self

    def predict(self, corpus):
        """Return the predicted cluster index of every document in ``corpus``."""
        X = self.vectorizer.transform(corpus)
        return self.kmeans.predict(X)

    def fit_predict(self, corpus):
        """Fit on ``corpus`` and return the cluster index of each document."""
        self.fit(corpus)
        return self.predict(corpus)

    def get_cluster_centers(self):
        """Return ``cluster_centers_`` of the fitted k-means estimator."""
        return self.kmeans.cluster_centers_

    def get_labels(self):
        """Return ``labels_`` of the fitted k-means estimator."""
        return self.kmeans.labels_

    def get_inertia(self):
        """Return ``inertia_`` of the fitted k-means estimator."""
        return self.kmeans.inertia_

    def get_silhouette_score(self, corpus):
        """Return ``silhouette_score`` of ``corpus`` under the fitted model.

        Labels are predicted for the corpus that is passed in rather than
        taken from ``kmeans.labels_``: the stored labels describe the
        training documents only, so pairing them with the vectors of a
        different corpus would be wrong or fail on a length mismatch.
        """
        X, labels = self._vectorize_and_assign(corpus)
        return silhouette_score(X, labels)

    def get_silhouette_samples(self, corpus):
        """Return ``silhouette_samples`` for every document in ``corpus``.

        As in ``get_silhouette_score``, labels are predicted for the given
        corpus so that vectors and labels always line up.
        """
        X, labels = self._vectorize_and_assign(corpus)
        return silhouette_samples(X, labels)

# Smoke-test the class on the corpus: cluster into three groups, then report
# the assignments and the silhouette score of that clustering.
text_embedding_kmeans = TextEmbeddingKMeans(n_clusters=3)
labels = text_embedding_kmeans.fit_predict(corpus)
print("Cluster labels:", labels)

silhouette = text_embedding_kmeans.get_silhouette_score(corpus)
print("Silhouette Score:", silhouette)

Cluster labels: [2 2 1 1 2 1 0 1]
Silhouette Score: 0.03905656006634724


In [7]:
# Reuse the TF-IDF vectorizer that fit_predict already fitted on `corpus`.
# It is deliberately NOT refitted here: it belongs to the clustering model,
# and refitting it would silently mutate that model's state.
embedding_model = text_embedding_kmeans.vectorizer

# Each word is transformed as a one-document corpus, giving a (1, n_terms)
# row vector per word.
embedding_king, embedding_queen, embedding_man, embedding_woman = (
    embedding_model.transform([word]).toarray()
    for word in ("king", "queen", "man", "woman")
)

# Word-analogy check: does king - man + woman land on queen?
embedding_analogy = embedding_king - embedding_man + embedding_woman

print("Embedding pour 'reine algébrique':", embedding_analogy)
print("Embedding pour 'reine':", embedding_queen)
# Element-wise comparison with a floating-point tolerance instead of exact ==.
print(np.isclose(embedding_analogy, embedding_queen))


Embedding pour 'reine algébrique': [[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]]
Embedding pour 'reine': [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[ True  True  True  True  True  True  True False  True  True  True  True
  False  True  True  True  True  True False  True  True  True  True  True
   True  True  True  True False  True]]
