# Clustering de documents

## Imports

In [None]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

In [None]:
import nltk

# Fetch every NLTK resource the notebook relies on *before* its first use:
# - 'punkt' / 'punkt_tab': tokenizer models behind `word_tokenize`
#   (recent NLTK releases look up 'punkt_tab' instead of 'punkt');
# - 'stopwords': corpus read by `stopwords.words('french')` when the
#   TF-IDF vectorizer is built.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Directory that holds the corpus: the .txt files listed and read below.
data_path = "data"

## Charger tous les fichiers et en créer une liste de textes

In [None]:
# Keep the corpus README out of the document list instead of deleting it with
# a shell `rm`: IPython only substitutes Python variables written as `$name`
# or `{name}`, so `!rm data_path"_README.txt"` targeted a file literally named
# "data_path_README.txt" and never touched the real one.
# NOTE(review): assumes the README's file name ends with "README.txt" — confirm.
files = [
    name
    for name in sorted(os.listdir(data_path))
    if name.endswith('.txt')
    and not name.startswith('.')
    and not name.endswith('README.txt')
]
len(files)


In [None]:
# Read every document into memory.
# `texts[i]` must stay paired with `files[i]`: the clustering step maps the
# i-th cluster label back to `files[i]`. A file that cannot be read is
# therefore dropped from BOTH lists instead of only being skipped in `texts`.
texts = []
loaded_files = []
for f in files:
    try:
        with open(os.path.join(data_path, f), "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Erreur : Le fichier {f} n'a pas été trouvé dans le chemin {data_path}")
    else:
        texts.append(content)
        loaded_files.append(f)

# Only the files that were actually loaded remain, in the same order as `texts`.
files = loaded_files

## Vectoriser les documents à l'aide de TF-IDF

In [None]:
# Pre-processing helper used as the TF-IDF tokenizer.
# Translation table mapping every ASCII punctuation character to a space;
# built once so `preprocessing` does not rebuild it for each document.
_PUNCTUATION_TABLE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocessing(text, stem=True):
    """Strip ASCII punctuation from *text* and split it into word tokens.

    Punctuation is replaced by spaces rather than deleted so that elided or
    hyphenated forms ("l'homme", "peut-être") break into separate words
    instead of fusing into a single token.

    Args:
        text: Raw document text.
        stem: Unused; kept so existing callers can keep passing it.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize``.
    """
    # `str.translate` needs a table built by `str.maketrans`. Passing
    # `string.punctuation` itself makes Python index that string by code
    # point: control characters get rewritten (newline becomes "+") and the
    # punctuation is left untouched.
    return word_tokenize(text.translate(_PUNCTUATION_TABLE))

### Instancier le modèle TF-IDF avec ses arguments

In [None]:
# TF-IDF vectorizer using the custom `preprocessing` tokenizer and NLTK's
# French stop-word list. With float values, `min_df`/`max_df` are proportions
# of documents: a term is kept only if it appears in at least 10% and at most
# 50% of them.
vectorizer = TfidfVectorizer(
    tokenizer=preprocessing,
    stop_words=stopwords.words('french'),
    max_df=0.5,
    min_df=0.1,
    lowercase=True)

### Construire la matrice de vecteurs à l'aide de la fonction `fit_transform`

In [None]:
# Fit the vectorizer on the corpus and get the document-term TF-IDF matrix
# (one row per entry of `texts`).
tfidf_vectors = vectorizer.fit_transform(texts)

In [None]:
# Details of the matrix (the notebook displays its repr)
tfidf_vectors

### Imprimer le vecteur TF-IDF du premier document

In [None]:
# TF-IDF weights of the first document, labelled with their terms and sorted
# from the highest weight down.
first_doc_weights = tfidf_vectors[0].toarray()[0]
vocabulary = vectorizer.get_feature_names_out()
pd.Series(first_doc_weights, index=vocabulary).sort_values(ascending=False)

## Comprendre les vecteurs et leurs "distances"

In [None]:
# scipy's `cosine` is a distance (1 - cosine similarity): identical vectors
# give 0, up to floating-point rounding.
cosine([1, 2, 3], [1, 2, 3])

In [None]:
# Vectors pointing in slightly different directions: small, non-zero distance.
cosine([1, 2, 3], [1, 2, 2])

In [None]:
# Directions differ more than in the previous example: larger distance.
cosine([1, 2, 3], [2, 2, 2])

### Tests sur nos documents

In [None]:
# Dense array version of the TF-IDF matrix, so rows can be passed to `cosine`.
tfidf_array = tfidf_vectors.toarray()

In [None]:
# Vector of document 0
tfidf_array[0]

In [None]:
# Vector of document 1
tfidf_array[1]

In [None]:
# Cosine distance between the TF-IDF vectors of documents 0 and 1.
cosine(tfidf_array[0], tfidf_array[1])

## Appliquer un algorithme de clustering sur les vecteurs TF-IDF des documents

Pour en savoir plus sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

### Définir un nombre de clusters

In [None]:
# Number of clusters K-Means is asked to produce.
N_CLUSTERS = 3

### Instancier le modèle K-Means et ses arguments

In [None]:
# Fix the random seed: K-Means starts from randomly chosen centroids, so
# without it the cluster assignments (and which group ends up numbered 0, 1
# or 2) can change from one run to the next, making any per-cluster analysis
# of "cluster 0" non-reproducible.
km_model = KMeans(n_clusters=N_CLUSTERS, random_state=42)

### Appliquer le clustering à l'aide de la fonction `fit_predict`

In [None]:
# Fit K-Means on the TF-IDF vectors and get one cluster label per document.
clusters = km_model.fit_predict(tfidf_vectors)

In [None]:
# Group the file names by the cluster label predicted for each of them.
clustering = collections.defaultdict(list)

for position, cluster_label in enumerate(clusters):
    assigned_file = files[position]
    clustering[cluster_label].append(assigned_file)

In [None]:
# Show which files ended up in each cluster.
pprint(dict(clustering))

## Visualiser les clusters

### Réduire les vecteurs à 2 dimensions à l'aide de l'algorithme PCA
Cette étape est nécessaire afin de visualiser les documents dans un espace 2D

https://fr.wikipedia.org/wiki/Analyse_en_composantes_principales

In [None]:
# Reduce the TF-IDF vectors to 2 components so they can be plotted.
# `.toarray()` converts the matrix to a dense array before it is given to PCA.
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(tfidf_vectors.toarray())

In [None]:
# Preview the first ten reduced (2-component) vectors.
reduced_vectors[:10]

### Générer le plot

In [None]:
# Coordinates of each document on the two PCA components.
x_axis = reduced_vectors[:, 0]
y_axis = reduced_vectors[:, 1]

plt.figure(figsize=(10,10))
# One point per document, coloured by its K-Means cluster label.
scatter = plt.scatter(x_axis, y_axis, s=100, c=clusters)

# Add the centroids, projected with the same PCA as the documents
centroids = pca.transform(km_model.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1],  marker = "x", s=100, linewidths = 2, color='black')

# Add the legend.
# `legend_elements()` returns matching (handles, labels); using both keeps
# every colour paired with its own cluster number. Labelling the handles with
# `set(clusters)` depended on set iteration order, which is not guaranteed to
# follow the order of the handles.
plt.legend(*scatter.legend_elements(), title="Clusters")

# Exploration des clusters

In [None]:
# One (initially empty) list of document texts per cluster label.
# Sized from N_CLUSTERS instead of a hard-coded {0, 1, 2}, so this cell keeps
# working when the number of clusters is changed.
cluster_contents = {label: [] for label in range(N_CLUSTERS)}

# Walk through each cluster and append the content of its files to the matching list
for cluster, filenames in clustering.items():
    for filename in filenames:
        try:
            with open(os.path.join(data_path, filename), "r", encoding="utf-8") as file:
                # int(): cluster labels come from the K-Means output array;
                # normalise them to the plain-int keys created above.
                cluster_contents[int(cluster)].append(file.read())
        except FileNotFoundError:
            print(f"Erreur : Le fichier {filename} n'a pas été trouvé dans le chemin {data_path}")

# cluster_contents[k] now holds the content of every document assigned to
# cluster k, for k in range(N_CLUSTERS).


### 1er cluster (cluster_0)

In [None]:
# Imports for the word-frequency / word-cloud exploration of a cluster.
from collections import Counter
from wordcloud import WordCloud
import os
import nltk
# Download the NLTK stop-word corpus read by `stopwords.words(...)`.
nltk.download('stopwords')
from nltk.corpus import stopwords
from IPython.display import Image

In [None]:
# Stop words (same list as in s1)
sw = stopwords.words("french")
# Extra words appended to NLTK's French list. `clean_text` compares the
# lowercased token against this collection.
sw += ["les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont", "tout",
       "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc", "cet", "sous",
       "celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non",
       "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", "depuis", "autres",
       "van", "het", "autre", "jusqu", "ville", "rossel", "dem", "sach", "heures", "toutes", "hier", "très"
       , "etc", "leurs", "grandes", "lés", "ille", "quo", "peu", "bon", "vers", "grand", "puis", "mois", "personne",
       "devant", "beau", "mén", "elles", "toujours", "déjà", "avenue", "quatre", "fort", "jours", "aujourd",
       "car", "hier", "toute", "grands", "app", "matin", "bne", "bas", "adresser", "haut", "dés",
       "place", "rue", "Bruxelles", "DCM", "ecr", "jour","maison", "ans", "chez", "réf", "prendre",
       "mod", "pers", "suite", "Mme", "voir", "cause", "louer", "porte", "belle", "fille",
       "adr", "quart", "maisons", "rué", "fit", "seule", "agence", "plusieurs", "bonnes", "ceux", "quelques",
       "faits", "jeune", "cent", "vient", "point", "portées", "avant", "serv", "com", "jard",
       "enfant", "désire", "cuis", "Bruxelles", "bruxelles", "grande", "petit", "bons", "dcm", "soir",
       "près", "bonne", "demande", "prix", "fois", "dés", "vendre", "jamais", "chaussée", "bonne", "franc", "lieu", "rien", "quartier",
       "pris", "fois", "tranq", "pos", "vente", "bonne", "neuve", "cuisine", "servante", "mat", "écrire",
       "juillet", "notaire", "dès", "quand", "temps", "pet", "celui", "donné", "partie", "homme", "petite", "cours",
       "mme","part", "coup", "demi", "pens", "neuf", "ruo", "cinq", "ici", "cond", "ferme", "coucher", "demandé",
       "ferm", "comm", "notaires", "août", "prop", "alors", "mardi", "beaucoup", "enfants", "ventes", "lundi", "jeudi",
       "donner", "nuit", "chamb", "chaque", "dimanche", "dos", "uno", "garn", "lés", "fam", "jolie", "demain", "année",
       "petite", "fam", "uno", "celui", "contenant", "premier", "assez", "nouvelles", "beaucoup", "cond", "con",
       "PET", "vendredi", "quand", "nouveau", "gros", "dix", "mieux", "journ", "chaque", "cert", "dém", "dos",
       "vend", "chamb", "alors", "seulement", "mars", "delà", "trop", "courant", "demain", "brux",
       "dernier", "hon", "bel", "seul", "frais", "petits", "dernière", "mis", "diverses", "référ",
       "jne", "tant", "franco", "francs","quelque", "nouvelle", "vieux", "placé", "garni",
       "mal", "sachant", "six", "nommé", "propre", "avril", "coud", "juin", "octobre", "enf",
       "mis", "divers", "sait", "servi", "flam", "septembre", "frais", "hom", "bel", "hon",
       "ouvrir", "enfin", "février", "hui", "dessus", "lès", "janvier", "scs", "partout", "façon",
       "laquelle", "ste", "samedi", "mans", "années", "ment", "mai", "voici", "céder", "surtout",
       "mercredi", "parmi", "franç", "font", "ancien", "cond", "mlle", "jeunes", "beaux", "semaine",
       "garnie", "reçu", "trouve", "derrière", "ruc", "rest", "conn", "lit", "haute", "parce", "fin",
       "aucune", "aussitôt", "ouvr", "aucune", "donne", "certains", "ord", "peuvent", "loin", "décembre",
       "autant", "possible", "première", "prochains", "faite", "voilà", "rendre", "maintenant", "ensuite",
       "los", "novembre", "presque", "ailleurs", "longtemps", "veut", "heure", "cependant", "écrit", "pourrait",
        "comment", "mettre", "abord", "oui", "mêmes", "nombreux", "également", "malgré" ]
# Convert to a set: drops the duplicated entries and is what the
# `not in sw` membership test in `clean_text` runs against.
sw = set(sw)

In [None]:
# Write the whole content of cluster 0 to a temporary file
temp_path = 'data/tmp/'
# Create the folder if needed; `exist_ok=True` replaces the separate
# exists-then-mkdir check.
os.makedirs(temp_path, exist_ok=True)
with open(os.path.join(temp_path, 'cluster_0.txt'), 'w', encoding='utf-8') as f:
    # The texts of cluster 0 live in `cluster_contents[0]`; the name
    # `cluster_0` is never defined and raised a NameError here.
    f.write(' '.join(cluster_contents[0]))

In [None]:
def clean_text(folder=None, basename="cluster_0"):
    """Filter the words of a cluster dump and write the result next to it.

    Reads ``<basename>.txt``, keeps only the purely alphabetic tokens longer
    than two characters whose lowercase form is not in the module-level
    stop-word collection ``sw``, upper-cases them and writes them,
    space-separated, to ``<basename>_clean.txt``.

    Args:
        folder: Directory holding the input file and receiving the output.
            ``None`` means the current working directory. A trailing slash
            is optional.
        basename: File-name stem shared by the input and output files.
            Defaults to ``"cluster_0"``.

    Returns:
        A message giving the path of the file that was written.
    """
    directory = "" if folder is None else folder
    # os.path.join builds both paths the same way, with or without a trailing
    # slash on `folder`.
    input_path = os.path.join(directory, f"{basename}.txt")
    output_path = os.path.join(directory, f"{basename}_clean.txt")

    # Read first, so a missing input does not leave an empty output file behind.
    with open(input_path, 'r', encoding='utf-8') as source:
        text = source.read()

    words = nltk.wordpunct_tokenize(text)
    kept = [w.upper() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]

    # The context manager closes the output file, which was previously left open.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [None]:
# Clean the cluster_0 text file stored in the temporary folder.
clean_text(folder=temp_path)

In [None]:
# Load the cleaned text back and preview its first 500 characters.
clean_file = os.path.join(temp_path, 'cluster_0_clean.txt')
with open(clean_file, mode='r', encoding='utf-8') as f:
    after = f.read()

after[:500]

In [None]:
# Count how often each word occurs and print the ten most frequent ones.
frequencies = Counter(after.split())
top_words = frequencies.most_common(10)
print(top_words)

In [None]:
# Build the word cloud from the word counts of the cleaned cluster text.
cloud = WordCloud(width=2000, height=1000, background_color='white').generate_from_frequencies(frequencies)

# Save under a real file name: "*.png" is not a valid file name on Windows
# and is easily mistaken for a glob pattern elsewhere.
cloud_path = os.path.join(temp_path, "cluster_0.png")
cloud.to_file(cloud_path)
Image(filename=cloud_path)