# Text mining

## Appuie pour faire tourner les fonctions

### Config

Les modules à installer se trouvent dans requirements.txt

Import and download

In [None]:
import json
import os
import re
import csv
import string
import nltk 
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

Créer les répertoires de sortie

In [None]:
def creer_repertoires():
    """Create the output directories used by the rest of the notebook.

    Directories that already exist are left untouched. A confirmation
    message is printed once all of them are in place.
    """
    for chemin in ('output', 'output/clusters', 'cluster_links', 'outputgml', 'outputLA'):
        os.makedirs(chemin, exist_ok=True)
    print("Répertoires crées.")

# Create the output directories as soon as this cell runs.
creer_repertoires()

Configuration

In [2]:
# Tokens discarded during preprocessing: NLTK English stop words plus every
# character of string.punctuation.
STOP_WORDS = set(stopwords.words('english')) | set(string.punctuation)
# Shared English Snowball stemmer used by the preprocessing functions.
STEMMER = nltk.stem.SnowballStemmer('english')
# Module-level NLTK sentiment analyzer instance.
SIA = SentimentIntensityAnalyzer()

### Fonctions utilitaires

Chargement du fichier de contenu

In [3]:
def load_data(content_path, links_path):
    """Load the content and links JSON files.

    Args:
        content_path: Path of the JSON file holding the page contents.
        links_path: Path of the JSON file holding the page links.

    Returns:
        A ``(content, links)`` tuple with the decoded JSON of each file.

    Raises:
        FileNotFoundError: If either path does not exist. The content file
            is checked and read before the links file is looked at.
    """
    loaded = []
    for path in (content_path, links_path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Le fichier '{path}' est introuvable.")
        with open(path, 'r', encoding='utf-8') as handle:
            loaded.append(json.load(handle))

    content, links = loaded
    return content, links

Sauvegarde des données

In [4]:
def save_to_file(data, filename):
    """Write ``data`` under the ``output`` directory, picking the format by type.

    * ``numpy.ndarray`` -> ``output/<filename>.txt`` (4 decimals per value)
    * ``dict``          -> ``output/<filename>.json`` (UTF-8, indented)
    * ``list``          -> ``output/<filename>.txt`` (one item per line)

    Args:
        data: The object to save.
        filename: Base name of the file, without extension (the extension is
            appended here).

    Raises:
        TypeError: If ``data`` is none of the supported types. Previously such
            data was silently dropped.
    """
    # Make the function usable even when the directory cell was not run.
    os.makedirs('output', exist_ok=True)

    if isinstance(data, np.ndarray):
        np.savetxt(f'output/{filename}.txt', data, fmt='%.4f')
    elif isinstance(data, dict):
        with open(f'output/{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
    elif isinstance(data, list):
        with open(f'output/{filename}.txt', 'w', encoding='utf-8') as f:
            # Format each item so that non-string entries do not raise.
            f.writelines(f'{line}\n' for line in data)
    else:
        raise TypeError(
            f"save_to_file does not support data of type {type(data).__name__}"
        )

JSON TO GML

In [5]:
def json_to_gml(links_path, output_file):
    """Convert a JSON adjacency mapping into a directed GML graph file.

    The JSON file must contain an object mapping each source label to an
    iterable of target labels. Every label becomes a node and every
    (source, target) pair becomes a directed edge.

    Node ids are assigned in first-seen order, so the same input always
    produces the same file (ids previously depended on set iteration order).
    Double quotes inside labels are written as ``&quot;`` so they cannot
    terminate the GML string early.

    Args:
        links_path: Path of the JSON adjacency file.
        output_file: Path of the GML file to write.
    """
    with open(links_path, "r", encoding="utf-8") as file:
        adjacency = json.load(file)

    node_ids = {}
    edges = []
    for source, targets in adjacency.items():
        node_ids.setdefault(source, len(node_ids))
        for target in targets:
            node_ids.setdefault(target, len(node_ids))
            edges.append((source, target))

    lines = ["graph\n", "[\n", "  directed 1\n"]
    # Nodes
    for label, node_id in node_ids.items():
        safe_label = str(label).replace('"', '&quot;')
        lines.extend((
            "  node\n",
            "  [\n",
            f"    id {node_id}\n",
            f"    label \"{safe_label}\"\n",
            "  ]\n",
        ))
    # Edges
    for source, target in edges:
        lines.extend((
            "  edge\n",
            "  [\n",
            f"    source {node_ids[source]}\n",
            f"    target {node_ids[target]}\n",
            "  ]\n",
        ))
    lines.append("]\n")

    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(lines)
    print(f"Fichier GML créé : {output_file}")

Traitement du contenu texte

In [6]:
def preprocess_text(text):
    """Normalise ``text`` and return its stemmed tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, and the result is tokenised. Tokens found in
    ``STOP_WORDS`` or no longer than two characters are dropped; the rest are
    stemmed with ``STEMMER``.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    kept = []
    for token in word_tokenize(cleaned):
        if len(token) <= 2 or token in STOP_WORDS:
            continue
        kept.append(STEMMER.stem(token))
    return kept

Sauvegarder tous les tokens

In [7]:
def clean_and_save_tokens(content, output_file):
    """Preprocess every page and save the tokens per page title.

    Args:
        content: Mapping of page title to page text.
        output_file: Base file name handed to ``save_to_file``.
    """
    tokens_by_page = {
        title: preprocess_text(body) for title, body in content.items()
    }
    save_to_file(tokens_by_page, output_file)

Traitement de texte spécifique à SIA, pour éviter de supprimer les "not" et autres

In [8]:
def preprocess_SIA(text):
    """Tokenise ``text`` for sentiment analysis without stop-word removal.

    The text is lower-cased and stripped of everything except ``a-z`` and
    whitespace, then tokenised. No stemming or filtering is applied.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return word_tokenize(letters_only)

Liste des tokens

In [9]:
def tokenize_corpus(corpus):
    """Return one flat list with the preprocessed tokens of every document.

    Documents are processed in the iteration order of ``corpus.values()``.
    """
    return [
        token
        for document in corpus.values()
        for token in preprocess_text(document)
    ]

Entraînement des modèles TF-IDF et Doc2Vec

In [10]:
# Train the Doc2Vec model
def train_doc2vec_model(documents, vector_size=100, window=1, epochs=20):
    """Train a Doc2Vec model on an iterable of raw text documents.

    Each document is lower-cased, tokenised, and tagged with its position in
    ``documents`` as a string (``'0'``, ``'1'``, ...).

    Args:
        documents: Iterable of document strings. Pass the texts themselves;
            iterating a dict would yield its keys.
        vector_size: Dimensionality of the document vectors.
        window: Context window size. It is now forwarded to Doc2Vec; it used
            to be ignored in favour of a hard-coded value of 2.
        epochs: Number of training epochs.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
        for i, doc in enumerate(documents)
    ]
    model = Doc2Vec(
        tagged_data,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        epochs=epochs,
    )
    return model

# Train the TF-IDF model
def train_tfidf_model(documents):
    """Fit a TF-IDF vectoriser on ``documents`` and return the matrix.

    English stop words are removed by the vectoriser. Only the transformed
    matrix is returned; the fitted vectoriser is discarded.
    """
    return TfidfVectorizer(stop_words='english').fit_transform(documents)

### Différents modèles de text mining

Trouver les phrases comprenant un ou des mots

In [11]:
def find_sentences_with_words(corpus, words):
    """Return the sentences that contain every one of the given words.

    Matching is done on stems: both the requested words and the sentence
    tokens are lower-cased and stemmed with ``STEMMER``.

    Args:
        corpus: Mapping of document name to document text.
        words: Iterable of words that must all occur in a sentence. A single
            string is treated as one word (it used to be iterated character
            by character).

    Returns:
        List of matching sentences, in corpus order. With an empty ``words``
        every sentence matches.
    """
    if isinstance(words, str):
        words = [words]
    wanted_stems = {STEMMER.stem(word.lower()) for word in words}

    sentences_with_words = []
    for content in corpus.values():
        for sentence in sent_tokenize(content):
            # A set makes the "all words present" check a subset test.
            sentence_stems = {
                STEMMER.stem(token) for token in word_tokenize(sentence.lower())
            }
            if wanted_stems <= sentence_stems:
                sentences_with_words.append(sentence)

    return sentences_with_words

Analyse de sentiments sur des phrases

In [12]:
def analyze_word_sentiment(sentences):
    """Average the sentiment scores of ``sentences`` and label the result.

    Uses the module-level ``SIA`` analyzer instead of building a new
    ``SentimentIntensityAnalyzer`` on every call.

    Args:
        sentences: Sized collection of sentence strings.

    Returns:
        A ``(sentiment_scores, sentiment)`` tuple. ``sentiment_scores`` maps
        ``"positive"``, ``"negative"``, ``"neutral"`` and ``"compound"`` to
        the mean score over the sentences (all zeros when there are none).
        ``sentiment`` is ``"Positif"`` or ``"Négatif"`` when that mean
        strictly exceeds both other class means, otherwise ``"Neutre"``.
    """
    score_keys = (
        ("positive", "pos"),
        ("negative", "neg"),
        ("neutral", "neu"),
        ("compound", "compound"),
    )
    sentiment_scores = {"positive": 0, "negative": 0, "neutral": 0, "compound": 0}
    for sentence in sentences:
        scores = SIA.polarity_scores(sentence)
        for name, analyzer_key in score_keys:
            sentiment_scores[name] += scores[analyzer_key]

    # Average the accumulated scores
    num_sentences = len(sentences)
    if num_sentences > 0:
        sentiment_scores = {
            key: value / num_sentences for key, value in sentiment_scores.items()
        }

    # Pick the dominant class
    positive = sentiment_scores["positive"]
    negative = sentiment_scores["negative"]
    neutral = sentiment_scores["neutral"]
    if positive > negative and positive > neutral:
        sentiment = "Positif"
    elif negative > positive and negative > neutral:
        sentiment = "Négatif"
    else:
        sentiment = "Neutre"

    return sentiment_scores, sentiment

Combinaison d'analyse de sentiments et de trouver les phrases avec un mot

In [13]:

def evaluate_word_sentiment(content, word):
    """Find the sentences containing ``word`` and score their sentiment.

    ``word`` is forwarded unchanged to ``find_sentences_with_words``. Both
    ``word`` and the matching sentences are printed.

    Returns:
        A dict with the keys ``"word"``, ``"sentences_with_word"``,
        ``"sentiment_scores"`` and ``"overall_sentiment"``. When nothing
        matches, the scores are all zero and the sentiment is
        ``"Aucun (mot non trouvé)"``.
    """
    print(word)
    # Sentences that contain the word(s)
    matches = find_sentences_with_words(content, word)
    print(matches)

    if matches:
        # Score the matching sentences
        scores, label = analyze_word_sentiment(matches)
        return {
            "word": word,
            "sentences_with_word": matches,
            "sentiment_scores": scores,
            "overall_sentiment": label,
        }

    print(f"Le mot '{word}' n'a pas été trouvé dans le corpus.")
    return {
        "word": word,
        "sentences_with_word": [],
        "sentiment_scores": {"positive": 0, "negative": 0, "neutral": 0, "compound": 0},
        "overall_sentiment": "Aucun (mot non trouvé)",
    }

Analyse de sentiments sur une page

In [14]:

def page_sentiment(page_name, content):
    """Score the sentiment of a single page.

    Args:
        page_name: Key of the page inside ``content``.
        content: Mapping of page name to page text.

    Returns:
        A dict with ``"page_name"``, ``"sentiment_scores"`` and
        ``"overall_sentiment"``. If the page is missing, a French error
        message string is returned instead of a dict.
    """
    if page_name not in content:
        return f"La page '{page_name}' n'existe pas dans le corpus."

    scores, label = analyze_word_sentiment(sent_tokenize(content[page_name]))

    # Refine a neutral verdict with the side it leans towards, if any
    if label == "Neutre":
        positive, negative = scores["positive"], scores["negative"]
        if positive > negative:
            label = "Neutre avec consonance plus positive"
        elif negative > positive:
            label = "Neutre avec consonance plus négative"

    return {
        "page_name": page_name,
        "sentiment_scores": scores,
        "overall_sentiment": label,
    }


Analyse de sentiments sur un ensemble de pages

In [15]:
def analyze_corpus_sentiment(corpus):
    """Score every document of ``corpus`` and average the scores.

    Args:
        corpus: Mapping of document name to document text.

    Returns:
        A dict with ``"overall_sentiment_scores"`` (mean of the per-document
        scores, all zeros for an empty corpus) and ``"document_sentiments"``
        (per-document scores and label).
    """
    totals = dict.fromkeys(("positive", "negative", "neutral", "compound"), 0)
    per_document = {}

    for name, text in corpus.items():
        scores, label = analyze_word_sentiment(sent_tokenize(text))
        per_document[name] = {
            "sentiment_scores": scores,
            "overall_sentiment": label,
        }
        # Accumulate towards the corpus-wide mean
        for key in totals:
            totals[key] += scores[key]

    document_count = len(corpus)
    if document_count > 0:
        totals = {key: value / document_count for key, value in totals.items()}

    return {
        "overall_sentiment_scores": totals,
        "document_sentiments": per_document,
    }

Bigrammes

In [16]:
def find_cooccurrences(corpus_path, keyword, min_freq=2, window_size=5):
    """List the windowed bigrams that contain ``keyword`` and print a table.

    All documents of the JSON corpus are preprocessed and concatenated into
    one token stream, so windows can span document boundaries.

    Args:
        corpus_path: Path of a JSON file mapping document name to text.
        keyword: Word to look for; it is stemmed before matching.
        min_freq: Minimum bigram frequency kept by the collocation finder.
        window_size: Window size passed to ``BigramCollocationFinder``. The
            window includes the first word, so a partner is at most
            ``window_size - 1`` tokens away. The distance scan uses the same
            bound (it previously looked one token further).

    Returns:
        List of ``(bigram, {"frequency", "mean_distance"})`` pairs sorted by
        decreasing frequency.
    """
    with open(corpus_path, 'r', encoding='utf-8') as file:
        corpus = json.load(file)
    all_tokens = []
    for text in corpus.values():
        all_tokens.extend(preprocess_text(text))

    # Find the bigrams
    finder = BigramCollocationFinder.from_words(all_tokens, window_size=window_size)
    finder.apply_freq_filter(min_freq)

    # Index token positions once instead of rescanning the corpus per bigram.
    positions = defaultdict(list)
    for index, token in enumerate(all_tokens):
        positions[token].append(index)

    # Keep only the bigrams that contain the keyword
    keyword_stem = STEMMER.stem(keyword)
    bigrams = {}
    for bigram, freq in finder.ngram_fd.items():
        if keyword_stem not in bigram:
            continue
        first, second = bigram
        distances = []
        for i in positions.get(first, ()):
            following = all_tokens[i + 1:i + window_size]
            if second in following:
                # following[0] sits one token after position i.
                distances.append(following.index(second) + 1)
        bigrams[bigram] = {
            "frequency": freq,
            "mean_distance": sum(distances) / len(distances) if distances else 0,
        }

    # Sort the results
    sorted_bigrams = sorted(bigrams.items(), key=lambda item: item[1]['frequency'], reverse=True)
    print(f"Cooccurrences pour le mot-clé {keyword}:\n")
    print(f"{'Cooccurrence':<20}\t{'Frequency':<10}\t{'Mean Distance':<15}")
    print(f"{'-' * 50}")
    for bigram, data in sorted_bigrams:
        print(f"{bigram[0]} {bigram[1]:<17}\t{data['frequency']:<10}\t{data['mean_distance']:<15.2f}")
    return sorted_bigrams

Trigrammes

In [17]:
def find_trigram_cooccurrences(corpus_path, keyword, min_freq=2, window_size=5):
    """List the windowed trigrams that contain ``keyword`` and print a table.

    All documents of the JSON corpus are preprocessed and concatenated into
    one token stream, so windows can span document boundaries.

    Args:
        corpus_path: Path of a JSON file mapping document name to text.
        keyword: Word to look for; it is stemmed before matching.
        min_freq: Minimum trigram frequency kept by the collocation finder.
        window_size: Window size passed to ``TrigramCollocationFinder``. The
            window includes the first word, and the second word must come
            before the third. The distance scan now applies both rules.

    Returns:
        List of ``(trigram, {"frequency", "mean_distance"})`` pairs sorted by
        decreasing frequency. The distance is measured from the first word to
        the third.
    """
    with open(corpus_path, 'r', encoding='utf-8') as file:
        corpus = json.load(file)
    all_tokens = []
    for text in corpus.values():
        all_tokens.extend(preprocess_text(text))

    finder = nltk.collocations.TrigramCollocationFinder.from_words(all_tokens, window_size=window_size)
    finder.apply_freq_filter(min_freq)

    # Index token positions once instead of rescanning the corpus per trigram.
    positions = defaultdict(list)
    for index, token in enumerate(all_tokens):
        positions[token].append(index)

    # Keep only the trigrams that contain the keyword
    keyword_stem = STEMMER.stem(keyword)
    trigrams = {}
    for trigram, freq in finder.ngram_fd.items():
        if keyword_stem not in trigram:
            continue
        first, second, third = trigram
        distances = []
        for i in positions.get(first, ()):
            following = all_tokens[i + 1:i + window_size]
            if second not in following:
                continue
            # The earliest second word leaves the most room for the third.
            j = following.index(second)
            if third not in following[j + 1:]:
                continue
            k = following.index(third, j + 1)
            # following[0] sits one token after position i.
            distances.append(k + 1)
        trigrams[trigram] = {
            "frequency": freq,
            "mean_distance": sum(distances) / len(distances) if distances else 0,
        }

    # Sort the results
    sorted_trigrams = sorted(trigrams.items(), key=lambda item: item[1]['frequency'], reverse=True)
    # Print the results
    print(f"Cooccurrences pour le mot-clé '{keyword}':\n")
    print(f"{'Cooccurrence':<30}\t{'Frequency':<10}\t{'Mean Distance':<15}")
    print(f"{'-' * 60}")
    for trigram, data in sorted_trigrams:
        print(f"{trigram[0]} {trigram[1]} {trigram[2]:<17}\t{data['frequency']:<10}\t{data['mean_distance']:<15.2f}")
    return sorted_trigrams

Analyse des tokens

In [18]:
def show_top_tokens(corpus, n=20):
    """Return the ``n`` most frequent preprocessed tokens of ``corpus``.

    Returns:
        List of ``(token, count)`` pairs as produced by
        ``nltk.FreqDist.most_common``.
    """
    return nltk.FreqDist(tokenize_corpus(corpus)).most_common(n)

Wordcloud

In [19]:
def wordcloud(corpus, output_filename='wordcloud.png'):
    """Build, save, and display a word cloud of the corpus tokens.

    Args:
        corpus: Mapping of document name to document text.
        output_filename: File name of the PNG written under ``output/``.
    """
    joined_tokens = ' '.join(tokenize_corpus(corpus))
    cloud = WordCloud(
        background_color='white',
        stopwords=STOP_WORDS,
        max_words=30,
        min_font_size=10,
    )
    image = cloud.generate(joined_tokens)

    plt.figure(figsize=(8, 6))
    plt.imshow(image, interpolation='bilinear')
    plt.axis('off')
    # Save before show() so the figure is written while it is still current.
    plt.savefig(f'output/{output_filename}', format='png')
    plt.show()

Tableau des tops tokens 

In [20]:
def token_tableau(json_path, output_csv):
    """Write a CSV table of token statistics for a JSON corpus.

    For every stemmed token the table lists the original words that map to
    it, its total number of occurrences, and the number of documents that
    contain it. Only alphanumeric tokens outside ``STOP_WORDS`` are counted.

    Args:
        json_path: Path of a JSON file mapping document name to text.
        output_csv: Path of the CSV file to write. Its parent directory is
            created when the path has one.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    token_counts = defaultdict(int)     # total occurrences per stem
    document_counts = defaultdict(int)  # number of documents containing each stem
    mots_assoc = defaultdict(set)       # original words behind each stem

    for text in content.values():
        stems_in_document = set()
        for word in word_tokenize(text.lower()):
            if not word.isalnum() or word in STOP_WORDS:
                continue
            # Stem once and reuse it for all three statistics.
            stem = STEMMER.stem(word)
            token_counts[stem] += 1
            mots_assoc[stem].add(word)
            stems_in_document.add(stem)
        # Count each document at most once per stem.
        for stem in stems_in_document:
            document_counts[stem] += 1

    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Token (stemmatisé)", "Mots associés", "Occurrences", "Articles concernés"])
        for token, freq in token_counts.items():
            # Sort the associated words so the file is identical across runs.
            writer.writerow([token, ', '.join(sorted(mots_assoc[token])), freq, document_counts[token]])
    print(f"Tableau sauvegardé dans {output_csv}")


Calcul de similarité entre 2 documents au choix


In [21]:
def calculate_similarity(model, doc_id1, doc_id2):
    """Return the cosine similarity between two document vectors.

    Args:
        model: Object exposing document vectors through ``model.dv[...]``.
        doc_id1: Key of the first document in ``model.dv``.
        doc_id2: Key of the second document in ``model.dv``.
    """
    first_vector, second_vector = model.dv[doc_id1], model.dv[doc_id2]
    pairwise = cosine_similarity([first_vector], [second_vector])
    return pairwise[0][0]

Recherche de documents similaires

In [22]:
def find_similar_docs_tfidf(content, tfidf_matrix, title, n=4):
    """Find the ``n`` documents most similar to ``title`` and save them.

    Args:
        content: Mapping of title to text, in the same order as the rows of
            ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix with one row per document.
        title: Title of the reference document.
        n: Number of similar documents to return.

    Returns:
        List of ``(title, similarity)`` pairs, most similar first. The
        reference document itself is never included.

    Raises:
        ValueError: If ``title`` is not a key of ``content``.
    """
    if title not in content:
        raise ValueError(f"Le titre '{title}' n'existe pas dans le contenu.")

    titles = list(content)
    doc_index = titles.index(title)
    cosine_similarities = cosine_similarity(tfidf_matrix[doc_index], tfidf_matrix).flatten()

    # Drop the reference document by index: with tied scores it is not
    # guaranteed to be ranked first.
    ranked = [i for i in cosine_similarities.argsort()[::-1] if i != doc_index]
    similar_docs = [(titles[i], cosine_similarities[i]) for i in ranked[:n]]

    # Path separators in a title would otherwise point into a subdirectory.
    safe_title = re.sub(r'[\\/]', '_', title)
    output_file = os.path.join('output', f"{safe_title}.txt")

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"Document de base: {title}\n")
        file.write(content[title] + "\n\n")

        for doc_title, similarity in similar_docs:
            file.write(f"Document similaire: {doc_title} (Similarité: {similarity:.4f})\n")
            file.write(content[doc_title] + "\n\n")

    print(f"Documents sauvegardés dans {output_file}")
    return similar_docs


### Clustering

Matrice de similarité par cosinus

In [23]:
def calculate_cosine_similarity(content, model):
    """Compute and save the pairwise cosine similarity of all documents.

    Args:
        content: Sized collection of documents; only its length is used.
        model: Object exposing document vectors through ``model.dv[i]`` for
            ``i`` in ``range(len(content))``.

    Returns:
        The square similarity matrix as a NumPy array.
    """
    # Collect the document vectors
    doc_vectors = np.array([model.dv[i] for i in range(len(content))])
    # Cosine similarity matrix
    similarity_matrix = cosine_similarity(doc_vectors)
    # save_to_file appends the extension itself; passing a name that already
    # ended in ".json" produced "similarity_matrix.json.txt".
    save_to_file(similarity_matrix, 'similarity_matrix')
    return similarity_matrix

Nombre optimal de clusters

In [24]:
def find_optimal_clusters(vectors, max_clusters=10, min_clusters=2):
    """Pick a cluster count with a simple elbow rule and plot the curve.

    KMeans is fitted for every ``k`` from ``min_clusters`` to
    ``max_clusters`` inclusive and its inertia is recorded and plotted.

    Returns:
        ``min_clusters + i`` for the first interior index ``i`` at which the
        inertia drop to the next ``k`` is smaller than the drop from the
        previous ``k``; ``min_clusters`` when no such index exists.
    """
    candidate_ks = range(min_clusters, max_clusters + 1)
    distortions = [
        KMeans(n_clusters=k, random_state=0).fit(vectors).inertia_
        for k in candidate_ks
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, distortions, marker='o')
    plt.xlabel('Nombre de clusters')
    plt.ylabel('Distortion')
    plt.title('Méthode du coude pour trouver le nombre optimal de clusters')
    plt.show()

    optimal_k = min_clusters
    for i in range(1, len(distortions) - 1):
        drop_before = distortions[i - 1] - distortions[i]
        drop_after = distortions[i] - distortions[i + 1]
        if drop_after < drop_before:
            optimal_k = min_clusters + i
            break

    print(f"Nombre optimal de clusters : {optimal_k}")
    return optimal_k

Clusters

In [25]:
def cluster_documents(content, optimal_k):
    """Cluster the documents with Doc2Vec vectors and KMeans.

    A Doc2Vec model is trained on the document texts, the vectors are
    clustered into ``optimal_k`` groups, and the result is written to
    ``output/clusters.json`` and ``output/clusters_titles.txt``. The cosine
    similarity matrix of the vectors is computed and saved as well.

    Args:
        content: Mapping of document title to document text.
        optimal_k: Number of clusters.

    Returns:
        A ``(clusters, similarity_matrix)`` tuple where ``clusters`` maps the
        integer cluster label to the list of titles in that cluster.
    """
    # Prepare the documents
    documents = list(content.values())

    # Train the Doc2Vec model
    model = train_doc2vec_model(documents, vector_size=100, window=2, epochs=40)

    # Collect the document vectors
    doc_vectors = [model.dv[i] for i in range(len(documents))]

    # Run KMeans
    kmeans = KMeans(n_clusters=optimal_k, max_iter=100, n_init=10, random_state=42)
    kmeans.fit(doc_vectors)

    # Pair titles with labels directly; rebuilding the key list for every
    # document made this loop quadratic.
    clusters = {}
    for document_key, label in zip(content, kmeans.labels_):
        clusters.setdefault(int(label), []).append(document_key)

    # Save the clusters
    save_to_file(clusters, 'clusters')

    # Write a text file with the titles grouped by cluster
    os.makedirs('output', exist_ok=True)
    with open('output/clusters_titles.txt', 'w', encoding='utf-8') as f:
        for cluster_label, docs in clusters.items():
            f.write(f"Cluster {cluster_label}:\n")
            for doc in docs:
                f.write(f"- {doc}\n")
            f.write("\n")

    # Compute and save the similarity matrix
    similarity_matrix = calculate_cosine_similarity(content, model)

    return clusters, similarity_matrix

Sauvegarde les titres complets des documents dans chaque cluster dans un fichier

In [26]:
def save_clusters_individually(clusters, content_path):
    """Write one JSON file per cluster with the contents of its documents.

    Args:
        clusters: Mapping of cluster label to the list of document titles.
        content_path: Path of the JSON file mapping title to text.

    Each cluster is written to ``output/clusters/cluster_<label>.json`` as a
    mapping of title to text. A ``KeyError`` is raised if a title is missing
    from the content file.
    """
    with open(content_path, 'r', encoding='utf-8') as source:
        content = json.load(source)

    # Directory that receives the per-cluster files
    os.makedirs('output/clusters', exist_ok=True)

    for num_cluster in clusters:
        subset = {}
        for title in clusters[num_cluster]:
            subset[title] = content[title]

        output_file = f'output/clusters/cluster_{num_cluster}.json'
        with open(output_file, 'w', encoding='utf-8') as sink:
            json.dump(subset, sink, ensure_ascii=False, indent=4)
        print(f"Cluster {num_cluster} sauvegardé dans {output_file}")


Links par clusters

In [27]:
def create_links_per_cluster(clusters_dir, original_links_path, output_dir):
    """Write, for each cluster file, the links that stay inside the cluster.

    Args:
        clusters_dir: Directory holding the per-cluster JSON files. Entries
            that are not regular ``.json`` files (notebook checkpoint
            folders, stray files) are skipped instead of crashing the run.
        original_links_path: Path of the JSON file mapping each node to the
            list of nodes it links to.
        output_dir: Directory that receives ``<cluster file stem>_links.json``
            for every cluster; created if missing.
    """
    with open(original_links_path, 'r', encoding='utf-8') as f:
        original_links = json.load(f)
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    for cluster_file in sorted(os.listdir(clusters_dir)):
        cluster_path = os.path.join(clusters_dir, cluster_file)
        if not cluster_file.endswith('.json') or not os.path.isfile(cluster_path):
            continue

        with open(cluster_path, 'r', encoding='utf-8') as f:
            cluster_nodes = json.load(f)

        # Constant-time membership whatever container the JSON decoded to.
        members = set(cluster_nodes)
        cluster_links = {
            node: [target for target in original_links[node] if target in members]
            for node in cluster_nodes
            if node in original_links
        }

        cluster_label = os.path.splitext(cluster_file)[0]
        output_path = os.path.join(output_dir, f"{cluster_label}_links.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cluster_links, f, ensure_ascii=False, indent=4)

        print(f"Fichier de liens créé pour le cluster : {output_path}")

## Choisis ce que tu veux lancer

Chemin du fichier

In [31]:
# Load the page contents and the link data used by the cells below.
if __name__ == '__main__':
    content_path = 'contentB.json'
    links_path = 'linksB.json'
    content, links = load_data(content_path, links_path)

Entraîner les modèles

In [32]:
# Train on the page texts. Passing the `content` dict itself made both models
# iterate over its keys, i.e. train on the page titles only.
documents = list(content.values())
model = train_doc2vec_model(documents, vector_size=100, window=5, epochs=50)
tfidf_matrix = train_tfidf_model(documents)
# One Doc2Vec vector per document, in the same order as `content`.
doc_vectors = [model.dv[i] for i in range(len(documents))]

Phrases contenant un/des mots

In [None]:
# Collect the sentences that contain all the requested words, save them one
# per line, then echo them.
words_to_find = ['diversity', 'inclusion', 'equity']
sentences = find_sentences_with_words(content, words_to_find)
# Opening in 'w' mode already truncates the file, so a single open is enough.
with open('output/sentences_with_words.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in sentences)
for sentence in sentences:
    print(sentence)

Analyse de sentiment pour des phrases contenant des mots choisis

In [None]:
# Score the sentences that contain all three words and print a short report.
# The list is passed as the `word` argument and is printed as-is below.
result = evaluate_word_sentiment(content, ["performance", "inclusion", "communication"])

print(f"Analyse de sentiment pour le mot '{result['word']}':")
print(f"Scores moyens : {result['sentiment_scores']}")
print(f"Sentiment global : {result['overall_sentiment']}")
print("\nExemples de phrases contenant le mot :")
for sentence in result["sentences_with_word"][:5]:  # Show at most 5 sentences
    print(f"- {sentence}")

Analyse de sentiments pour 1 page

In [None]:
# Sentiment analysis of a single page; the result is the cell's output value.
page_name = "Minimum wage"
page_sentiment(page_name, content)

Analyse de sentiments pour un ensemble de pages

In [None]:
# Sentiment analysis of a corpus.
# analyze_corpus_sentiment expects a {name: text} mapping; handing it the path
# string failed on `.items()`. Read the text file and wrap it as a
# one-document corpus keyed by its path.
corpus_path = "outputidm/cooperative.txt"
with open(corpus_path, 'r', encoding='utf-8') as file:
    corpus = {corpus_path: file.read()}
acs = analyze_corpus_sentiment(corpus)
print(acs)

Sauvegarder les tokens

In [82]:
# Base name only: save_to_file adds the directory and the extension.
output_file = 'cleaned_data'
clean_and_save_tokens(content, output_file)

Bigrammes

In [None]:
# Bigrams containing the keyword, keeping those seen at least 47 times.
corpus_path = 'contentB.json'  
keyword = "white"         
find_cooccurrences(corpus_path, keyword, min_freq=47, window_size=5)

Trigrammes

In [None]:
# Trigrams containing the keyword, keeping those seen at least 10 times.
corpus_path = 'contentB.json'  
keyword = "white"         
find_trigram_cooccurrences(corpus_path, keyword, min_freq=10, window_size=7)

Tokens fréquents

In [None]:
# Most frequent tokens of the corpus (default: top 20).
top_tokens = show_top_tokens(content)
print("Top Tokens:", top_tokens)

Wordcloud

In [None]:
# Build, save, and display the word cloud with the default file name.
wordcloud(content)

Generate Tokens Stats

In [None]:
# Token statistics table; `content_path` comes from the data-loading cell.
output = 'output/token_table.csv'
token_tableau(content_path, output)

Similarité entre 2 docs /avec 1 doc

In [None]:
# Document ids are the string tags assigned by train_doc2vec_model.
doc_id1 = '0'
doc_id2 = '1'
similarity = calculate_similarity(model, doc_id1, doc_id2)
print("Similarité entre doc 0 et doc 1 :", similarity)

Trouver les documents similaires

In [None]:
# Documents closest to the chosen title according to TF-IDF cosine similarity.
title = "Cooperative"
similar_docs_tfidf = find_similar_docs_tfidf(content, tfidf_matrix, title)
print(f"Documents similaires à '{title}':")
for doc, similarity in similar_docs_tfidf:
    print(f"- {doc} (similarité: {similarity:.4f})")

Clustering

Nombre optimal de clusters

In [None]:
# Standardise both representations before stacking them side by side.
doc_vectors = StandardScaler().fit_transform(doc_vectors)
tfidf_vectors = StandardScaler().fit_transform(tfidf_matrix.toarray())
# Stack the scaled TF-IDF features; the unscaled matrix was stacked before,
# leaving `tfidf_vectors` unused.
combined_vectors = np.hstack((doc_vectors, tfidf_vectors))

optimal_k = find_optimal_clusters(combined_vectors, max_clusters=10, min_clusters=4)

Créer les clusters individuellement

In [None]:
# Cluster the documents, then write one JSON file per cluster.
# NOTE(review): cluster_documents trains its own Doc2Vec model, while
# optimal_k was chosen on the combined vectors - confirm this is intended.
clusters, similarity_matrix = cluster_documents(content, optimal_k)
save_clusters_individually(clusters, "contentB.json")

Créer des liens pour chaque cluster

In [None]:
# Restrict the link data to each cluster's own documents.
clusters_dir = "output/clusters"
original_links_path = "linksB.json"
output_dir = "cluster_links"

create_links_per_cluster(clusters_dir, original_links_path, output_dir)

Connaitre le nombre de doc par cluster

In [None]:
# Print the number of documents in each cluster.
for cluster_label, docs in clusters.items():
    print(f"Cluster {cluster_label} contains {len(docs)} documents.")

JSON to GML

Graph principal

In [None]:
# Main graph. Note that `links` is rebound here from the data returned by
# load_data to a file path string.
links = "linksB.json"
output_file = "outputgml/graph.gml"
json_to_gml(links, output_file)

Graph pour les tokens individuels, pour les clusters // NOT USED

In [None]:
# Marked as not used in the notebook: graphs built from the token file and
# from the clusters file, whose JSON keys become the source nodes.
output_file1 = "outputgml/tokens_graph.gml"
output_file2 = "outputgml/clusters_graph.gml"
tokens_file = "output/cleaned_data.json"
clusters_file = "output/clusters.json"
json_to_gml(tokens_file, output_file1)
json_to_gml(clusters_file, output_file2)

Graph pour chaque cluster 

In [None]:
# One GML graph per cluster actually produced by the clustering cell.
# Re-running the elbow search with other bounds could return a different
# count than the number of cluster link files that exist.
num_clusters = len(clusters)
for i in sorted(clusters):
    json_to_gml(f"cluster_links/cluster_{i}_links.json", f"outputgml/cluster_{i}.gml")