# Text mining

## Appuie pour faire tourner les fonctions

### Config

Les modules à installer se trouvent dans requirements.txt

Import and download

In [None]:
import json
import os
import re
import csv
import string
import nltk 
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Download the NLTK data packages this notebook relies on: the stop-word corpus,
# the Punkt tokenizer data ('punkt' and 'punkt_tab') and the VADER sentiment lexicon.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

Configuration

In [55]:
# Tokens dropped during preprocessing: NLTK's English stop words, every ASCII
# punctuation character, and a few extra terms (presumably too frequent in this
# corpus to be informative -- confirm with the corpus owner).
STOP_WORDS = set(stopwords.words('english')).union(string.punctuation)
STOP_WORDS |= {'employee', 'organization', 'work', 'job', 'company', "'s"}

# Shared English stemmer and VADER sentiment analyzer.
STEMMER = SnowballStemmer('english')
SIA = SentimentIntensityAnalyzer()

### Fonctions utilitaires

Chargement du fichier de contenu

In [56]:
def load_data(content_path, links_path):
    """Load the two JSON input files of the notebook.

    Args:
        content_path: Path of the JSON file holding the page contents.
        links_path: Path of the JSON file holding the page links.

    Returns:
        A ``(content, links)`` tuple with the decoded JSON of each file.

    Raises:
        FileNotFoundError: If one of the paths does not exist. The content
            file is checked and loaded before the links file is looked at.
    """
    loaded = []
    for path in (content_path, links_path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Le fichier '{path}' est introuvable.")
        with open(path, 'r', encoding='utf-8') as handle:
            loaded.append(json.load(handle))

    content, links = loaded
    return content, links

Sauvegarde des données

In [57]:
os.makedirs('output', exist_ok=True)

def save_to_file(data, filename):
    """Write ``data`` into the ``output/`` directory, choosing the format from its type.

    * ``numpy.ndarray`` -> ``output/<filename>.txt`` via ``np.savetxt`` (4 decimals).
    * ``dict``          -> ``output/<filename>.json`` (UTF-8, indented, non-ASCII kept).
    * ``list``          -> ``output/<filename>.txt``, one item per line.

    Args:
        data: The object to save (ndarray, dict or list).
        filename: Base name of the output file, without extension; the
            extension is appended by this function.

    Raises:
        TypeError: If ``data`` is none of the supported types. Previously such
            data was silently not saved.
    """
    # Make sure the directory exists even if the working directory changed or
    # the folder was removed after this cell was first executed.
    os.makedirs('output', exist_ok=True)

    if isinstance(data, np.ndarray):
        np.savetxt(f'output/{filename}.txt', data, fmt='%.4f')
    elif isinstance(data, dict):
        with open(f'output/{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
    elif isinstance(data, list):
        with open(f'output/{filename}.txt', 'w', encoding='utf-8') as f:
            # Format each item so that non-string entries (numbers, tuples...)
            # do not raise on string concatenation.
            f.writelines(f'{line}\n' for line in data)
    else:
        raise TypeError(
            f"save_to_file does not support data of type {type(data).__name__}"
        )


JSON TO GML

In [58]:
def json_to_gml(links_path, output_file):
    """Convert a JSON adjacency mapping into a directed GML graph file.

    The JSON file must contain an object mapping each source name to a list of
    target names. Every source and target becomes a node; every
    (source, target) pair becomes a directed edge.

    Args:
        links_path: Path of the JSON file to read.
        output_file: Path of the GML file to write. Missing parent directories
            are created.
    """
    with open(links_path, "r", encoding="utf-8") as file:
        links = json.load(file)

    # Assign IDs in first-seen order. A dict keeps insertion order, so the
    # generated file is identical from one run to the next (iterating a set of
    # strings is not stable across interpreter runs).
    node_id = {}
    edges = []
    for source, targets in links.items():
        node_id.setdefault(source, len(node_id))
        for target in targets:
            node_id.setdefault(target, len(node_id))
            edges.append((source, target))

    # Create the destination folder when the path has a directory component.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as file:
        file.write("graph\n")
        file.write("[\n")
        file.write("  directed 1\n")  # Directed graph

        # Nodes
        for node, identifier in node_id.items():
            # A raw double quote would terminate the GML string early.
            label = str(node).replace('"', '&quot;')
            file.write("  node\n")
            file.write("  [\n")
            file.write(f"    id {identifier}\n")
            file.write(f"    label \"{label}\"\n")
            file.write("  ]\n")

        # Edges
        for source, target in edges:
            file.write("  edge\n")
            file.write("  [\n")
            file.write(f"    source {node_id[source]}\n")
            file.write(f"    target {node_id[target]}\n")
            file.write("  ]\n")

        file.write("]\n")

    print(f"Fichier GML créé : {output_file}")

Traitement du contenu texte

In [59]:
def preprocess_text(text):
    """Turn raw text into a list of stemmed tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, and the result is tokenized. Tokens that are in
    ``STOP_WORDS`` or shorter than three characters are discarded; the
    remaining ones are stemmed with ``STEMMER``. The stop-word test is applied
    to the token before stemming.

    Args:
        text: The string to process.

    Returns:
        The list of stems, in order of appearance.
    """
    normalized = re.sub(r'[^a-z0-9\s]', '', text.lower())

    stems = []
    for word in word_tokenize(normalized):
        if len(word) <= 2 or word in STOP_WORDS:
            continue
        stems.append(STEMMER.stem(word))
    return stems

Sauvegarder tous les tokens

In [60]:
def clean_and_save_tokens(content, output_file):
    """Preprocess every page and save the resulting tokens.

    Args:
        content: Mapping of page title to page text.
        output_file: Base file name handed to ``save_to_file``, which receives
            a dict mapping each title to its token list.
    """
    tokens_by_page = {
        title: preprocess_text(body) for title, body in content.items()
    }
    save_to_file(tokens_by_page, output_file)

Traitement de texte spécifique à SIA, pour éviter de supprimer les "not" et autres

In [61]:
def preprocess_SIA(text):
    """Tokenize text for sentiment analysis without removing stop words.

    The text is lower-cased and stripped of every character that is not a
    letter ``a-z`` or whitespace, then tokenized. Unlike ``preprocess_text``,
    no stop-word filtering or stemming is applied, so words such as "not"
    are kept.

    Args:
        text: The string to process.

    Returns:
        The list of tokens.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return word_tokenize(letters_only)

Liste des tokens

In [62]:
def tokenize_corpus(corpus):
    """Return the preprocessed tokens of all documents as one flat list.

    Args:
        corpus: Mapping whose values are the document texts.

    Returns:
        The concatenation of ``preprocess_text(text)`` for each value, in the
        mapping's iteration order.
    """
    return [
        token
        for document in corpus.values()
        for token in preprocess_text(document)
    ]

Entraînement des modèles TF-IDF et Doc2Vec

In [120]:
# 1. Doc2Vec model training
def train_doc2vec_model(documents, vector_size=100, window=2, epochs=20):
    """Train a Doc2Vec model on a sequence of document texts.

    Each document is lower-cased, tokenized with ``word_tokenize`` and tagged
    with its position in ``documents`` as a string (``'0'``, ``'1'``, ...).

    Args:
        documents: Iterable of document strings. Pass the texts themselves:
            iterating a dict yields its keys, not its values.
        vector_size: Dimension of the learned document vectors.
        window: Context window size forwarded to Doc2Vec. This argument used
            to be ignored in favour of a hard-coded 2; the default is 2 so
            that calls relying on the default train exactly as before.
        epochs: Number of training epochs.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
        for i, doc in enumerate(documents)
    ]
    model = Doc2Vec(
        tagged_data,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        epochs=epochs,
    )
    return model

# 2. TF-IDF model training
def train_tfidf_model(documents):
    """Build the TF-IDF matrix of a collection of documents.

    Args:
        documents: Iterable of document strings.

    Returns:
        The matrix returned by ``TfidfVectorizer(stop_words='english')``
        fitted on ``documents``. The fitted vectorizer itself is not returned.
    """
    return TfidfVectorizer(stop_words='english').fit_transform(documents)

### Différents modèles de text mining

Trouver les phrases comprenant un ou des mots

In [64]:
def find_sentences_with_words(corpus, words):
    """Return the sentences of the corpus that contain all the given words.

    Matching is case-insensitive and substring-based: a word matches when it
    appears anywhere in the lower-cased sentence.

    Args:
        corpus: Mapping whose values are the document texts.
        words: A list of words, or a single word given as a string. A bare
            string is treated as one word rather than iterated character by
            character.

    Returns:
        The matching sentences, in corpus order, with their original casing.
    """
    if isinstance(words, str):
        words = [words]
    # Sentences are lower-cased before the test, so the search terms must be too.
    needles = [word.lower() for word in words]

    matches = []
    for content in corpus.values():
        for sentence in sent_tokenize(content):
            lowered = sentence.lower()
            if all(needle in lowered for needle in needles):
                matches.append(sentence)
    return matches

Analyse de sentiment

In [65]:
def analyze_word_sentiment(sentences):
    """Average the VADER scores of a list of sentences and label the result.

    Args:
        sentences: List of sentence strings.

    Returns:
        A ``(sentiment_scores, sentiment)`` tuple. ``sentiment_scores`` maps
        ``"positive"``, ``"negative"``, ``"neutral"`` and ``"compound"`` to the
        mean score over the sentences (all zeros for an empty list).
        ``sentiment`` is ``"Positif"``, ``"Négatif"`` or ``"Neutre"``.
    """
    totals = {"positive": 0, "negative": 0, "neutral": 0, "compound": 0}
    for sentence in sentences:
        # Reuse the analyzer built in the configuration cell instead of
        # creating a new one on every call.
        scores = SIA.polarity_scores(sentence)
        totals["positive"] += scores["pos"]
        totals["negative"] += scores["neg"]
        totals["neutral"] += scores["neu"]
        totals["compound"] += scores["compound"]

    # Mean of the scores
    num_sentences = len(sentences)
    if num_sentences > 0:
        sentiment_scores = {key: value / num_sentences for key, value in totals.items()}
    else:
        sentiment_scores = totals

    # Label from the mean compound score using VADER's customary +/-0.05
    # thresholds. The previous rule required the positive (or negative)
    # proportion to exceed the neutral proportion, which rarely happens with
    # VADER output, so nearly every input was labelled "Neutre".
    compound = sentiment_scores["compound"]
    if compound >= 0.05:
        sentiment = "Positif"
    elif compound <= -0.05:
        sentiment = "Négatif"
    else:
        sentiment = "Neutre"

    return sentiment_scores, sentiment

def word_sentiment_analysis(content, word):
    """Analyze the sentiment of the sentences that mention a word.

    Args:
        content: Mapping whose values are the document texts.
        word: The word to look for (a string), or a list of words that must
            all be present in a sentence.

    Returns:
        A dict with the keys ``"word"``, ``"sentences_with_word"``,
        ``"sentiment_scores"`` and ``"overall_sentiment"`` when at least one
        sentence matches; otherwise a message string saying the word does not
        appear in the corpus. Callers must handle both return types.
    """
    # find_sentences_with_words iterates over its second argument: wrap a
    # single word in a list so it is not searched character by character.
    search_terms = [word] if isinstance(word, str) else list(word)

    # Find the sentences containing the word
    sentences_with_word = find_sentences_with_words(content, search_terms)
    if not sentences_with_word:
        return f"Le mot '{word}' n'apparaît pas dans le corpus."

    # Analyze the sentiment of those sentences
    sentiment_scores, sentiment = analyze_word_sentiment(sentences_with_word)

    return {
        "word": word,
        "sentences_with_word": sentences_with_word,
        "sentiment_scores": sentiment_scores,
        "overall_sentiment": sentiment
    }


Bigrammes

In [66]:
def get_top_bigrams_doc2vec(corpus, top_n=10):
    """Return the pairs of bigrams whose Doc2Vec vectors are most similar.

    All preprocessed tokens of the corpus are chained into one sequence, every
    consecutive pair is treated as a two-word document, and a Doc2Vec model is
    trained on these documents. The cosine similarity between all bigram
    vectors is then computed.

    NOTE: the similarity matrix has one row and one column per bigram
    occurrence, so memory and time grow quadratically with the corpus size.
    NOTE: a pair can be reported twice, once as (a, b) and once as (b, a).

    Args:
        corpus: Mapping whose values are the document texts.
        top_n: Number of neighbours inspected per bigram, and number of pairs
            returned.

    Returns:
        A list of at most ``top_n`` tuples ``(bigram, other_bigram, similarity)``
        sorted by decreasing similarity. Empty when the corpus yields fewer
        than two tokens.
    """
    # Tokenize the corpus
    all_tokens = tokenize_corpus(corpus)

    # Create bigrams
    bigrams = list(nltk.bigrams(all_tokens))
    if not bigrams:
        # Nothing to train on: a model cannot be built from zero documents.
        return []

    # Prepare the bigrams for Doc2Vec (integer tags, looked up by index below)
    bigram_documents = [TaggedDocument(words=[w1, w2], tags=[i]) for i, (w1, w2) in enumerate(bigrams)]

    # Train the Doc2Vec model
    model = Doc2Vec(bigram_documents, vector_size=100, window=2, min_count=1, workers=4, epochs=40)

    # Pairwise similarity between all bigram vectors
    bigram_vectors = [model.dv[i] for i in range(len(bigrams))]
    similarity_matrix = cosine_similarity(bigram_vectors)

    # For each bigram keep its top_n most similar bigrams, excluding itself
    top_bigrams = []
    for i in range(len(bigrams)):
        similar_indices = similarity_matrix[i].argsort()[-top_n:][::-1]
        for idx in similar_indices:
            if idx != i:
                top_bigrams.append((bigrams[i], bigrams[idx], similarity_matrix[i][idx]))

    # Sort the candidate pairs by similarity score and keep the best top_n
    top_bigrams = sorted(top_bigrams, key=lambda x: x[2], reverse=True)[:top_n]

    return top_bigrams

Analyse des tokens

In [67]:
def show_top_tokens(corpus, top_n=20):
    """Return the most frequent preprocessed tokens of the corpus.

    Args:
        corpus: Mapping whose values are the document texts.
        top_n: Number of tokens to return.

    Returns:
        The result of ``FreqDist.most_common(top_n)``: a list of
        ``(token, count)`` pairs.
    """
    token_frequencies = nltk.FreqDist(tokenize_corpus(corpus))
    return token_frequencies.most_common(top_n)

Wordcloud

In [68]:
def generate_wordcloud(corpus, output_filename='wordcloud.png'):
    """Draw a word cloud of the corpus tokens, save it and display it.

    The cloud is built from the preprocessed (stemmed) tokens joined by
    spaces, limited to 30 words on a white background.

    Args:
        corpus: Mapping whose values are the document texts.
        output_filename: Name of the PNG file written inside ``output/``.
    """
    joined_tokens = ' '.join(tokenize_corpus(corpus))
    cloud_builder = WordCloud(
        background_color='white',
        stopwords=STOP_WORDS,
        max_words=30,
        min_font_size=10,
    )
    cloud_image = cloud_builder.generate(joined_tokens)

    plt.figure(figsize=(8, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(f'output/{output_filename}', format='png')
    plt.show()

Tableau des tops tokens 

In [69]:
def generate_token_table(json_path, output_csv):
    """Write a CSV table of stemmed-token statistics for a JSON corpus.

    For every stem the table gives: the stem, the original words that were
    reduced to it, its total number of occurrences, and the number of
    documents in which it appears.

    Tokens are kept when they are alphanumeric and not an English stop word or
    punctuation mark. This function builds its own stop-word set, which does
    not include the extra terms added to the notebook-level ``STOP_WORDS``.

    Args:
        json_path: Path of a JSON file mapping titles to texts.
        output_csv: Path of the CSV file to write. Its parent directory is
            created when the path contains one.
    """
    # Lower-case local names so the module-level constants are not shadowed.
    stop_words = set(stopwords.words('english')) | set(string.punctuation)
    stemmer = SnowballStemmer('english')

    with open(json_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    token_counts = defaultdict(int)     # total occurrences of each stem
    document_counts = defaultdict(int)  # number of documents containing each stem
    word_map = defaultdict(set)         # original words behind each stem

    for text in content.values():
        stems_in_document = set()
        for word in word_tokenize(text.lower()):
            if not word.isalnum() or word in stop_words:
                continue
            # Stem each word once and feed all three tables from it.
            stem = stemmer.stem(word)
            token_counts[stem] += 1
            word_map[stem].add(word)
            stems_in_document.add(stem)

        # Count each stem at most once per document.
        for stem in stems_in_document:
            document_counts[stem] += 1

    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        # Header
        writer.writerow(["Token (stemmatisé)", "Mots associés", "Occurrences", "Articles concernés"])

        # Rows, in order of first appearance of each stem
        for token, freq in token_counts.items():
            writer.writerow([
                token,
                ', '.join(sorted(word_map[token])),  # sorted for a reproducible file
                freq,
                document_counts[token]
            ])
    print(f"Tableau sauvegardé dans {output_csv}")

Calcul de similarité entre 2 documents au choix

In [70]:
def calculate_similarity(model, doc_id1, doc_id2):
    """Return the cosine similarity between two document vectors of a model.

    Args:
        model: Object exposing document vectors through ``model.dv[key]``.
        doc_id1: Key of the first document in ``model.dv``.
        doc_id2: Key of the second document in ``model.dv``.

    Returns:
        The single value of the 1x1 matrix produced by ``cosine_similarity``.
    """
    first_vector, second_vector = (model.dv[doc_id] for doc_id in (doc_id1, doc_id2))
    pairwise = cosine_similarity([first_vector], [second_vector])
    return pairwise[0][0]

Recherche de documents similaires

In [71]:
def find_similar_docs(model, doc_id, top_n=5):
    """Return the documents most similar to a given document.

    Args:
        model: Object exposing ``model.dv.most_similar``.
        doc_id: Identifier of the reference document; converted to ``str``
            before the lookup.
        top_n: Number of neighbours requested.

    Returns:
        Whatever ``model.dv.most_similar`` returns for that tag.
    """
    tag = str(doc_id)
    neighbours = model.dv.most_similar(tag, topn=top_n)
    return neighbours

### Clustering

Matrice de similarité par cosinus

In [133]:
def calculate_cosine_similarity(content, model):
    """Compute and save the cosine-similarity matrix between all documents.

    Args:
        content: Collection of documents; only its length is used, to know how
            many vectors to read from the model.
        model: Model whose ``model.dv[i]`` gives the vector of the i-th
            document.

    Returns:
        The square similarity matrix as a NumPy array.
    """
    # Stack the document vectors into a 2-D NumPy array
    doc_vectors = np.array([model.dv[i] for i in range(len(content))])

    # Pairwise cosine similarity
    similarity_matrix = cosine_similarity(doc_vectors)

    # save_to_file appends '.txt' for arrays and writes plain text, so pass the
    # bare name: this yields output/similarity_matrix.txt rather than the
    # misleading output/similarity_matrix.json.txt.
    save_to_file(similarity_matrix, 'similarity_matrix')

    return similarity_matrix

Nombre optimal de clusters

In [134]:
def find_optimal_clusters(vectors, max_clusters=10, min_clusters=2):
    """Pick the number of KMeans clusters with the best silhouette score.

    Args:
        vectors: 2-D array-like with one row per sample.
        max_clusters: Largest number of clusters to try (inclusive).
        min_clusters: Smallest number of clusters to try (inclusive).

    Returns:
        The tested ``k`` with the highest silhouette score, as a Python int.

    Raises:
        ValueError: If no value of ``k`` can be tested with the given bounds
            and number of samples.
    """
    n_samples = np.shape(vectors)[0]
    # The silhouette score is only defined for 2 <= k <= n_samples - 1, so
    # clamp the requested range instead of failing part-way through the loop.
    lowest = max(min_clusters, 2)
    highest = min(max_clusters, n_samples - 1)
    candidates = range(lowest, highest + 1)
    if len(candidates) == 0:
        raise ValueError(
            f"No valid number of clusters between {min_clusters} and "
            f"{max_clusters} for {n_samples} samples."
        )

    scores = []
    for k in candidates:
        kmeans = KMeans(n_clusters=k, random_state=0)
        labels = kmeans.fit_predict(vectors)
        scores.append(silhouette_score(vectors, labels))

    return int(candidates[int(np.argmax(scores))])

Clusters

In [135]:
def cluster_documents(content, optimal_k):
    """Cluster the documents with KMeans on freshly trained Doc2Vec vectors.

    A new Doc2Vec model is trained on the document texts, the documents are
    grouped into ``optimal_k`` clusters, and the following files are written:
    the clusters via ``save_to_file(clusters, 'clusters')``, a readable listing
    in ``output/clusters_titles.txt``, and the similarity matrix via
    ``calculate_cosine_similarity``.

    Args:
        content: Mapping of document title to document text.
        optimal_k: Number of clusters.

    Returns:
        A ``(clusters, similarity_matrix)`` tuple, where ``clusters`` maps each
        integer cluster label to the list of titles assigned to it.
    """
    # Titles and texts, in matching order
    titles = list(content.keys())
    documents = list(content.values())

    # Train the Doc2Vec model
    model = train_doc2vec_model(documents, vector_size=100, window=2, epochs=40)

    # Document vectors, in the same order as the titles
    doc_vectors = [model.dv[i] for i in range(len(documents))]

    # Run KMeans
    kmeans = KMeans(n_clusters=optimal_k, max_iter=100, n_init=10, random_state=42)
    kmeans.fit(doc_vectors)

    # Group the titles by cluster label. The title list is built once above
    # instead of being rebuilt for every document.
    clusters = {}
    for title, label in zip(titles, kmeans.labels_):
        clusters.setdefault(int(label), []).append(title)

    # Save the clusters
    save_to_file(clusters, 'clusters')

    # Text file listing the document titles grouped by cluster
    with open('output/clusters_titles.txt', 'w', encoding='utf-8') as f:
        for cluster_label, docs in clusters.items():
            f.write(f"Cluster {cluster_label}:\n")
            for doc in docs:
                f.write(f"- {doc}\n")
            f.write("\n")

    # Compute and save the similarity matrix
    similarity_matrix = calculate_cosine_similarity(content, model)

    return clusters, similarity_matrix

Sauvegarde le contenu complet des documents de chaque cluster dans un fichier JSON distinct

In [136]:
def save_clusters_individually(clusters, content_path):
    """Write one JSON file per cluster containing its documents.

    Each file ``output/clusters/cluster_<label>.json`` maps the titles of the
    cluster to their content, as found in the content file.

    Args:
        clusters: Mapping of cluster label to list of document titles.
        content_path: Path of the JSON file mapping titles to content.

    Raises:
        KeyError: If a title of a cluster is absent from the content file.
    """
    with open(content_path, 'r', encoding='utf-8') as source:
        all_documents = json.load(source)

    # Folder receiving the per-cluster files
    clusters_dir = 'output/clusters'
    os.makedirs(clusters_dir, exist_ok=True)

    for label in clusters:
        # Collect the documents of this cluster before opening the file
        members = {}
        for title in clusters[label]:
            members[title] = all_documents[title]

        output_file = f'output/clusters/cluster_{label}.json'
        with open(output_file, 'w', encoding='utf-8') as sink:
            json.dump(members, sink, ensure_ascii=False, indent=4)
        print(f"Cluster {label} sauvegardé dans {output_file}")

Links par clusters

In [137]:
def create_links_per_cluster(clusters_dir, original_links_path, output_dir):
    """Create, for each cluster file, the links that stay inside the cluster.

    For every ``*.json`` file of ``clusters_dir`` a file
    ``<cluster file stem>_links.json`` is written in ``output_dir``. It maps
    each node of the cluster that has an entry in the original links to the
    subset of its targets that also belong to the cluster.

    Args:
        clusters_dir: Directory holding one JSON file per cluster; the keys of
            each file are the nodes of that cluster.
        original_links_path: Path of the JSON file mapping each node to its
            list of targets.
        output_dir: Directory receiving the generated files; created if
            missing.
    """
    # Load the original links
    with open(original_links_path, 'r', encoding='utf-8') as f:
        original_links = json.load(f)

    # Create the directory that will hold the new files
    os.makedirs(output_dir, exist_ok=True)

    # Only regular .json files are cluster files: skip sub-directories and
    # stray entries that would otherwise make json.load fail. Sorting makes
    # the processing order deterministic.
    for cluster_file in sorted(os.listdir(clusters_dir)):
        cluster_path = os.path.join(clusters_dir, cluster_file)
        if not cluster_file.endswith('.json') or not os.path.isfile(cluster_path):
            continue

        # Load the cluster file
        with open(cluster_path, 'r', encoding='utf-8') as f:
            cluster_nodes = json.load(f)

        # Keep, for each node of the cluster, the targets inside the cluster
        cluster_links = {
            node: [target for target in original_links[node] if target in cluster_nodes]
            for node in cluster_nodes
            if node in original_links
        }

        # Name the output file after the cluster file
        cluster_label = os.path.splitext(cluster_file)[0]
        output_path = os.path.join(output_dir, f"{cluster_label}_links.json")

        # Save the filtered links as JSON
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cluster_links, f, ensure_ascii=False, indent=4)

        print(f"Fichier de liens créé pour le cluster : {output_path}")

Graph de similarité

In [138]:
def generate_similarity_graph(content, similarity_matrix, output_file):
    """Build an undirected similarity graph of the pages and save it as GML.

    Every page is a node. Two pages are connected when their similarity is
    strictly positive, and the edge weight is that similarity.

    Args:
        content: Mapping whose keys are the page names; the i-th key
            corresponds to row/column i of ``similarity_matrix``.
        similarity_matrix: 2-D NumPy array indexed as ``[i, j]``.
        output_file: Path of the GML file to write. Missing parent directories
            are created.
    """
    pages = list(content.keys())

    G = nx.Graph()
    G.add_nodes_from(pages)

    # Add one weighted edge per pair of pages with positive similarity
    for i in range(len(pages)):
        for j in range(i + 1, len(pages)):
            # Convert to a built-in float: the matrix holds NumPy scalars, and
            # the GML writer only serializes built-in numeric types (a
            # float32 scalar is not an instance of float).
            similarity = float(similarity_matrix[i, j])
            if similarity > 0:
                G.add_edge(pages[i], pages[j], weight=similarity)

    # Create the destination folder when the path has a directory component
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the graph in GML format
    nx.write_gml(G, output_file)
    print(f"Graphe de similarité créé : {output_file}")

## Choisis ce que tu veux lancer

Chemin du fichier

In [139]:
if __name__ == '__main__':
    # Input files, given relative to the current working directory.
    content_path = 'content3.json'
    links_path = 'links3.json'
    # load_data raises FileNotFoundError when one of the two files is missing.
    content, links = load_data(content_path, links_path)

Entrainer les modèles

In [140]:
# Train on the page texts. Passing the `content` dict itself would iterate over
# its keys, so both models would be fitted on the titles only.
documents = list(content.values())
model = train_doc2vec_model(documents, vector_size=100, window=2, epochs=40)
tfidf_matrix = train_tfidf_model(documents)

### Basique

Phrases contenant un/des mots

In [None]:
# Print every sentence of the corpus that contains all the listed words.
words_to_find = ['profit', 'growth']
sentences = find_sentences_with_words(content, words_to_find)
for sentence in sentences:
    print(sentence)

Analyse de sentiment

In [None]:
word = "profit"
result = word_sentiment_analysis(content, word)

if isinstance(result, str):
    # word_sentiment_analysis returns a plain message when no sentence
    # contains the word; indexing it like a dict would raise a TypeError.
    print(result)
else:
    print(f"Analyse de sentiment pour le mot '{result['word']}':")
    print(f"Scores moyens : {result['sentiment_scores']}")
    print(f"Sentiment global : {result['overall_sentiment']}")
    print("\nExemples de phrases contenant le mot :")
    for sentence in result["sentences_with_word"][:5]:  # Show at most 5 sentences
        print(f"- {sentence}")

Sauvegarder les tokens

In [67]:
# Base name only: save_to_file writes dicts to output/<name>.json,
# so the tokens end up in output/cleaned_data.json.
output_file = 'cleaned_data'
clean_and_save_tokens(content, output_file)

Bigrammes

In [None]:
# The function defined in this notebook is get_top_bigrams_doc2vec;
# get_top_bigrams does not exist and raised a NameError.
bigrams = get_top_bigrams_doc2vec(content)
print("Top Bigrams:", bigrams)

Tokens fréquents

In [None]:
# Most frequent stemmed tokens of the corpus, as (token, count) pairs (default: top 20).
top_tokens = show_top_tokens(content)
print("Top Tokens:", top_tokens)

Wordcloud

In [None]:
# Saves the figure to output/wordcloud.png (default file name) and displays it.
generate_wordcloud(content)

Generate Tokens Stats

In [None]:
# generate_token_table reads the corpus from the JSON file itself,
# so it takes the content path rather than the loaded `content` dict.
output = 'output/token_table.csv'
generate_token_table(content_path, output)

Similarité entre 2 doc/ avec 1 doc

In [None]:
# Documents are addressed by the string tags assigned in train_doc2vec_model:
# '0' is the first document, '1' the second, and so on.
print("Similarité entre doc 0 et doc 1 :", calculate_similarity(model, '0', '1'))
print("Documents similaires à doc 0 :", find_similar_docs(model, '0'))

### Clusters

Clustering

In [None]:
# Document vectors of the Doc2Vec model trained above, one row per document.
# They are rebuilt from the model here so the cell works on a fresh kernel and
# gives the same result when re-run.
doc_vectors_raw = np.array([model.dv[i] for i in range(len(content))])
doc_vectors = StandardScaler().fit_transform(doc_vectors_raw)
tfidf_vectors = StandardScaler().fit_transform(tfidf_matrix.toarray())
# Combine the two *scaled* representations (the scaled TF-IDF matrix used to
# be computed and then ignored in favour of the raw one).
combined_vectors = np.hstack((doc_vectors, tfidf_vectors))

optimal_k = find_optimal_clusters(combined_vectors, max_clusters=10, min_clusters=3)
print(f"Nombre optimal de clusters : {optimal_k}")

In [None]:
# cluster_documents trains its own Doc2Vec model on the page texts and writes
# its results under output/; the second call writes one JSON file per cluster
# in output/clusters.
clusters, similarity_matrix = cluster_documents(content, optimal_k)
save_clusters_individually(clusters, "content3.json")

In [None]:
# Directory written by save_clusters_individually in the previous cell.
clusters_dir = "output/clusters"
original_links_path = "links3.json"
# Receives one <cluster file stem>_links.json file per cluster file.
output_dir = "cluster_links"

create_links_per_cluster(clusters_dir, original_links_path, output_dir)

JSON to GML

In [None]:
# Use a dedicated name for the path: `links` already holds the link data
# returned by load_data and must not be replaced by a string.
links_file = "links3.json"
# json_to_gml opens its output file for writing, so the folder must exist.
os.makedirs("outputgml", exist_ok=True)
output_file = "outputgml/graph.gml"
json_to_gml(links_file, output_file)

In [None]:
# json_to_gml turns each key into a source node and each list item into a
# target node. Here that gives a page -> token graph (cleaned_data.json maps
# titles to token lists) and a cluster label -> page graph (clusters.json maps
# labels to titles).
output_file1 = "outputgml/tokens_graph.gml"
output_file2 = "outputgml/clusters_graph.gml"
tokens_file = "output/cleaned_data.json"
clusters_file = "output/clusters.json"
json_to_gml(tokens_file, output_file1)
json_to_gml(clusters_file, output_file2)

In [None]:
# One GML graph per cluster, built from the link files written by
# create_links_per_cluster (cluster_links/cluster_<i>_links.json).
# Assumes every label 0..optimal_k-1 has a file -- TODO confirm that no
# cluster came out empty.
for i in range(0,optimal_k):
    json_to_gml(f"cluster_links/cluster_{i}_links.json", f"outputgml/cluster_{i}.gml")