# Text mining

## Appuie pour faire tourner les fonctions

### Config

Les modules à installer se trouvent dans requirements.txt

Import and download

In [46]:
import json
import os
import re
import string
import nltk 
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages this notebook relies on: the stop-word
# lists, the tokenizer models ('punkt' / 'punkt_tab') and the VADER lexicon.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ducar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ducar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ducar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ducar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Configuration

In [47]:
# English stop words plus every single ASCII punctuation character.
STOP_WORDS = set(stopwords.words('english')) | set(string.punctuation)
# Extra terms filtered out on top of the generic list, plus the possessive token "'s".
STOP_WORDS.update(['employee', 'organization', 'work', 'job', 'company', "'s"])
# English Snowball stemmer shared by the preprocessing functions.
STEMMER = nltk.stem.SnowballStemmer('english')
# Single sentiment analyzer instance reused for every scored sentence.
SIA = SentimentIntensityAnalyzer()

### Fonctions utilitaires

Chargement des fichiers de contenu et de liens

In [48]:
def load_data(content_path, links_path):
    """Load the content file and the links file, both UTF-8 encoded JSON.

    The files are handled one after the other (content first): each one is
    checked for existence and then parsed before the next one is looked at.

    Args:
        content_path: Path of the JSON file holding the content.
        links_path: Path of the JSON file holding the links.

    Returns:
        A ``(content, links)`` tuple with the two parsed JSON payloads.

    Raises:
        FileNotFoundError: If one of the two paths does not exist.
    """
    parsed = []
    for path in (content_path, links_path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Le fichier '{path}' est introuvable.")
        with open(path, 'r', encoding='utf-8') as handle:
            parsed.append(json.load(handle))

    content, links = parsed
    return content, links

Sauvegarde des données

In [49]:
def save_content(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, overwriting the file.

    The file is written as UTF-8, non-ASCII characters are kept as-is
    (not escaped) and the output is indented with four spaces.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **dump_options)

Traitement du contenu texte

In [50]:
def preprocess_text(text):
    """Turn raw text into a list of stemmed, filtered tokens.

    Steps: lower-case the text, blank out every character that is neither
    an ASCII letter nor whitespace, tokenize, drop stop words and tokens of
    two characters or fewer, then stem what is left.

    Args:
        text: Raw document text.

    Returns:
        List of stemmed tokens, in document order.
    """
    lowered = text.lower()
    # Replace non-letters with a space instead of deleting them: deletion
    # fuses words that are only separated by punctuation
    # ("growth.Profit" -> "growthprofit", "cost-effective" -> "costeffective")
    # and turns contractions into tokens the stop-word list cannot catch
    # ("don't" -> "dont"). With a space, the fragments are either real words
    # or short leftovers removed by the length filter below.
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return [
        STEMMER.stem(token)
        for token in word_tokenize(letters_only)
        if len(token) > 2 and token not in STOP_WORDS
    ]

Liste des tokens

In [51]:
def tokenize_corpus(corpus):
    """Collect the preprocessed tokens of every document into one flat list.

    Args:
        corpus: Mapping whose values are the document texts; the keys are
            not used here.

    Returns:
        A single list containing the tokens of all documents, in the
        iteration order of ``corpus.values()``.
    """
    return [
        token
        for document in corpus.values()
        for token in preprocess_text(document)
    ]

### Différents modèles de text mining

Traitement de texte spécifique à SIA, pour éviter de supprimer les "not" et autres

In [57]:
def preprocess_SIA(text):
    """Tokenize text for the sentiment step, without stop-word removal or stemming.

    The text is lower-cased, every character that is neither an ASCII
    letter nor whitespace is blanked out, and the result is tokenized.
    Nothing is filtered, so words such as "not" are kept.

    Args:
        text: Raw text (typically one sentence).

    Returns:
        List of lower-case tokens.
    """
    lowered = text.lower()
    # Substitute a space rather than the empty string so that words joined
    # only by punctuation ("profit-growth", "growth.Profit") remain separate
    # tokens and can still be matched individually.
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return word_tokenize(letters_only)

Analyse de sentiments

In [58]:
def analyze_word_sentiment(word1, word2, corpus, threshold=0.02):
    """Count positive/negative/neutral sentences that mention both words.

    Every document is split into sentences; a sentence is scored only when
    both query words appear among its tokens. The compound score of the
    sentiment analyzer decides the class of the sentence.

    Args:
        word1: First word to look for (matched case-insensitively).
        word2: Second word to look for (matched case-insensitively).
        corpus: Mapping whose values are the document texts.
        threshold: Absolute compound score above which a sentence counts as
            positive (``> threshold``) or negative (``< -threshold``);
            anything in between is neutral. Defaults to 0.02.

    Returns:
        Dict with the keys 'Words', 'Positive', 'Negative', 'Neutral',
        'Total', 'Positive Ratio' and 'Negative Ratio'. The ratios are
        strings such as "42%", or "0%" when no sentence matched.
    """
    # Sentence tokens are lower-cased by preprocess_SIA, so the query words
    # must be lower-cased too; otherwise 'Profit' would never match.
    needle1, needle2 = word1.lower(), word2.lower()

    positive = negative = neutral = 0
    for content in corpus.values():
        for sentence in sent_tokenize(content):
            tokens = set(preprocess_SIA(sentence))
            if needle1 not in tokens or needle2 not in tokens:
                continue
            # The raw sentence (not the token list) is scored.
            compound = SIA.polarity_scores(sentence)['compound']
            if compound > threshold:
                positive += 1
            elif compound < -threshold:
                negative += 1
            else:
                neutral += 1

    total = positive + negative + neutral
    if total > 0:
        positive_ratio = f"{(positive / total * 100):.0f}%"
        negative_ratio = f"{(negative / total * 100):.0f}%"
    else:
        # No matching sentence: avoid dividing by zero.
        positive_ratio = negative_ratio = "0%"

    return {
        'Words': (word1, word2),
        'Positive': positive,
        'Negative': negative,
        'Neutral': neutral,
        'Total': total,
        'Positive Ratio': positive_ratio,
        'Negative Ratio': negative_ratio
    }

Bigrammes

In [59]:
def get_top_bigrams(corpus, freq_filter=5, top_n=10):
    """Return the strongest bigram collocations of the whole corpus.

    Args:
        corpus: Mapping whose values are the document texts.
        freq_filter: Minimum frequency passed to the finder's frequency
            filter.
        top_n: Number of bigrams to return.

    Returns:
        The ``top_n`` best bigrams according to the likelihood-ratio
        measure, as returned by the collocation finder.
    """
    collocations = BigramCollocationFinder.from_words(tokenize_corpus(corpus))
    collocations.apply_freq_filter(freq_filter)
    scorer = BigramAssocMeasures().likelihood_ratio
    return collocations.nbest(scorer, top_n)

Analyse des tokens

In [60]:
def show_top_tokens(corpus, top_n=20):
    """Return the most frequent preprocessed tokens of the corpus.

    Args:
        corpus: Mapping whose values are the document texts.
        top_n: Number of tokens to return.

    Returns:
        List of ``(token, count)`` pairs, most frequent first.
    """
    token_counts = nltk.FreqDist(tokenize_corpus(corpus))
    return token_counts.most_common(top_n)

Wordcloud

In [61]:
def generate_wordcloud(corpus):
    """Draw a word cloud built from the preprocessed tokens of the corpus.

    The tokens of all documents are joined with spaces and handed to
    WordCloud (white background, at most 30 words, minimum font size 10);
    the resulting image is shown with matplotlib, axes hidden.

    Args:
        corpus: Mapping whose values are the document texts.
    """
    joined_tokens = ' '.join(tokenize_corpus(corpus))
    cloud_builder = WordCloud(
        background_color='white',
        stopwords=STOP_WORDS,
        max_words=30,
        min_font_size=10,
    )
    cloud_image = cloud_builder.generate(joined_tokens)

    plt.figure(figsize=(8, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()

### Clustering

Sauvegarder les données dans des fichiers du dossier `output` (JSON pour les dictionnaires, texte pour les tableaux NumPy et les listes)

In [72]:
os.makedirs('output', exist_ok=True)

def save_to_file(data, filename):
    """Write ``data`` into the ``output`` directory, in a format chosen by its type.

    * ``numpy.ndarray`` -> ``output/<filename>.txt`` via ``np.savetxt``,
      four decimals per value.
    * ``dict`` -> ``output/<filename>.json``, UTF-8, indented, non-ASCII kept.
    * ``list`` / ``tuple`` -> ``output/<filename>.txt``, one item per line.

    Args:
        data: Object to save.
        filename: File name without extension.

    Raises:
        TypeError: If ``data`` is none of the supported types. Previously
            such a call returned silently without writing anything.
    """
    # Make sure the directory exists at call time, not only when the cell
    # was executed (it may have been removed or the working directory may
    # have changed since then).
    os.makedirs('output', exist_ok=True)

    if isinstance(data, np.ndarray):
        np.savetxt(f'output/{filename}.txt', data, fmt='%.4f')
    elif isinstance(data, dict):
        with open(f'output/{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
    elif isinstance(data, (list, tuple)):
        with open(f'output/{filename}.txt', 'w', encoding='utf-8') as f:
            # Formatting through an f-string also accepts non-string items,
            # which `line + '\n'` would reject with a TypeError mid-write.
            f.writelines(f'{line}\n' for line in data)
    else:
        raise TypeError(
            f"Type de données non pris en charge : {type(data).__name__}"
        )


Matrice de similarité par cosinus

In [73]:

def calculate_cosine_similarity(content):
    """Compute and save the pairwise cosine similarity of the documents.

    The document texts are vectorized with TF-IDF (``max_df=0.5``,
    ``min_df=2``, English stop words); the resulting similarity matrix is
    saved under the name 'similarity_matrix' through ``save_to_file``.

    Args:
        content: Mapping whose values are the document texts.

    Returns:
        The cosine similarity matrix of the TF-IDF vectors.
    """
    documents = list(content.values())
    tfidf_matrix = TfidfVectorizer(
        max_df=0.5, min_df=2, stop_words='english'
    ).fit_transform(documents)

    pairwise_scores = cosine_similarity(tfidf_matrix)
    save_to_file(pairwise_scores, 'similarity_matrix')
    return pairwise_scores


Calculer le nombre optimal de clusters 
- par la méthode du coude

In [74]:
def find_optimal_clusters(content, max_k=10):
    """Estimate the number of clusters with the elbow method.

    KMeans is fitted on the TF-IDF vectors of the documents for every
    k from 1 up to ``max_k`` (capped at the number of documents), the
    inertia of each fit is plotted and saved to 'output/elbow_method.png',
    and the k at the sharpest bend of the curve is returned.

    Args:
        content: Mapping whose values are the document texts.
        max_k: Largest number of clusters to try.

    Returns:
        The selected number of clusters as a Python ``int``.

    Raises:
        ValueError: If ``max_k`` is smaller than 1.
    """
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
    X_tfidf = vectorizer.fit_transform(list(content.values()))

    # KMeans cannot build more clusters than there are samples, so never
    # try a k larger than the number of documents.
    upper_k = min(max_k, X_tfidf.shape[0])
    if upper_k < 1:
        raise ValueError("max_k doit être supérieur ou égal à 1.")

    candidate_ks = list(range(1, upper_k + 1))
    distortions = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, max_iter=100, n_init=5, random_state=42)
        kmeans.fit(X_tfidf)
        distortions.append(kmeans.inertia_)

    # Create the output directory if it does not exist yet.
    os.makedirs('output', exist_ok=True)

    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, distortions, marker='o')
    plt.xlabel('Nombre de Clusters')
    plt.ylabel('Distorsion')
    plt.title('Méthode du Coude')
    plt.savefig('output/elbow_method.png')
    plt.show()

    # A bend needs at least three points; with one or two candidates the
    # largest k that was tried is the only sensible answer.
    if len(distortions) < 3:
        return len(distortions)

    # The elbow is where the decrease of the inertia slows down the most,
    # i.e. where the second difference is largest. The previous rule
    # (argmin of the first difference) selects the steepest drop, which on a
    # convex inertia curve is always the step from k=1 to k=2.
    # Entry i of the second difference is centred on distortions[i + 1],
    # which belongs to k = i + 2.
    curvature = np.diff(distortions, n=2)
    return int(curvature.argmax()) + 2

Clusters

In [75]:
def cluster_documents(content, num_clusters):
    """Cluster the documents with KMeans on their TF-IDF vectors.

    The cluster assignment is saved under the name 'clusters' and the
    cosine similarity matrix is computed (and saved) through
    ``calculate_cosine_similarity``.

    Args:
        content: Mapping from document key to document text.
        num_clusters: Number of clusters to build.

    Returns:
        A ``(clusters, similarity_matrix)`` tuple, where ``clusters`` maps
        each integer cluster label to the list of document keys assigned
        to it.
    """
    # Take keys and values once; both views follow the same order, so the
    # i-th key belongs to the i-th document (and the i-th label below).
    document_keys = list(content.keys())
    documents = list(content.values())

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
    X_tfidf = vectorizer.fit_transform(documents)

    # Run KMeans on the TF-IDF matrix.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=100, n_init=10, random_state=42)
    kmeans.fit(X_tfidf)

    clusters = {}
    for document_key, label in zip(document_keys, kmeans.labels_):
        # int() turns the NumPy label into a plain int usable as a JSON key.
        clusters.setdefault(int(label), []).append(document_key)

    # Save the clusters.
    save_to_file(clusters, 'clusters')

    # Compute and save the similarity matrix.
    similarity_matrix = calculate_cosine_similarity(content)

    return clusters, similarity_matrix

Sauvegarde les titres complets des documents dans chaque cluster dans un fichier

In [76]:
def display_cluster_content(clusters, content):
    """Save a plain-text listing of the document keys of each cluster.

    For every cluster the listing holds a 'Cluster <label>:' header, one
    indented bullet per document key and a trailing empty line. The lines
    are saved under the name 'cluster_titles' through ``save_to_file``;
    nothing is printed or returned.

    Args:
        clusters: Mapping from cluster label to the list of document keys.
        content: Not used by this function.
    """
    report = []
    for label, members in clusters.items():
        report += [
            f'Cluster {label}:',
            *(f'  - {member}' for member in members),
            '',
        ]

    save_to_file(report, 'cluster_titles')

## Choisis ce que tu veux lancer

Chemin du fichier

In [77]:
# Load the corpus and its links from two JSON files given as relative paths.
# `content` is the input of every analysis cell below; `links` is loaded
# but not used by the cells visible in this notebook.
if __name__ == '__main__':
    content_path = 'content3.json'
    links_path = 'links3.json'
    content, links = load_data(content_path, links_path)

Analyse de sentiment

In [None]:
# Sentiment of the sentences that contain both 'profit' and 'growth'.
sentiment_result = analyze_word_sentiment('profit', 'growth', content)
print(sentiment_result)

Bigrammes

In [None]:
# Top bigrams of the corpus, using the default frequency filter and count.
bigrams = get_top_bigrams(content)
print("Top Bigrams:", bigrams)

Tokens fréquents

In [None]:
# Most frequent preprocessed tokens of the corpus (default: top 20).
top_tokens = show_top_tokens(content)
print("Top Tokens:", top_tokens)

Wordcloud

In [None]:
# Display the word cloud of the whole corpus.
generate_wordcloud(content)

Clustering

In [None]:
# Pick the number of clusters with the elbow method, trying k = 1..10.
num_clusters = find_optimal_clusters(content, max_k=10)
print(f'Nombre optimal de clusters : {num_clusters}')

# Cluster with the selected number of clusters, then save the per-cluster listing.
clusters, similarity_matrix = cluster_documents(content, num_clusters)
display_cluster_content(clusters, content)