# Text mining

## Exécute les cellules de cette section pour définir les fonctions

### Config

Import and download

In [None]:
import json
import os
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below (nltk.download performs a download on first use).
# 'stopwords' backs stopwords.words('english') in the configuration cell.
nltk.download('stopwords')
# 'punkt' / 'punkt_tab': presumably the models behind word_tokenize / sent_tokenize
# (which one is needed depends on the installed NLTK version — TODO confirm).
nltk.download('punkt')
# 'vader_lexicon': presumably the lexicon required by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

: 

Configuration

In [None]:
# Tokens ignored during preprocessing: English stop words, single punctuation
# characters, and a few domain words that appear in almost every document.
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS |= set(string.punctuation)
STOP_WORDS |= {'employee', 'organization', 'work', 'job', 'company', "'s"}

# Shared, module-level NLP helpers reused by the functions below.
STEMMER = nltk.stem.SnowballStemmer('english')
SIA = SentimentIntensityAnalyzer()

### Fonctions utilitaires

Chargement des fichiers de contenu et de liens

In [None]:
def load_data(content_path, links_path):
    """Load the content and links JSON files.

    The content file is checked and parsed first; the links file is only
    looked at afterwards.

    Args:
        content_path: Path to the UTF-8 JSON file holding the content.
        links_path: Path to the UTF-8 JSON file holding the links.

    Returns:
        A ``(content, links)`` tuple with the two decoded JSON values.

    Raises:
        FileNotFoundError: If either path does not exist.
    """
    if os.path.exists(content_path):
        with open(content_path, encoding='utf-8') as handle:
            content = json.loads(handle.read())
    else:
        raise FileNotFoundError(f"Le fichier '{content_path}' est introuvable.")

    if os.path.exists(links_path):
        with open(links_path, encoding='utf-8') as handle:
            links = json.loads(handle.read())
    else:
        raise FileNotFoundError(f"Le fichier '{links_path}' est introuvable.")

    return content, links

Sauvegarde des données

In [None]:
def save_content(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON.

    The output is indented by 4 spaces and non-ASCII characters are written
    as-is rather than escaped. An existing file is overwritten.
    """
    json_options = {'ensure_ascii': False, 'indent': 4}
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **json_options)

Traitement du contenu texte

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of stemmed tokens.

    Steps: lower-case the text, delete every character that is not ``a-z`` or
    whitespace, tokenize with ``word_tokenize``, discard tokens that are in
    ``STOP_WORDS`` or have at most two characters, and stem what remains with
    ``STEMMER``.

    Returns:
        The stems, in their order of appearance.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    stems = []
    for word in word_tokenize(letters_only):
        # Filtering happens on the raw token, i.e. before stemming.
        if len(word) <= 2 or word in STOP_WORDS:
            continue
        stems.append(STEMMER.stem(word))
    return stems

Liste des tokens

In [None]:
def tokenize_corpus(corpus):
    """Return one flat list with the preprocessed tokens of every document.

    ``corpus`` is a mapping whose values are the document texts; documents are
    processed in the mapping's iteration order.
    """
    return [
        token
        for document in corpus.values()
        for token in preprocess_text(document)
    ]

### Différents modèles de text mining

Analyse de sentiments

In [None]:
def analyze_word_sentiment(word1, word2, corpus):
    """Classify the sentiment of every sentence that mentions both words.

    Each document in ``corpus`` (a mapping whose values are texts) is split
    into sentences. A sentence is counted when its preprocessed tokens contain
    both query words; its VADER ``compound`` score then decides the class
    (> 0.05 positive, < -0.05 negative, otherwise neutral).

    Args:
        word1: First word to look for.
        word2: Second word to look for.
        corpus: Mapping whose values are the document texts.

    Returns:
        A dict with the keys ``'Words'``, ``'Positive'``, ``'Negative'``,
        ``'Neutral'``, ``'Total'``, ``'Positive Ratio'`` and
        ``'Negative Ratio'``. Ratios are percentage strings and are ``'0%'``
        when no sentence matched.
    """
    def _variants(word):
        # preprocess_text yields lower-cased stems, so a raw query such as
        # "Profits" would never match. The untouched word is kept as well so
        # callers that already pass a stem match exactly as before.
        return {word, STEMMER.stem(word.lower())}

    def _ratio(count):
        return f"{(count / total * 100):.0f}%" if total else '0%'

    targets1, targets2 = _variants(word1), _variants(word2)
    positive = negative = neutral = 0

    for content in corpus.values():
        for sentence in sent_tokenize(content):
            tokens = set(preprocess_text(sentence))
            if not (tokens & targets1 and tokens & targets2):
                continue
            # Sentiment is scored on the original sentence, not on the stems.
            compound = SIA.polarity_scores(sentence)['compound']
            if compound > 0.05:
                positive += 1
            elif compound < -0.05:
                negative += 1
            else:
                neutral += 1

    total = positive + negative + neutral
    return {
        'Words': (word1, word2),
        'Positive': positive,
        'Negative': negative,
        'Neutral': neutral,
        'Total': total,
        'Positive Ratio': _ratio(positive),
        'Negative Ratio': _ratio(negative),
    }

Bigrammes

In [None]:
def get_top_bigrams(corpus, freq_filter=5, top_n=10):
    """Return the ``top_n`` bigrams of the corpus ranked by likelihood ratio.

    Bigrams are built over the flat token list of the whole corpus, and
    ``freq_filter`` is passed to the finder's ``apply_freq_filter`` before
    scoring.
    """
    measures = BigramAssocMeasures()
    collocations = BigramCollocationFinder.from_words(tokenize_corpus(corpus))
    collocations.apply_freq_filter(freq_filter)
    return collocations.nbest(measures.likelihood_ratio, top_n)

Analyse des tokens

In [None]:
def show_top_tokens(corpus, top_n=20):
    """Return the ``top_n`` most frequent preprocessed tokens with their counts."""
    token_counts = nltk.FreqDist(tokenize_corpus(corpus))
    return token_counts.most_common(top_n)

Wordcloud

In [None]:
def generate_wordcloud(corpus):
    """Display a word cloud built from the preprocessed tokens of the corpus.

    The cloud is limited to 30 words on a white background and is shown with
    matplotlib; nothing is returned.
    """
    joined_tokens = ' '.join(tokenize_corpus(corpus))
    cloud_options = dict(
        background_color='white',
        stopwords=STOP_WORDS,
        max_words=30,
        min_font_size=10,
    )
    cloud = WordCloud(**cloud_options).generate(joined_tokens)

    plt.figure(figsize=(8, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

Clustering

In [None]:
def calculate_cosine_similarity(content):
    """Compute the pairwise cosine-similarity matrix of all documents.

    ``content`` is a mapping whose values are the document texts. The texts are
    vectorized with TF-IDF (``max_df=0.5``, ``min_df=2``, English stop words)
    and the result of ``cosine_similarity`` on that matrix is returned.
    """
    texts = list(content.values())
    tfidf_matrix = TfidfVectorizer(
        max_df=0.5, min_df=2, stop_words='english'
    ).fit_transform(texts)
    return cosine_similarity(tfidf_matrix)

def cluster_documents(content, num_clusters=3, random_state=42):
    """Cluster the documents with KMeans on their TF-IDF vectors.

    Args:
        content: Mapping from document key to document text.
        num_clusters: Number of KMeans clusters.
        random_state: Seed forwarded to ``KMeans``. A fixed default makes the
            clustering reproducible across runs (with ``n_init=1`` an unseeded
            run can change from one execution to the next); pass ``None`` to
            get the unseeded behaviour.

    Returns:
        A ``(clusters, similarity_matrix)`` tuple. ``clusters`` maps each
        integer cluster label to the list of document keys assigned to it, and
        ``similarity_matrix`` is the pairwise cosine similarity of the TF-IDF
        vectors.
    """
    document_keys = list(content.keys())
    documents = list(content.values())

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
    X_tfidf = vectorizer.fit_transform(documents)

    kmeans = KMeans(
        n_clusters=num_clusters,
        max_iter=100,
        n_init=1,
        random_state=random_state,
    )
    kmeans.fit(X_tfidf)

    # Keys and labels are aligned by position, so pair them directly instead of
    # rebuilding the key list for every document.
    clusters = {}
    for document_key, label in zip(document_keys, kmeans.labels_):
        clusters.setdefault(int(label), []).append(document_key)

    # Reuse the TF-IDF matrix fitted above (same vectorizer settings as
    # calculate_cosine_similarity) rather than vectorizing the corpus again.
    similarity_matrix = cosine_similarity(X_tfidf)

    return clusters, similarity_matrix

def display_cluster_content(clusters, max_docs=3):
    """Print up to ``max_docs`` document keys for each cluster.

    Args:
        clusters: Mapping from cluster label to the list of document keys.
        max_docs: Maximum number of keys printed per cluster. A ``...`` line
            is printed after them when the cluster holds more.
    """
    # The header reports the limit actually in use rather than a fixed "3".
    print(f"Titres des documents par cluster (maximum {max_docs} documents par cluster) :")
    for cluster_label, docs in clusters.items():
        print(f"\nCluster {cluster_label}:")
        for doc in docs[:max_docs]:
            print(f"  - {doc}")
        if len(docs) > max_docs:
            print("  ...")  # signals that the cluster contains more documents

def display_cluster_info(clusters, similarity_matrix):
    """Print the size of every cluster followed by the similarity matrix.

    Args:
        clusters: Mapping from cluster label to the list of document keys.
        similarity_matrix: Object printed as-is after the cluster sizes.
    """
    print("Nombre de documents par cluster :")
    for cluster_label in clusters:
        docs = clusters[cluster_label]
        print(f"Cluster {cluster_label}: ({len(docs)} documents)")

    print("\nMatrice de Similarité par le Cosinus :")
    print(similarity_matrix)

## Choisis ce que tu veux lancer

Chemins des fichiers (contenu et liens)

In [None]:
if __name__ == '__main__':
    # load_data needs two paths and returns a (content, links) tuple; the
    # cells below expect `corpus` to be the content mapping itself.
    file_path = 'content1.json'
    content_path = file_path
    # NOTE(review): the links file name is not given anywhere in this notebook;
    # 'links1.json' is assumed by analogy with 'content1.json' — confirm.
    links_path = 'links1.json'
    content, links = load_data(content_path, links_path)
    corpus = content

Analyse de sentiment

In [None]:
# Sentiment of the sentences that mention both 'profit' and 'growth'.
sentiment_result = analyze_word_sentiment(word1='profit', word2='growth', corpus=corpus)
print(sentiment_result)

Bigrammes

In [None]:
# Strongest bigrams of the corpus, using the function's default thresholds.
bigrams = get_top_bigrams(corpus=corpus)
print(f"Top Bigrams: {bigrams}")

Tokens fréquents

In [None]:
# Most frequent stemmed tokens with their counts.
top_tokens = show_top_tokens(corpus=corpus)
print(f"Top Tokens: {top_tokens}")

Wordcloud

In [None]:
# Render the word cloud for the whole corpus.
generate_wordcloud(corpus=corpus)

Clustering

In [None]:
# `corpus` is the name the other cells of this notebook use for the loaded
# documents; the previous `content` was never defined at notebook level.
clusters, similarity_matrix = cluster_documents(corpus, num_clusters=5)

# Show the cluster sizes and the similarity matrix
display_cluster_info(clusters, similarity_matrix)

# Show the document titles of each cluster (at most 5 documents per cluster)
display_cluster_content(clusters, max_docs=5)