In [None]:
import spacy
from nltk.corpus import stopwords

In [None]:
# Charger le modèle spaCy
nlp = spacy.load("en_core_web_sm")  # ou "en_core_web_sm" pour l'anglais
stop_words = set(stopwords.words("english"))  # Changez pour "english" si nécessaire

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and token.text.lower() not in stop_words]
    return " ".join(tokens)



In [2]:
# Exemple
paragraph1 = """ Machine learning (ML) is a branch of computer science that focuses on using data and 
                 algorithms to enable AI to imitate the way that humans learn, gradually improving its accuracy.
                 A Decision Process: In general, machine learning algorithms are used to make a prediction or
                 classification. Based on some input data, which can be labeled or unlabeled, your algorithm will 
                 produce an estimate about a pattern in the data.
                 An Error Function: An error function evaluates the prediction of the model. If there are known examples, 
                 an error function can make a comparison to assess the accuracy of the model.
                 A Model Optimization Process: If the model can fit better to the data points in the training set, 
                 then weights are adjusted to reduce the discrepancy between the known example and the model estimate. 
                 The algorithm will repeat this iterative “evaluate and optimize” process, updating weights autonomously 
                 until a threshold of accuracy has been met.
"""

paragraph2 = """ A neural network is a machine learning program, or model, that makes decisions in a manner similar to 
                 the human brain, by using processes that mimic the way biological neurons work together to identify 
                 phenomena, weigh options and arrive at conclusions. 
                 Every neural network consists of layers of nodes, or artificial neurons—an input layer, one or more 
                 hidden layers, and an output layer. Each node connects to others, and has its own associated weight 
                 and threshold. If the output of any individual node is above the specified threshold value, that node 
                 is activated, sending data to the next layer of the network. Otherwise, no data is passed along to the 
                 next layer of the network.
                 Neural networks rely on training data to learn and improve their accuracy over time. Once they are 
                 fine-tuned for accuracy, they are powerful tools in computer science and artificial intelligence, 
                 allowing us to classify and cluster data at a high velocity. Tasks in speech recognition or image 
                 recognition can take minutes versus hours when compared to the manual identification by human experts. 
                 One of the best-known examples of a neural network is Google’s search algorithm. 
                 Neural networks are sometimes called artificial neural networks (ANNs) or 
                 simulated neural networks (SNNs). They are a subset of machine learning, and at the heart of 
                 deep learning models. 
"""
paragraph3 = """ Deep learning is a subset of machine learning that uses multilayered neural networks, 
                 called deep neural networks, to simulate the complex decision-making power of the human brain. 
                 Some form of deep learning powers most of the artificial intelligence (AI) applications in our 
                 lives today.
                 The chief difference between deep learning and machine learning is the structure of the underlying 
                 neural network architecture. “Nondeep,” traditional machine learning models use simple neural networks 
                 with one or two computational layers. Deep learning models use three or more layers—but typically 
                 hundreds or thousands of layers—to train the models.
                 While supervised learning models require structured, labeled input data to make accurate outputs, 
                 deep learning models can use unsupervised learning. With unsupervised learning, deep learning models 
                 can extract the characteristics, features and relationships they need to make accurate outputs from raw,
                 unstructured data. Additionally, these models can even evaluate and refine their outputs for increased 
                 precision.
                 Deep learning is an aspect of data science that drives many applications and services that improve 
                 automation, performing analytical and physical tasks without human intervention. This enables many 
                 everyday products and services—such as digital assistants, voice-enabled TV remotes, credit card fraud 
                 detection, self-driving cars and generative AI.   
"""

corpus = [paragraph1, paragraph2, paragraph3]

cleaned_corpus = [preprocess_text(doc) for doc in corpus]

### Num_topic = 3

In [3]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Préparation des données
texts = [doc.split() for doc in cleaned_corpus]
dictionary = Dictionary(texts)
corpus_gensim = [dictionary.doc2bow(text) for text in texts]
num_topics = 3

# Modèle LDA
lda_model = LdaModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics)

# Modèle LSA
lsa_model = LsiModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics)

# Affichage des thèmes
print("Thèmes LDA :")
for idx, topic in lda_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")

print("\nThèmes LSA :")
for idx, topic in lsa_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")


Thèmes LDA :
Thème 0: ['learning', 'model', 'network', 'neural', 'deep', 'layer', 'machine', 'use', 'datum', 'make']
Thème 1: ['learning', 'model', 'datum', 'network', 'neural', 'layer', 'algorithm', 'use', 'accuracy', 'make']
Thème 2: ['network', 'neural', 'datum', 'layer', 'model', 'artificial', 'learning', 'accuracy', 'output', 'machine']

Thèmes LSA :
Thème 0: ['learning', 'network', 'model', 'neural', 'layer', 'deep', 'datum', 'use', 'machine', 'output']
Thème 1: ['network', 'learning', 'neural', 'layer', 'deep', 'model', 'node', 'artificial', 'next', 'neuron']
Thème 2: ['algorithm', 'accuracy', 'model', 'error', 'estimate', 'prediction', 'function', 'datum', 'example', 'weight']


In [4]:
# Calcul de la cohérence pour LDA
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Cohérence LDA : {coherence_lda}")

# Calcul de la cohérence pour LSA
coherence_model_lsa = CoherenceModel(model=lsa_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print(f"Cohérence LSA : {coherence_lsa}")


Cohérence LDA : 0.4604710729205274
Cohérence LSA : 0.8405789185965551


### Num_topic = 7

In [5]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Préparation des données
texts = [doc.split() for doc in cleaned_corpus]
dictionary = Dictionary(texts)
corpus_gensim = [dictionary.doc2bow(text) for text in texts]
num_topics = 7

# Modèle LDA
lda_model = LdaModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)

# Modèle LSA
lsa_model = LsiModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics)

#  Affichage des thèmes
print("Thèmes LDA :")
for idx, topic in lda_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")

print("\nThèmes LSA :")
for idx, topic in lsa_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")


Thèmes LDA :
Thème 0: ['learning', 'model', 'deep', 'network', 'neural', 'use', 'layer', 'datum', 'output', 'machine']
Thème 1: ['learning', 'model', 'deep', 'datum', 'network', 'make', 'human', 'use', 'output', 'algorithm']
Thème 2: ['network', 'neural', 'learning', 'layer', 'datum', 'model', 'artificial', 'output', 'neuron', 'node']
Thème 3: ['model', 'algorithm', 'datum', 'accuracy', 'learning', 'use', 'make', 'evaluate', 'estimate', 'process']
Thème 4: ['learning', 'model', 'use', 'datum', 'deep', 'algorithm', 'make', 'neural', 'machine', 'accuracy']
Thème 5: ['network', 'neural', 'layer', 'datum', 'learning', 'node', 'artificial', 'model', 'machine', 'output']
Thème 6: ['learning', 'datum', 'model', 'network', 'layer', 'algorithm', 'use', 'neural', 'know', 'accuracy']

Thèmes LSA :
Thème 0: ['learning', 'network', 'model', 'neural', 'layer', 'deep', 'datum', 'use', 'machine', 'output']
Thème 1: ['network', 'learning', 'neural', 'layer', 'deep', 'model', 'node', 'artificial', 'next

In [6]:
# Calcul de la cohérence pour LDA
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Cohérence LDA : {coherence_lda}")

# Calcul de la cohérence pour LSA
coherence_model_lsa = CoherenceModel(model=lsa_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print(f"Cohérence LSA : {coherence_lsa}")


Cohérence LDA : 0.440094150629418
Cohérence LSA : 0.8384286305451871


### Num_topic = 2

In [7]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Préparation des données
texts = [doc.split() for doc in cleaned_corpus]
dictionary = Dictionary(texts)
corpus_gensim = [dictionary.doc2bow(text) for text in texts]
num_topics = 2

# Modèle LDA
lda_model = LdaModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)

# Modèle LSA
lsa_model = LsiModel(corpus=corpus_gensim, id2word=dictionary, num_topics=num_topics)

#  Affichage des thèmes
print("Thèmes LDA :")
for idx, topic in lda_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")

print("\nThèmes LSA :")
for idx, topic in lsa_model.show_topics(num_topics=num_topics, formatted=False):
    print(f"Thème {idx}: {[word for word, _ in topic]}")


Thèmes LDA :
Thème 0: ['learning', 'network', 'neural', 'model', 'layer', 'datum', 'deep', 'use', 'machine', 'output']
Thème 1: ['model', 'algorithm', 'datum', 'accuracy', 'learning', 'make', 'use', 'process', 'know', 'example']

Thèmes LSA :
Thème 0: ['learning', 'network', 'model', 'neural', 'layer', 'deep', 'datum', 'use', 'machine', 'output']
Thème 1: ['network', 'learning', 'neural', 'layer', 'deep', 'model', 'node', 'artificial', 'next', 'neuron']


In [8]:
# Calcul de la cohérence pour LDA
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Cohérence LDA : {coherence_lda}")

# Calcul de la cohérence pour LSA
coherence_model_lsa = CoherenceModel(model=lsa_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print(f"Cohérence LSA : {coherence_lsa}")


Cohérence LDA : 0.4618991753139834
Cohérence LSA : 0.9195729277570108


### Visualisation

In [9]:
import pyLDAvis.gensim
lda_vis = pyLDAvis.gensim.prepare(lda_model, corpus_gensim, dictionary)
pyLDAvis.save_html(lda_vis, 'lda_visualization.html')