In [94]:
import os
os.environ["OMP_NUM_THREADS"] = '4'

In [106]:
from sklearn.datasets import fetch_20newsgroups
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")
# warnings imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [107]:
#import dataset
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42)
#save labels
labels = dataset.target
#get the unique labels
true_k = np.unique(labels).shape[0]

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

3387 documents
4 categories


In [108]:
# Display a text and its label
index = 5  # Choose an index, e.g., 0 for the first document
text = dataset.data[index]  # Retrieve the text
label = dataset.target[index]  # Retrieve the corresponding label
label_name = dataset.target_names[label]  # Get the human-readable label

print(f"Text:\n{text}\n")
print(f"Label (numerical): {label}")
print(f"Label (category name): {label_name}")


Text:
From: fitz@cse.ogi.edu (Bob Fitzsimmons)
Subject: Re: VGA Graphics Library
Keywords: C, library, graphics
Article-I.D.: ogicse.53715
Organization: Oregon Grad. Inst. Computer Science and Eng., Beaverton
Lines: 26

In article <2054@mwca.UUCP> bill@mwca.UUCP (Bill Sheppard) writes:
>Many high-end graphics cards come with C source code for doing basic graphics
>sorts of things (change colors, draw points/lines/polygons/fills, etc.).  Does
>such a library exist for generic VGA graphics cards/chips, hopefully in the
>public domain?  This would be for the purpose of compiling under a non-DOS
>operating system running on a standard PC.
>

I'm also interested in info both public domain and commercial graphics library 
package to do PC VGA graphics.  

I'm currently working on a realtime application running on a PCC with a 
non-DOS kernel that needs to do some simple graphics.  I'm not sure if 
reentrancy of the graphics library is going to be an issue or not.  
I suspect I'll implement t

* n_samples = le nombre de document
* n_features = le nombre total de termes uniques dans le vocabulaire  

In [109]:
data = dataset.data
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
use_idf=True)
X = tfidf_vectorizer.fit_transform(data)
#The X object is now our input vector which contains the TF-IDF representation of our
#dataset. 
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 3387, n_features: 24545


In [None]:
#Dimensionality Reduction
# Vectorizer results are normalized, which makes KMeans behave better
    # Since LSA/SVD results are not normalized, we have to redo the normalization.

    #If we do not normalize the data, variables with different scaling 
    # will be weighted differently in the distance formula 
    # that is being optimized during training.


n_components = 5 #Sets the number of latent dimensions (topics) to which the data is reduced. 
                  #This controls how much the dimensionality of the dataset is reduced.
# C'est le nombre de dimensions latentes (ou sujets, ou concepts cachés) que l'on souhaite conserver dans les données après la réduction.
# Par exemple, si les données d'origine ont 100 dimensions (colonnes ou caractéristiques), on peut les réduire à seulement 5 dimensions grâce à n_components = 5.

#Performs truncated singular value decomposition (SVD) on the input matrix 
#X to reduce its dimensionality.
svd = TruncatedSVD(n_components)
# C'est une méthode mathématique qui décompose une matrice en trois parties (singularités) pour en extraire l'information importante.
# Le mot "tronquée" signifie que l'on conserve seulement les n_components dimensions les plus importantes et on ignore les autres.

# Imaginez une matrice qui contient des informations sur 1000 documents et 10 000 mots (grande dimensionnalité).
# Après réduction avec TruncatedSVD à 5 dimensions, chaque document sera représenté par seulement 5 chiffres (les concepts principaux), au lieu de 10 000 (les mots).




normalizer = Normalizer(copy=False)
#Combines the SVD and normalization steps into a single pipeline for streamlined processing. 
lsa = make_pipeline(svd, normalizer)
# C'est une fonction qui assemble plusieurs étapes de transformation en une seule "pipeline".
# Cela permet d'appliquer plusieurs transformations aux données, étape par étape, de manière automatisée et ordonnée.



#The final X is the input which we will be using. 
# It has been cleaned, TF-IDF transformed, and its dimensions reduced.
X_reduced = lsa.fit_transform(X)

#scikit-learn offers two implementations of kmeans:
# either in mini-batches or without
minibatch = False
if minibatch:
   km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
   init_size=1000, batch_size=1000)
else:
   km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
   # km = KMeans(n_clusters = 3, random_state = 0, n_init='auto')
km.fit(X_reduced)
# top words per cluster
print("Clustering sparse data with %s" % km)

Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1)


In [None]:
# Ce code effectue une série d'opérations pour identifier les termes représentatifs (mots ou caractéristiques importantes) associés à des clusters formés par un algorithme de clustering

# L'objectif est d'obtenir les termes principaux pour chaque cluster.



original_space_centroids = svd.inverse_transform(km.cluster_centers_)
# svd : C'est l'objet de réduction de dimensionnalité basé sur la décomposition en valeurs singulières tronquées (Truncated SVD).
# km : KMeans de Scikit-learn, qui a déjà identifié des centroïdes représentant les différents clusters.
# Les centroïdes des clusters (km.cluster_centers_) sont dans l'espace réduit (celui produit par svd). Cette ligne applique une transformation inverse pour ramener les centroïdes à l'espace original (avant la réduction de dimensionnalité).
# Pourquoi : Dans l'espace original, les dimensions correspondent aux termes réels (mots), ce qui permet de mieux interpréter les clusters.



order_centroids = original_space_centroids.argsort()[:, ::-1]
# argsort() trie les indices des termes dans chaque cluster selon leurs valeurs dans les centroïdes.
# [:, ::-1] signifie que l'ordre des indices est inversé pour avoir les termes avec les valeurs les plus grandes en premier.
# Pourquoi : Les valeurs les plus grandes indiquent les termes les plus importants ou les plus représentatifs du cluster.


terms = tfidf_vectorizer.get_feature_names_out()
# tfidf_vectorizer : Un objet qui transforme le texte en une matrice TF-IDF, où chaque colonne correspond à un terme (mot).
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])


Cluster 0:
 graphics
 space
 image
 com
 nasa
 university
 posting
 images
 program
 file
Cluster 1:
 god
 people
 com
 don
 say
 jesus
 think
 just
 believe
 morality
Cluster 2:
 space
 henry
 toronto
 nasa
 access
 com
 digex
 pat
 gov
 zoo
Cluster 3:
 sandvik
 kent
 apple
 newton
 com
 god
 jesus
 alink
 ksand
 cookamunga


In [112]:
print("First method:")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000)) #Mesure la séparation entre les clusters en comparant les distances intra- et inter-clusters (valeurs proches de 1 indiquent de bons clusters).
#Note: You might see different results, as machine learning 
# algorithms do not produce the exact same results each time.
# km.predict(X_test) to test our model



First method:
Homogeneity: 0.592
Completeness: 0.669
V-measure: 0.628
Adjusted Rand-Index: 0.624
Silhouette Coefficient: 0.006 


##### Méthode 2 K-means

In [104]:
#imports the KMeans algorithm from the scikit-learn library and 
# creates an instance of it with three clusters, a random state of 0, 
# and automatic initialization
#KMeans algorithm is a clustering algorithm that groups 
# similar data points together based on their distance from each other

#random runs: This affects how the initial cluster centroids are chosen 
#and ensures consistent results across multiple runs.
#n_init=auto: Automatically runs 10 initializations and picks the best one based on inertia (objective function).


kmeans = KMeans(n_clusters = 3, random_state = 0, n_init='auto')
#The fit method is then called on the normalized training data 
# to train the KMeans model on the data.
kmeans.fit(X)
print("Second method:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])


Second method:
Cluster 0:
 space
 graphics
 nasa
 com
 access
 image
 university
 gov
 posting
 host
Cluster 1:
 god
 people
 jesus
 don
 com
 say
 believe
 think
 bible
 just
Cluster 2:
 sgi
 livesey
 keith
 wpd
 solntze
 jon
 com
 caltech
 morality
 moral
Cluster 3:
 sandvik
 kent
 apple
 newton
 com
 jesus
 alink
 ksand
 god
 cookamunga


In [105]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))


Homogeneity: 0.390
Completeness: 0.542
V-measure: 0.453
Adjusted Rand-Index: 0.391
Silhouette Coefficient: 0.007 


##### Naïve Bayes

In [113]:
# ---- 2. Classification supervisée avec Naïve Bayes ----
print("\n=== Naïve Bayes Classification ===")
X_train, X_test, y_train, y_test = train_test_split(X, dataset.target, test_size=0.3, random_state=42)
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Prédictions et évaluation
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))


=== Naïve Bayes Classification ===
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       224
           1       0.95      0.99      0.97       297
           2       0.96      0.98      0.97       307
           3       0.99      0.69      0.81       189

    accuracy                           0.93      1017
   macro avg       0.93      0.91      0.91      1017
weighted avg       0.94      0.93      0.93      1017



##### TFIDF - BOW

In [114]:
# ---- 3. Comparaison Bag of Words vs TF-IDF ----
print("\n=== Comparison: Bag of Words vs TF-IDF ===")

# Vectorisation Bag of Words
bow_vectorizer = CountVectorizer(stop_words='english', max_df=0.5, min_df=2)
X_bow = bow_vectorizer.fit_transform(dataset.data)

# Naïve Bayes avec Bag of Words
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, dataset.target, test_size=0.3, random_state=42)
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train_bow)

# Prédictions et évaluation Bag of Words
y_pred_bow = nb_bow.predict(X_test_bow)
print("Bag of Words Performance:")
print(classification_report(y_test_bow, y_pred_bow))

# Naïve Bayes avec TF-IDF (déjà fait, rappel des résultats pour comparaison)
print("TF-IDF Performance:")
print(classification_report(y_test, y_pred))



=== Comparison: Bag of Words vs TF-IDF ===
Bag of Words Performance:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       224
           1       0.97      0.99      0.98       297
           2       0.98      0.96      0.97       307
           3       0.94      0.85      0.89       189

    accuracy                           0.95      1017
   macro avg       0.94      0.94      0.94      1017
weighted avg       0.95      0.95      0.95      1017

TF-IDF Performance:
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       224
           1       0.95      0.99      0.97       297
           2       0.96      0.98      0.97       307
           3       0.99      0.69      0.81       189

    accuracy                           0.93      1017
   macro avg       0.93      0.91      0.91      1017
weighted avg       0.94      0.93      0.93      1017



In [117]:
# ---- 4. Tester le meilleur modèle sur de nouvelles données ----
print("\n=== Testing Best Model on New Data ===")

new_documents = [
    "The launch of the satellite was a major milestone in space exploration.",
    "Scientists developed a new vaccine that promises to combat emerging viruses.",
    "The new computer graphics card delivers stunning performance."
]

tfidf_vectorizer.fit(dataset.data)
# Utiliser Naïve Bayes (modèle supervisé) sur les nouveaux textes
new_X_tfidf = tfidf_vectorizer.transform(new_documents)
new_predictions = nb.predict(new_X_tfidf)

# Afficher les résultats
for doc, pred in zip(new_documents, new_predictions):
    print(f"Document: {doc}")
    print(f"Predicted Category: {dataset.target_names[pred]}\n")


=== Testing Best Model on New Data ===
Document: The launch of the satellite was a major milestone in space exploration.
Predicted Category: sci.space

Document: Scientists developed a new vaccine that promises to combat emerging viruses.
Predicted Category: sci.space

Document: The new computer graphics card delivers stunning performance.
Predicted Category: comp.graphics

