In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import re
import string

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\linai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\linai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# 1. LOAD AND PREPROCESS DATA
# Assume spam_ham_dataset is a CSV file with 'text' and 'label' columns
# 'label': 1 = spam, 0 = not spam
data = pd.read_csv("spam_ham_dataset.csv")  
data

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [4]:
data = data.drop(['Unnamed: 0','label'], axis=1)
data

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


In [5]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub("\d+", "", text)  # Remove numbers
    words = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# Apply preprocessing
data['clean_text'] = data['text'].apply(preprocess_text)
data

Unnamed: 0,text,label_num,clean_text
0,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january see attached file hpln...
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonderful...
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,Subject: re : indian springs\r\nthis deal is t...,0,subject indian springs deal book teco pvr reve...
...,...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0,subject put ft transport volumes decreased con...
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,subject following noms hpl take extra mmcf wee...
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0,subject calpine daily gas nomination julie men...
5169,Subject: industrial worksheets for august 2000...,0,subject industrial worksheets august activity ...


In [None]:
data['text'].iloc[0]

In [23]:
data['label_num'].value_counts()

label_num
0    3672
1    1499
Name: count, dtype: int64

In [6]:
# Train-test split
X = data['clean_text']
y = data['label_num']  # Use label_num as the target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

#### BoW

In [7]:
# 2. VECTORIZATION AND MODELLING

## A. Bag of Words (BOW)
print("--- Bag of Words (BOW) ---")
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

# Model with LogisticRegression
model_bow = LogisticRegression(max_iter=1000)
model_bow.fit(X_train_bow, y_train)
print("\nBOW Results:")
evaluate_model(model_bow, X_test_bow, y_test)

--- Bag of Words (BOW) ---

BOW Results:
Accuracy: 0.9806763285024155
Confusion Matrix:
 [[727  15]
 [  5 288]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       742
           1       0.95      0.98      0.97       293

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [None]:
''' 
donne des résultats solides, surtout pour des textes courts
Avantage : Simple et rapide à mettre en œuvre.
Inconvénient : Ne capture pas la sémantique (le sens des mots) et ne tient pas compte de l'ordre des mots.

'''

#### TF-IDF

In [8]:
## B. TF-IDF
print("\n--- TF-IDF ---")
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

# Model with LogisticRegression
model_tfidf = LogisticRegression(max_iter=1000)
model_tfidf.fit(X_train_tfidf, y_train)
print("\nTF-IDF Results:")
evaluate_model(model_tfidf, X_test_tfidf, y_test)


--- TF-IDF ---

TF-IDF Results:
Accuracy: 0.9855072463768116
Confusion Matrix:
 [[731  11]
 [  4 289]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.96      0.99      0.97       293

    accuracy                           0.99      1035
   macro avg       0.98      0.99      0.98      1035
weighted avg       0.99      0.99      0.99      1035



In [None]:
'''     
TF-IDF pondère les mots importants, ce qui améliore la performance.
Avantage : Réduit l'impact des mots fréquents sans signification
Inconvénient : comme BoW, basé sur des mots individuels, pas sur leur contexte.

'''

#### Word2Vec

In [42]:
## C. Word2Vec
print("\n--- Word2Vec ---")
X_train_tokens = [text.split() for text in X_train]
X_test_tokens = [text.split() for text in X_test]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=350, window=10, min_count=1, workers=4)
w2v_model.train(X_train_tokens, total_examples=w2v_model.corpus_count, epochs=150)

# Function to convert text to Word2Vec vector
def text_to_w2v(tokens, model):
    vector = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vector, axis=0) if vector else np.zeros(100)

X_train_w2v = np.array([text_to_w2v(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v = np.array([text_to_w2v(tokens, w2v_model) for tokens in X_test_tokens])

# Model with LogisticRegression
model_w2v = LogisticRegression(max_iter=500000)
model_w2v.fit(X_train_w2v, y_train)
print("\nWord2Vec Results:")
evaluate_model(model_w2v, X_test_w2v, y_test)


--- Word2Vec ---

Word2Vec Results:
Accuracy: 0.9806763285024155
Confusion Matrix:
 [[736   6]
 [ 14 279]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       742
           1       0.98      0.95      0.97       293

    accuracy                           0.98      1035
   macro avg       0.98      0.97      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [None]:
'''   
Word2Vec capture la sémantique des mots, ce qui est utile pour des textes riches.
Avantage : Représentation vectorielle des mots basée sur leur contexte.
Inconvénient : Nécessite plus de données pour être efficace.
'''

#### Doc2Vec

In [None]:
'''    
Paramètre dm (Distributed Memory)
dm = 1 
    Chaque document est représenté par un vecteur unique.
    Le modèle utilise ce vecteur avec les vecteurs des mots voisins pour prédire un mot manquant dans une fenêtre donnée.
    Cela permet de capturer le contexte sémantique des documents et des mots 
    Exemple simplifié :

    Phrase : "Le chat dort sur le tapis"
    Pour prédire "tapis", le modèle utilise :
    Le vecteur du document
    Les mots voisins : "chat", "dort", "sur"


dm = 0
    Le modèle utilise uniquement le vecteur du document pour prédire des mots aléatoires présents dans le document.
    Contrairement à DM, il ignore complètement les vecteurs des mots voisins et le contexte local.

    Exemple simplifié :

    Phrase : "Le chat dort sur le tapis"
    Le modèle prend le vecteur du document et prédit des mots comme "chat", "tapis" sans se soucier de leur position.
'''

In [48]:
## D. Doc2Vec
print("\n--- Doc2Vec ---")
X_train_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_train)]
X_test_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_test)]

# Train Doc2Vec model 
d2v_model = Doc2Vec(vector_size=350, window=10, min_count=1, workers=4, epochs=150, dm=0)

d2v_model.build_vocab(X_train_tagged)
d2v_model.train(X_train_tagged, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

# Convert text to Doc2Vec vectors
X_train_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_train])
X_test_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_test])

# Model with LogisticRegression
model_d2v = LogisticRegression(max_iter=500000)
model_d2v.fit(X_train_d2v, y_train)
print("\nDoc2Vec Results:")
evaluate_model(model_d2v, X_test_d2v, y_test)


--- Doc2Vec ---

Doc2Vec Results:
Accuracy: 0.9594202898550724
Confusion Matrix:
 [[708  34]
 [  8 285]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       742
           1       0.89      0.97      0.93       293

    accuracy                           0.96      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [46]:
## D. Doc2Vec
print("\n--- Doc2Vec ---")
X_train_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_train)]
X_test_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_test)]

# Train Doc2Vec model 
d2v_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=100, dm=0)
d2v_model.build_vocab(X_train_tagged)
d2v_model.train(X_train_tagged, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

# Convert text to Doc2Vec vectors
X_train_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_train])
X_test_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_test])

# Model with LogisticRegression
model_d2v = LogisticRegression(max_iter=500000)
model_d2v.fit(X_train_d2v, y_train)
print("\nDoc2Vec Results:")
evaluate_model(model_d2v, X_test_d2v, y_test)


--- Doc2Vec ---

Doc2Vec Results:
Accuracy: 0.9671497584541063
Confusion Matrix:
 [[714  28]
 [  6 287]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98       742
           1       0.91      0.98      0.94       293

    accuracy                           0.97      1035
   macro avg       0.95      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035



In [47]:
## D. Doc2Vec
print("\n--- Doc2Vec ---")
X_train_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_train)]
X_test_tagged = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_test)]

# Train Doc2Vec model 
d2v_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=100, dm=1)
d2v_model.build_vocab(X_train_tagged)
d2v_model.train(X_train_tagged, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

# Convert text to Doc2Vec vectors
X_train_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_train])
X_test_d2v = np.array([d2v_model.infer_vector(text.split()) for text in X_test])

# Model with LogisticRegression
model_d2v = LogisticRegression(max_iter=500000)
model_d2v.fit(X_train_d2v, y_train)
print("\nDoc2Vec Results:")
evaluate_model(model_d2v, X_test_d2v, y_test)


--- Doc2Vec ---

Doc2Vec Results:
Accuracy: 0.8859903381642512
Confusion Matrix:
 [[654  88]
 [ 30 263]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.88      0.92       742
           1       0.75      0.90      0.82       293

    accuracy                           0.89      1035
   macro avg       0.85      0.89      0.87      1035
weighted avg       0.90      0.89      0.89      1035



In [None]:
'''    
Doc2Vec est moins performant que BOW et TF-IDF dans ce cas.

Le modèle DBOW (Distributed Bag of Words : dm = 0) est meilleur que le modèle DM (Distributed Memory : dm = 1), 
car il est plus adapté à des petits datasets.

'''

#### Amélioration

In [None]:
'''
1- Enrichissement des données : Ajouter plus de données pour entraîner Word2Vec et Doc2Vec.

2- Ajustement des hyperparamètres :
Pour Word2Vec et Doc2Vec :
    - Ajuster vector_size, window, epochs, et min_count.
    - Tester d'autres classifieurs ceux basé sur les réseaux de neurones.

3- Combinaison : Fusionner TF-IDF et Word2Vec, utiliser des modèles d'ensemble.

'''