In [None]:
import pandas as pd

# Location of the dataset CSV, given as a path relative to the working directory.
file_path = 'spam.csv'
# Load the CSV into a DataFrame, decoding the file as ISO-8859-1.
df = pd.read_csv(file_path, encoding='ISO-8859-1')


In [None]:
# Keep only the two columns used by the rest of the notebook and give them
# descriptive names. Renaming *before* selecting makes this cell safe to
# re-run: `rename` ignores keys that are no longer present, so a second
# execution (when `df` already has 'label' and 'text') does not raise a
# KeyError on 'v1'/'v2'. Re-running also drops any columns derived later.
# The explicit copy makes `df` an independent frame for later column additions.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']].copy()
df.head()

In [None]:
# Print a structural summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Display summary statistics for the DataFrame's columns.
df.describe()

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer models used by `word_tokenize`. Recent NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so both are
# requested to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split every message into a list of tokens.
df['tokens'] = df['text'].apply(word_tokenize)
df.head()

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the 'wordnet' and 'omw-1.4' NLTK resources.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Single lemmatizer instance, reused by `lemmatize_sentence` for every token.
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of an already-tokenized sentence
def lemmatize_sentence(sentence):
    """Return the lemma of each token in ``sentence``, in the same order.

    ``sentence`` is an iterable of token strings. Each token is passed to the
    module-level ``lemmatizer`` without a part-of-speech argument, so the
    lemmatizer's default is used.
    """
    lemmas = []
    for token in sentence:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the token list of every row and store the result in a new column
df['lemmas'] = df['tokens'].map(lemmatize_sentence)
df.head()

In [None]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance, reused by `stem_sentence` for every token.
stemmer = PorterStemmer()

# Stem every token of an already-tokenized sentence
def stem_sentence(sentence):
    """Return the stem of each token in ``sentence``, in the same order.

    ``sentence`` is an iterable of token strings; each one is passed to the
    module-level ``stemmer``.
    """
    return list(map(stemmer.stem, sentence))

# Stem the token list of every row and store the result in a new column
df['stems'] = df['tokens'].map(stem_sentence)
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance: number of messages per label
sns.countplot(data=df, x='label')
plt.title('Distribution des labels (Spam vs Ham)')
plt.show()

# Message length in characters, compared between the two classes
df['text_length'] = df['text'].map(len)
sns.histplot(df.loc[df['label'] == 'ham', 'text_length'], kde=True, color='blue', label='Ham')
sns.histplot(df.loc[df['label'] == 'spam', 'text_length'], kde=True, color='red', label='Spam')
plt.legend()
plt.title('Distribution de la longueur des messages')
plt.show()


In [None]:
import re
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource.
nltk.download('stopwords')

# English stop words held in a set; `preprocess_text` tests membership against it.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case ``text``, extract its word tokens and drop stop words.

    Tokens are the runs of word characters found by the regular expression
    below. Any token present in the module-level ``stop_words`` set is
    discarded. The surviving tokens are returned as a list, in order.
    """
    kept = []
    for candidate in re.findall(r'\b\w+\b', text.lower()):
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Cleaned (lower-cased, stop-word-free) token list for every message
df['cleaned_tokens'] = df['text'].map(preprocess_text)


In [None]:
from collections import Counter

# Word frequencies over the whole corpus
all_words = [token for row in df['cleaned_tokens'] for token in row]
word_freq = Counter()
word_freq.update(all_words)
common_words = word_freq.most_common(20)

# Bar chart of the most frequent words
words, counts = zip(*common_words)
plt.bar(x=words, height=counts)
plt.title('20 mots les plus fréquents')
plt.xticks(rotation=90)
plt.show()

# Bigram frequencies
def get_ngrams(tokens, n=2):
    """Return the n-grams of ``tokens`` as space-joined strings.

    ``tokens`` is a sliceable sequence of strings and ``n`` is the n-gram
    size (2 by default). The n-grams are returned as a list in order of
    appearance. The list is empty when ``tokens`` has fewer than ``n`` items.
    """
    # One view of the sequence per offset; zipping them walks a sliding window.
    shifted = [tokens[offset:] for offset in range(n)]
    return list(map(' '.join, zip(*shifted)))

all_bigrams = [pair for row in df['cleaned_tokens'] for pair in get_ngrams(row, n=2)]
bigram_freq = Counter()
bigram_freq.update(all_bigrams)
common_bigrams = bigram_freq.most_common(20)

# Bar chart of the most frequent bigrams
bigrams, counts = zip(*common_bigrams)
plt.bar(x=bigrams, height=counts)
plt.title('20 bigrams les plus fréquents')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Lower-cased version of every message
df['clean_text'] = df['text'].map(str.lower)

# Whitespace-separated words of each 'ham' message
ham_messages = df.loc[df['label'] == 'ham', 'clean_text'].map(str.split)

# Tally word occurrences across all 'ham' messages
ham_words = Counter()
for msg in ham_messages:
    ham_words.update(msg)

print("Mots les plus fréquents dans les messages 'ham':")
print(ham_words.most_common(50))

In [None]:
# Whitespace-separated words of each 'spam' message
spam_messages = df.loc[df['label'] == 'spam', 'clean_text'].map(str.split)

# Tally word occurrences across all 'spam' messages
spam_words = Counter()
for msg in spam_messages:
    spam_words.update(msg)

print("\nMots les plus fréquents dans les messages 'spam':")
print(spam_words.most_common(50))

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt

# Download the VADER lexicon
nltk.download('vader_lexicon')


In [None]:
# Single analyzer instance, reused by `get_sentiment_scores` for every message.
sia = SentimentIntensityAnalyzer()

# Compute the sentiment scores of a text
def get_sentiment_scores(text):
    """Return the polarity scores of ``text`` from the module-level ``sia``.

    The result is whatever ``sia.polarity_scores(text)`` returns. The
    notebook later reads its 'neg', 'neu', 'pos' and 'compound' entries.
    """
    scores = sia.polarity_scores(text)
    return scores

# Score every message; each cell of 'sentiment' holds that message's scores
df['sentiment'] = df['text'].map(get_sentiment_scores)

# Spread the individual scores into their own columns
df['neg'] = df['sentiment'].map(lambda scores: scores['neg'])
df['neu'] = df['sentiment'].map(lambda scores: scores['neu'])
df['pos'] = df['sentiment'].map(lambda scores: scores['pos'])
df['compound'] = df['sentiment'].map(lambda scores: scores['compound'])

df.head()


In [None]:
# Overlay the compound-score distributions of the two classes
sns.histplot(df.loc[df['label'] == 'ham', 'compound'], kde=True, color='blue', label='Ham')
sns.histplot(df.loc[df['label'] == 'spam', 'compound'], kde=True, color='red', label='Spam')
plt.legend()
plt.title('Distribution des scores de sentiment (Compound)')
plt.show()


In [None]:
# Messages with the highest compound sentiment scores
print("Messages les plus positifs:")
print(df[['text', 'compound']].nlargest(5, 'compound'))

# Messages with the lowest compound sentiment scores
print("Messages les plus négatifs:")
print(df[['text', 'compound']].nsmallest(5, 'compound'))


In [None]:
from PIL import Image
from wordcloud import  ImageColorGenerator
import numpy as np
import os
import matplotlib.image as mpimg
from wordcloud import WordCloud

# Request the 'wordnet', 'punkt' and 'stopwords' NLTK resources
# (the same resources are also requested in earlier cells).
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Flatten the per-message token lists into one space-separated string per class
ham_text = ' '.join(map(' '.join, ham_messages))
spam_text = ' '.join(map(' '.join, spam_messages))

# Render a word cloud using a mask image
def plot_word_cloud(text, title, mask_path):
    """Generate a word cloud from ``text`` and display it with matplotlib.

    Parameters:
        text: string the word cloud is generated from.
        title: title shown above the figure.
        mask_path: path of an image file; it is opened, converted to a NumPy
            array and passed to ``WordCloud`` as its ``mask``.

    The module-level ``stop_words`` set is passed as the cloud's stop words.
    Nothing is returned; the figure is shown via ``plt.show()``.
    """
    cloud_options = dict(
        width=600,
        height=600,
        background_color='white',
        max_words=200,
        stopwords=stop_words,
        mask=np.array(Image.open(mask_path)),
        max_font_size=90,
        collocations=False,
        random_state=42,  # fixed seed passed to WordCloud
    )
    wc = WordCloud(**cloud_options)
    wc.generate(text)

    plt.figure(figsize=(10, 7))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=10, pad=10)
    plt.show()

# Display one word cloud per class; both calls use the same mask image, 'coeur.png'
plot_word_cloud(ham_text, 'Nuage de mots pour les messages ham', 'coeur.png')
plot_word_cloud(spam_text, 'Nuage de mots pour les messages spam', 'coeur.png')