<a href="https://colab.research.google.com/github/Yayahajdar/spam-ham/blob/main/spam2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd



# Download the NLTK resources used by the preprocessing step below.
# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases load
# the punkt_tab tables for word_tokenize and raise LookupError if only the
# legacy 'punkt' package is present.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Read the CSV with no header row, so the columns get integer labels
# (0, 1, 2, ...), then discard the column labelled 0 and the row labelled 0.
# NOTE(review): the displayed frame suggests row 0 held the file's own header
# line and column 0 a row counter - confirm against data.csv.
df = (
    pd.read_csv('data.csv', sep=',', header=None)
    .drop(columns=0)
    .drop(index=0)
)
df

Unnamed: 0,1,2
1,ham,Go until wrong point crazy Available only in b...
2,ham,of lar Joking if u on
3,spam,Free entry in 2 a wily come to win FA Cup fina...
4,ham,U dun say so early for U c already then say
5,ham,Nah I don't think he goes to us he lives aroun...
...,...,...
5568,spam,This is the and time we have tried 2 contact u...
5569,ham,Will i b going to esplanade for home
5570,ham,Pity was in mood for that sonny other suggestions
5571,ham,The guy did some bitching but I acted like id ...


In [None]:
# Give the two remaining integer-labelled columns readable names:
# column 1 -> 'sit' (the ham/spam label), column 2 -> 'correct' (the message text).
df = df.rename(columns={1: 'sit', 2: 'correct'})
df

Unnamed: 0,sit,correct
1,ham,Go until wrong point crazy Available only in b...
2,ham,of lar Joking if u on
3,spam,Free entry in 2 a wily come to win FA Cup fina...
4,ham,U dun say so early for U c already then say
5,ham,Nah I don't think he goes to us he lives aroun...
...,...,...
5568,spam,This is the and time we have tried 2 contact u...
5569,ham,Will i b going to esplanade for home
5570,ham,Pity was in mood for that sonny other suggestions
5571,ham,The guy did some bitching but I acted like id ...


In [None]:
# Partition the frame by label value.
df_ham = df.loc[df['sit'] == 'ham']
df_spam = df.loc[df['sit'] == 'spam']

# Message texts of each class as plain Python lists.
ham_list = df_ham['correct'].tolist()
spam_list = df_spam['correct'].tolist()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['correct'],
    df['sit'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shared resources for text preprocessing: a lemmatizer, a stemmer, and the
# English stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [None]:
# Normalise a raw message into a space-separated string of cleaned tokens.
def preprocess(text):
    """Tokenize ``text`` and return its normalised tokens joined by spaces.

    Tokens that are not purely alphabetic, or whose lowercase form is in
    ``stop_words``, are dropped. Each remaining token is lowercased, stemmed
    with ``ps`` and then passed through ``lemmatizer``.
    """
    cleaned = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if not token.isalpha() or lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(ps.stem(lowered)))
    return ' '.join(cleaned)

# Preprocess every message in both splits, coercing each value to str first.
X_train = list(map(preprocess, map(str, X_train)))
X_test = list(map(preprocess, map(str, X_test)))

In [None]:
# Bag-of-words features: fit the vectorizer on the training texts only, then
# apply that fitted vectorizer to the test texts.
count_vectorizer = CountVectorizer()
X_train_count_vectorized, X_test_count_vectorized = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

In [None]:
# TF-IDF features: fit the vectorizer on the training texts only, then apply
# that fitted vectorizer to the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf_vectorized, X_test_tfidf_vectorized = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
# Fit a Multinomial Naive Bayes model on the count-vectorized training data.
# The fit call stays as the cell's last expression so the estimator is displayed.
classifier_count = MultinomialNB()
classifier_count.fit(X=X_train_count_vectorized, y=y_train)

In [None]:

# Predict labels for the held-out test texts using the count-based model.
predictions_count = classifier_count.predict(X=X_test_count_vectorized)

In [None]:

# Fraction of test messages whose predicted label matches the true label
# (count-based model).
accuracy_count = accuracy_score(y_true=y_test, y_pred=predictions_count)
print("Accuracy (Count Vectorized):", accuracy_count)

Accuracy (Count Vectorized): 0.9811659192825112


In [None]:
# Fit a second Multinomial Naive Bayes model, this time on the TF-IDF features.
# fit() returns the estimator itself, so construction and fitting are chained.
classifier_tfidf = MultinomialNB().fit(X_train_tfidf_vectorized, y_train)

# Predict labels for the held-out test texts (TF-IDF features).
predictions_tfidf = classifier_tfidf.predict(X_test_tfidf_vectorized)

# Fraction of test messages whose predicted label matches the true label
# (TF-IDF model).
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=predictions_tfidf)
print("Accuracy (TF-IDF Vectorized):", accuracy_tfidf)

Accuracy (TF-IDF Vectorized): 0.9641255605381166


In [92]:

# Classify a single message typed in by the user.
new_email = input("Enter the email or SmS: ")

# Apply the same normalisation that was applied to the training texts.
preprocessed_email = preprocess(new_email)

# Vectorize with count_vectorizer, the vectorizer whose features
# classifier_count was fitted on. Using tfidf_vectorizer here would feed
# TF-IDF weights to a model trained on raw term counts.
vectorized_email = count_vectorizer.transform([preprocessed_email])

# predict() returns one label per input row; only one row is passed in.
prediction = classifier_count.predict(vectorized_email)

# Display the prediction
print("Predicted Class:", prediction[0])

Enter the email or SmS: winner
Predicted Class: spam
