In [67]:
import pandas as pd
import numpy as np
import tokenize as tk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
## bayes
from sklearn.naive_bayes import  MultinomialNB
from sklearn.model_selection import train_test_split
# Text Normalization
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


# Fetch the NLTK resources used below: tokenizer models, the stop-word
# list and WordNet (for the lemmatizer).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
stemmer = SnowballStemmer("english")

# Load the dataset, keep only the label and text columns, and discard
# rows with missing values (if any).
data = pd.read_csv('spam_ham_dataset.csv')[['label', 'text']].dropna()


# Text Normalization
def normalize_text(text):
    """Lower-case *text* and blank out everything except ``a-z`` and ``0-9``.

    Every character that is not a lowercase ASCII letter or a digit
    (punctuation, whitespace, newlines, non-ASCII letters, ...) is replaced
    by a single space, one-for-one, so runs of such characters become runs
    of spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9]', ' ', lowered)
# Tokenization
def tokenize_text(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for *text* (a list of tokens).
    """
    tokens = word_tokenize(text)
    return tokens

# Stopword Removal
# Lazily-filled cache of stop-word sets, keyed by language name. Building
# the set is deferred to the first call so that defining this function has
# no side effects and does not require the corpus to be available yet.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Return the tokens of *text* that are not English stop words.

    The stop-word set is built from the NLTK ``stopwords`` corpus on the
    first call and reused afterwards; this function is applied once per
    DataFrame row, so rebuilding the set on every call is wasted work.

    Args:
        text: An iterable of word tokens (despite the name, not a raw
            string; iterating a string would filter single characters).

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words
    return [word for word in text if word not in stop_words]



# Preprocessing: normalize -> tokenize -> drop stop words.
data['text'] = data['text'].apply(normalize_text)
data['text'] = data['text'].apply(tokenize_text)
data['text'] = data['text'].apply(remove_stopwords)

# Stem, then lemmatize each token. One lemmatizer instance is shared by
# all rows instead of constructing a new one for every single token.
lemmatizer = WordNetLemmatizer()
data['text'] = data['text'].apply(
    lambda tokens: [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]
)

# Remove the leading "subject" header token. It is dropped only when it is
# actually present, so a row without the header does not lose a real word.
data['text'] = data['text'].apply(
    lambda tokens: tokens[1:] if tokens and tokens[0] == 'subject' else tokens
)

# Join words back into a string (the vectorizer expects raw strings).
data['text'] = data['text'].apply(' '.join)
print(data.head())

X = data['text']
y = data['label']

# Split data: 70% train / 30% test, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Count vectorizer over unigrams and bigrams. The vocabulary is fitted on
# the training split only; the test split is transformed with it.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# TF-IDF weighting, likewise fitted on the training split only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Multinomial naive Bayes classifier with smoothing parameter alpha=0.01.
clf = MultinomialNB(alpha=0.01)
clf.fit(X_train_tfidf, y_train)

# Predict on the held-out split.
y_pred = clf.predict(X_test_tfidf)

# Evaluation: accuracy, confusion matrix and per-class report.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



def predict_spam(text):
    """Classify a single raw message with the fitted pipeline.

    Applies the same preprocessing as the training data (normalize,
    tokenize, drop stop words, stem, lemmatize, strip the leading
    "subject" header token), then vectorizes the result with the fitted
    ``count_vectorizer`` and ``tfidf_transformer`` and asks ``clf`` for a
    prediction.

    Args:
        text: The raw message as a string.

    Returns:
        The label predicted by ``clf`` for this message (one of the values
        of the training ``label`` column).
    """
    # One lemmatizer for the whole message rather than one per token.
    lemmatizer = WordNetLemmatizer()

    tokens = remove_stopwords(tokenize_text(normalize_text(text)))
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]

    # The training texts had their leading "subject" token removed; do the
    # same here when the message carries that header, so the features match
    # what the model was fitted on. Messages without it are left untouched.
    if tokens and tokens[0] == 'subject':
        tokens = tokens[1:]

    counts = count_vectorizer.transform([' '.join(tokens)])
    features = tfidf_transformer.transform(counts)
    return clf.predict(features)[0]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cihansariyildiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cihansariyildiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cihansariyildiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  label                                               text
0   ham  enron methanol meter 988291 follow note gave m...
1   ham  hpl nom januari 9 2001 see attach file hplnol ...
2   ham  neon retreat ho ho ho around wonder time year ...
3  spam  photoshop window offic cheap main trend aba da...
4   ham  indian spring deal book teco pvr revenu unders...
0.9845360824742269
[[1107   14]
 [  10  421]]
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1121
        spam       0.97      0.98      0.97       431

    accuracy                           0.98      1552
   macro avg       0.98      0.98      0.98      1552
weighted avg       0.98      0.98      0.98      1552

spam
