In [None]:
import pandas as pd
# Read the raw e-mail dataset (path is relative to the working directory) and
# preview the first rows.
csv_path = './email.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the frame summary: column names, dtypes, non-null counts and memory use.
df.info()

In [None]:
# List the distinct values in the label column to spot anything unexpected.
df['Category'].unique()

In [None]:
# Count how many rows carry each label value (most frequent first).
df['Category'].value_counts()

In [None]:
# Keep only the rows whose label is one of the two expected classes; rows with
# any other value in 'Category' are discarded.
# .copy() makes the result an independent frame rather than a slice of the
# original, so the in-place operations in later cells (dropna, column
# assignment) modify it directly and cannot raise SettingWithCopyWarning.
diff_rows = df[df['Category'].isin(['ham', 'spam'])].copy()
df = diff_rows

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating a
# frame that was produced by filtering another one.
df = df.dropna()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Turn the text labels into integer codes. LabelEncoder numbers classes in
# sorted order, so with only 'ham' and 'spam' present: ham -> 0, spam -> 1.
encoder = LabelEncoder()
encoder.fit(df['Category'])
df['Category'] = encoder.transform(df['Category'])

# Rename both columns to the lowercase names used by the rest of the notebook.
column_names = {'Category': 'spam', 'Message': 'message'}
df = df.rename(columns=column_names)
df.head()

In [None]:
# Count rows that are exact repeats of an earlier row (all columns equal).
df.duplicated().sum()

In [None]:
# Remove repeated rows, keeping the first occurrence of each, then show the
# size of what remains.
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]
df.shape

**Analysis**

In [None]:
import matplotlib.pyplot as plt
# Class balance shown as absolute counts (bar) and as proportions (pie).
# value_counts() orders its result by frequency, not by label, so the counts
# are indexed explicitly by class code (0 = ham, 1 = spam). This guarantees
# each bar/wedge gets the right name whichever class is larger.
class_counts = df['spam'].value_counts().reindex([0, 1], fill_value=0)
class_names = ['Ham', 'Spam']

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 4))
ax_bar.bar(class_names, class_counts)
ax_bar.set_title('Messages per class')
ax_bar.set_ylabel('Number of messages')
ax_pie.pie(class_counts, labels=class_names, autopct='%1.1f%%')
ax_pie.set_title('Class share')
plt.show()

In [None]:
# Message length in characters. The vectorised .str accessor replaces a
# row-wise DataFrame.apply, which called a Python lambda once per row.
df["chars"] = df["message"].str.len()
df.head()

In [None]:
# Summary statistics of the numeric columns for ham messages (spam == 0).
df.loc[df['spam'] == 0].describe()

In [None]:
# Summary statistics of the numeric columns for spam messages (spam == 1).
df.loc[df['spam'] == 1].describe()

**Preprocessing**

In [None]:
import nltk
# Fetch the NLTK data packages needed for text preprocessing.
# NOTE(review): the visible cells only read the 'stopwords' corpus (tokens are
# produced with str.split); 'punkt' may be unused — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

def clean_message(text, stemmer, stop_words):
    """Normalise one raw message into a space-joined string of stemmed tokens.

    Steps, in order: replace every character outside ``[a-zA-Z0-9]`` with a
    space, lowercase, split on whitespace, drop tokens present in
    ``stop_words``, and stem the remaining tokens with ``stemmer``.

    Args:
        text: Raw message string.
        stemmer: Object exposing ``stem(word) -> str``.
        stop_words: Container of lowercase words to discard.

    Returns:
        The cleaned message as a single string.
    """
    tokens = re.sub('[^a-zA-Z0-9]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


stemmer = PorterStemmer()
# Build the stop-word set once. The original rebuilt it from the NLTK corpus
# for every single token, which dominated the runtime of this cell.
english_stop_words = set(stopwords.words('english'))

# Renumber rows 0..n-1 after the earlier filtering and de-duplication.
df = df.reset_index(drop=True)

messages = df['message']
spam = df['spam']

# One cleaned string per message, in row order. Any text later passed to a
# vectorizer fitted on this corpus needs the same cleaning.
corpus = [clean_message(message, stemmer, english_stop_words) for message in messages]
df["new-msg"] = corpus
df.head()

**Model**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words features: one row per message, one column per vocabulary term.
# TfidfVectorizer (imported above) is a drop-in alternative weighting.
cv = CountVectorizer()
# Keep the SciPy sparse matrix returned by fit_transform instead of calling
# .toarray(): the matrix is overwhelmingly zeros, and train_test_split, the
# Naive Bayes models and RandomForestClassifier all accept sparse input.
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split in a later cell; fit on the training rows only if a strict
# hold-out evaluation is required.
x = cv.fit_transform(corpus)
x.shape

In [None]:
# Target vector: the integer class codes as a NumPy array.
y = df['spam'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

# Candidate classifiers, all with default hyper-parameters.
bb = BernoulliNB()
nb = MultinomialNB()
# Random forests are stochastic; seed it (same seed as the train/test split)
# so the reported scores are reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=2)

In [None]:
# Fit Bernoulli Naive Bayes and score it on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix is transposed
# and precision_score actually reports recall.
bb.fit(x_train, y_train)
y_pred1 = bb.predict(x_test)
print(accuracy_score(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))  # rows = true class, columns = predicted
print(precision_score(y_test, y_pred1))

In [None]:
# Fit Multinomial Naive Bayes and score it on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix is transposed
# and precision_score actually reports recall.
nb.fit(x_train, y_train)
y_pred2 = nb.predict(x_test)
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))  # rows = true class, columns = predicted
print(precision_score(y_test, y_pred2))

In [None]:
# Fit the random forest and score it on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix is transposed
# and precision_score actually reports recall.
rf.fit(x_train, y_train)
y_pred3 = rf.predict(x_test)
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))  # rows = true class, columns = predicted
print(precision_score(y_test, y_pred3))

**Save model**

In [None]:
import pickle
# Persist the fitted MultinomialNB classifier and the fitted CountVectorizer
# as pickle files in the parent of the working directory.
for target_path, fitted_obj in (('../model.pkl', nb), ('../vectorizer.pkl', cv)):
    with open(target_path, 'wb') as f:
        pickle.dump(fitted_obj, f)