In [52]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [4]:
email_dataset = pd.read_csv('mail_data.csv')
print(email_dataset.shape)

(5572, 2)


In [20]:
# check if it have null row
print(email_dataset.isnull().sum())

Category    0
Message     0
dtype: int64


In [22]:
# check duplicate row
print(email_dataset.duplicated().sum())

0


In [30]:
# if it have duplicate row remove
email_dataset.drop_duplicates(inplace=True)

In [32]:
print(email_dataset.duplicated().sum())

0


In [34]:
print(email_dataset.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [46]:
print(email_dataset.Category.value_counts())

Category
0    4516
1     641
Name: count, dtype: int64


In [48]:
# change Category string row into num row
label_encoder = LabelEncoder()
email_dataset['Category'] = label_encoder.fit_transform(email_dataset['Category'])

In [50]:
# split the dataset into featue and label
x = email_dataset['Message']
y = email_dataset['Category']

In [54]:
# vectorizer the feature col
vectorizer = TfidfVectorizer()
vectorizer.fit(x)

In [58]:
x = vectorizer.transform(x)
y = email_dataset['Category']

In [60]:
print(x)
print(y)

  (0, 1079)	0.3254824637577375
  (0, 1313)	0.24682163289537062
  (0, 1761)	0.3105790823420053
  (0, 1763)	0.27452746613871426
  (0, 2057)	0.27452746613871426
  (0, 2334)	0.25142216206874096
  (0, 3567)	0.14728383239686252
  (0, 3611)	0.15221254465391035
  (0, 3651)	0.18169112440169724
  (0, 4110)	0.10777814259403067
  (0, 4370)	0.3254824637577375
  (0, 4497)	0.27452746613871426
  (0, 5567)	0.1580989753178226
  (0, 5954)	0.25395808207313836
  (0, 7690)	0.1558478886324521
  (0, 8080)	0.22848058326362672
  (0, 8320)	0.18206053717134293
  (0, 8544)	0.22981732189151768
  (1, 4338)	0.5234057786973465
  (1, 4533)	0.40693812451964195
  (1, 5534)	0.27641681599588036
  (1, 5563)	0.5465710490257072
  (1, 8446)	0.43046670700566175
  (2, 77)	0.2326951819115437
  (2, 401)	0.22204039882555796
  :	:
  (5155, 1790)	0.13632832358826047
  (5155, 1798)	0.2821514894405624
  (5155, 2602)	0.18436456311060026
  (5155, 2901)	0.24293814439354142
  (5155, 3319)	0.12206026813524155
  (5155, 3369)	0.16312435385006

In [62]:
# separate the feature into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,  stratify=y, random_state=2)

In [66]:
# train a model
model = MultinomialNB()
model.fit(x_train, y_train)

In [70]:
# train predicition
train_predicition = model.predict(x_train)
train_accuracy = accuracy_score(y_train, train_predicition)
print(f"the train predicition is {train_accuracy}")

the train predicition is 0.9655757575757575


In [72]:
# test predicition
test_predicition = model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_predicition)
print(f"the train predicition is {test_accuracy}")

the train predicition is 0.9505813953488372
