In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# load dataset with appropriate encoding format

In [4]:
df = pd.read_csv('spam.csv',encoding = 'latin1')

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
df.describe

<bound method NDFrame.describe of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        N

In [8]:
df['v1'].value_counts

<bound method IndexOpsMixin.value_counts of 0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object>

###### The given dataset contains 5 columns in which the last 3 columns are unwanted so drop those columns

In [9]:
df = df.drop(columns = df.columns[2:5])
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.columns = ['Category','Message']
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [12]:
ham = df[df['Category']=='ham']
len(ham)

4825

In [13]:
spam = df[df['Category']=='spam']
len(spam)

747

In [14]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


### preprocess the input data

In [16]:
df.drop_duplicates(inplace = True)

In [17]:
len(df)

5169

In [20]:
df['Label'] = df['Category'].map({'ham':'ham','spam':'spam'})

In [22]:
df.drop(columns = ['label'])

Unnamed: 0,Category,Message,spam,Label
0,ham,"Go until jurong point, crazy.. Available only ...",0,ham
1,ham,Ok lar... Joking wif u oni...,0,ham
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,spam
3,ham,U dun say so early hor... U c already then say...,0,ham
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,ham
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,spam
5568,ham,Will Ì_ b going to esplanade fr home?,0,ham
5569,ham,"Pity, * was in mood for that. So...any other s...",0,ham
5570,ham,The guy did some bitching but I acted like i'd...,0,ham


In [23]:
X = df['Message']
Y = df['Label']


In [25]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5169, dtype: object

In [26]:
Y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Label, Length: 5169, dtype: object

## Create Tf-idf Vectorizer

In [34]:
tfidf_vectorizer = TfidfVectorizer()

## Fit and transform the vectorizer to training data

In [35]:
xtrain_tfidf = tfidf_vectorizer.fit_transform(x_train)

### split the data

In [27]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42)

### initialize a naive bayes classifier

In [30]:
nb_classifier = MultinomialNB()

### train the classifier

In [36]:
nb_classifier.fit(xtrain_tfidf,y_train)

### transform the test data using the same vectorizer

In [38]:
xtest_tfidf =  tfidf_vectorizer.fit_transform(x_test)

# Make predictions

In [39]:
y_pred = nb_classifier.predict(xtest_tfidf)

ValueError: X has 3362 features, but MultinomialNB is expecting 7657 features as input.