In [None]:
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
# Load the SMS spam dataset.
# The file is comma-separated with a header row (v1, v2 and empty trailing
# columns). Parsing it as tab-separated puts every whole line into `label` and
# leaves `message` as NaN, so the default comma separator is used and only the
# two meaningful columns are kept and renamed.
# latin1 maps every possible byte to a character, so decoding cannot raise
# UnicodeDecodeError and no chain of fallback encodings is needed.
DATA_PATH = '/content/spam (6).csv'
dataset = (
    pd.read_csv(DATA_PATH, encoding='latin1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
print(dataset.head())

                                               label  message
0                                           v1,v2,,,      NaN
1  ham,"Go until jurong point, crazy.. Available ...      NaN
2               ham,Ok lar... Joking wif u oni...,,,      NaN
3  spam,Free entry in 2 a wkly comp to win FA Cup...      NaN
4  ham,U dun say so early hor... U c already then...      NaN


In [None]:
# Check for nulls: info() prints the non-null count and dtype of each column
dataset.info()


In [None]:
# Second method: count the missing values in each column
dataset.isnull().sum()

In [None]:
# Summary statistics of the freshly loaded dataset
dataset.describe()

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
dataset['label'] = dataset['label'].map(label_codes)
dataset

In [None]:
# Summary statistics again, after the label column has been mapped to 0/1
dataset.describe()

In [None]:
#visualize the data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Class distribution before any resampling
plt.figure(figsize=(8, 8))
ax = sns.countplot(data=dataset, x="label")
ax.set_title('Countplot for Spam vs Ham as imbalanced dataset')
ax.set_xlabel("Is the SMS Spam?")
p = ax.set_ylabel('Count')

In [None]:
# Handling the imbalanced dataset using oversampling: isolate the spam rows
is_spam = dataset['label'] == 1
only_spam = dataset.loc[is_spam]
only_spam

In [None]:
# Report how many rows belong to each class
spam_total = len(only_spam)
ham_total = len(dataset) - spam_total
print("No of spam sms: ", spam_total)
print("No of ham sms: ", ham_total)


In [None]:
# How many whole times the spam rows fit into the ham rows
n_spam = only_spam.shape[0]
n_ham = dataset.shape[0] - n_spam
count = n_ham // n_spam
count

In [None]:
# Oversample spam: append (count - 1) extra copies of the spam rows with a
# single concat, rather than re-concatenating the growing frame on every loop
# pass (which copies the whole frame each time).
# ignore_index=True rebuilds a unique 0..n-1 index, so the appended copies do
# not carry duplicate index labels.
# NOTE(review): this duplication happens before the train/test split further
# down, so identical spam messages can land on both sides of that split.
extra_copies = max(count - 1, 0)
dataset = pd.concat([dataset] + [only_spam] * extra_copies, ignore_index=True)
dataset.shape

In [None]:
# Class distribution after oversampling the spam rows
plt.figure(figsize=(8, 8))
ax = sns.countplot(data=dataset, x="label")
ax.set_title('Countplot for Spam vs Ham as balanced dataset')
ax.set_xlabel("Is the SMS Spam?")
p = ax.set_ylabel('Count')

In [None]:
# New feature word_count: number of whitespace-separated tokens per message
dataset['word_count'] = dataset['message'].map(lambda text: len(text.split()))
dataset

In [None]:
# Side-by-side word_count distributions: ham on the left, spam on the right
ham_word_counts = dataset.loc[dataset["label"] == 0, "word_count"]
spam_word_counts = dataset.loc[dataset["label"] == 1, "word_count"]

plt.figure(figsize=(12, 6))

# Left panel (1,1)
plt.subplot(1, 2, 1)
g = sns.histplot(ham_word_counts, kde=True)
p = plt.title("Distribution of word_count for Ham SMS")

# Right panel (1,2)
plt.subplot(1, 2, 2)
g = sns.histplot(spam_word_counts, color="red", kde=True)
p = plt.title("Distribution of word count for SPAM SMS")

plt.tight_layout()
plt.show()

In [None]:
#Creating new feature of containing currency symbols
def currency(data):
  """Flag whether a message contains a currency symbol.

  Args:
    data: The message text. Non-string values (for example NaN for a
      missing message) are treated as containing no symbol.

  Returns:
    1 if any of the symbols €, $, ¥, £ or ₹ occurs in ``data``, otherwise 0.
  """
  # '£' is listed without a trailing space so that text such as "£100" matches.
  currency_symbols = ('€', '$', '¥', '£', '₹')
  if not isinstance(data, str):
    return 0
  # Test the message itself for each symbol and always return an explicit 0/1.
  return int(any(symbol in data for symbol in currency_symbols))


In [None]:
# Flag every message with the result of currency()
currency_flags = dataset["message"].map(currency)
dataset["contains_currency_symbols"] = currency_flags
dataset

In [None]:
# Normalise the flag to integer 0/1, treating a missing flag as 0 (no symbol).
# `.notna()` must not be used for this: it is True for both 0 and 1, so it
# would replace the feature with a constant.
dataset['contains_currency_symbols'] = (
    pd.to_numeric(dataset['contains_currency_symbols'], errors='coerce')
    .fillna(0)
    .astype(int)
)


In [None]:
# Currency-symbol flag, split by class
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x='contains_currency_symbols', hue='label')
g.set_title('Countplot for Containing Currency Symbol')
g.set_xlabel('Does SMS contain any currency symbol?')
g.set_ylabel('Count')
plt.legend(loc='upper center', labels=["Ham", "Spam"])
plt.show()


In [None]:
#Creating new feature of containing numbers
def number(data):
  """Return 1 if ``data`` contains at least one ASCII digit (0-9), else 0."""
  # Single characters compare by code point, so '0'..'9' is the range 48..57.
  has_digit = any('0' <= ch <= '9' for ch in data)
  return 1 if has_digit else 0


In [None]:
# Flag every message with the result of number()
number_flags = dataset['message'].map(number)
dataset["contains_number"] = number_flags
dataset

In [None]:
# Countplot for containing numbers, split by class
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x='contains_number', hue="label")
g.set_title('Countplot for Containing Numbers')
g.set_xlabel('Does SMS contains any number?')
g.set_ylabel('count')
p = plt.legend(labels=["Ham", "Spam"], loc='upper center')

In [None]:
#Data Cleaning
import nltk
import re
# Download the NLTK resources: the stopword lists and the WordNet data
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:

# Clean every message: keep letters only, lowercase, drop English stopwords,
# lemmatize the remaining words and join them back into one string.
corpus = []
wnl = WordNetLemmatizer()
# Build the stopword set once; constructing it inside the comprehension
# re-read the word list for every single word of every message.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')
for sms in list(dataset.message):
  message = non_letters.sub(' ', sms)  # Filtering out special characters and numbers
  words = message.lower().split()  # Tokenizer
  filtered_words = [word for word in words if word not in stop_words]
  lemm_words = [wnl.lemmatize(word) for word in filtered_words]
  corpus.append(' '.join(lemm_words))

In [None]:
# Display the cleaned messages
corpus

In [None]:
# Creating the TF-IDF representation of the corpus (max_features=500)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(corpus)
vectors = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names_out()


In [None]:
# Target vector and feature matrix (one column per TF-IDF term)
y = dataset['label']
X = pd.DataFrame(data=vectors, columns=feature_names)


In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the spam rows were duplicated by the oversampling step before
# this split, so copies of one message can appear in both train and test and
# the reported scores may be optimistic. Consider splitting first.
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Display the training feature matrix
X_train

In [None]:
# Display the test feature matrix
X_test

In [None]:
# Naive Bayes Model: 10-fold cross-validated F1 score (mean, then std)
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
cv = cross_val_score(estimator=mnb, X=X, y=y, scoring='f1', cv=10)
f1_mean, f1_std = cv.mean(), cv.std()
print(round(f1_mean, 3))
print(round(f1_std, 3))


In [None]:

# Fit on the training split, then predict the held-out split
fitted_mnb = mnb.fit(X_train, y_train)
y_pred = fitted_mnb.predict(X_test)


In [None]:
# Per-class precision, recall and F1 on the held-out split
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the held-out predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm


In [None]:

# Heatmap of the confusion matrix.
# sklearn's confusion_matrix puts the true classes on the rows and the
# predicted classes on the columns, so on the heatmap the y-axis is the actual
# label and the x-axis is the predicted label.
plt.figure(figsize=(8,8))
axis_labels=['ham', 'spam']
g=sns.heatmap(data=cm, xticklabels=axis_labels, yticklabels=axis_labels, annot=True, fmt='g',cbar_kws={'shrink':0.5},cmap="Blues")

p=plt.title("Confusion Matrix of Multinomial Naive Bayes Model")
p=plt.xlabel('Predicted values')
p=plt.ylabel("Actual values")