<a href="https://colab.research.google.com/github/eyash24/Data-Science/blob/main/NLP_101_Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam Classification
Following Krish Naik video:
https://www.youtube.com/live/g-Y5a4WDe7g?si=iFEDmKcVkesIoeK8

In [1]:
# downloading the dataset
! wget https://raw.githubusercontent.com/krishnaik06/NLP-Live/main/smsspamcollection/SMSSpamCollection

--2024-06-25 11:43:36--  https://raw.githubusercontent.com/krishnaik06/NLP-Live/main/smsspamcollection/SMSSpamCollection
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 477907 (467K) [text/plain]
Saving to: ‘SMSSpamCollection’


2024-06-25 11:43:37 (6.74 MB/s) - ‘SMSSpamCollection’ saved [477907/477907]



In [2]:
# importing the dataset
import pandas as pd
messages = pd.read_csv('/content/SMSSpamCollection', sep="\t", names=['label', 'message'])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
messages.shape

(5572, 2)

In [4]:
messages['message'].iloc[100]

"Please don't text me anymore. I have nothing else to say."

In [5]:
# Data Cleaning and Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [10]:
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z0-9]', " ", messages['message'][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = " ".join(review)
  corpus.append(review)

In [11]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [16]:
# creating Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [17]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
X.shape

(5572, 2500)

In [22]:
y = pd.get_dummies(messages['label'], dtype=int)
y = y.iloc[:, 1].values.astype(int)

In [23]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [24]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [25]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
y_train

array([0, 0, 0, ..., 1, 0, 0])

In [30]:
from sklearn.naive_bayes import MultinomialNB
spam_detection_model = MultinomialNB().fit(X_train, y_train)

In [31]:
# prediction
y_pred = spam_detection_model.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score, classification_report

In [34]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9721973094170404


In [35]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       986
           1       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [44]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [45]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [46]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [47]:
# prediction
y_pred = spam_detect_model.predict(X_test)

In [48]:
# accuracy
score = accuracy_score(y_test, y_pred)
score

0.9811659192825112

In [49]:
# classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.87      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [51]:
# Creating Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [52]:
# prediction
y_pred = classifier.predict(X_test)

In [53]:
# accuracy
accuracy_score(y_test, y_pred)

0.9829596412556054

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.88      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Word2Vec Implementation

In [55]:
! pip install gensim



In [63]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [58]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [60]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [61]:
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
  review = review.lower()
  review = review.split()

  review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = " ".join(review)
  corpus.append(review)

In [62]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [70]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [71]:
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

simple_preprocess -> converts sent into lowercase and many more

In [72]:
import gensim

In [74]:
# Training word2vec from scratch
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [75]:
# Printing vocabulary
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'make',
 'dear',
 'night',
 'message',
 'well',
 'say',
 'min',
 'thing',
 'much',
 'great',
 'claim',
 'hope',
 'oh',
 'hey',
 'give',
 'number',
 'happy',
 'friend',
 'work',
 'wat',
 'way',
 'yes',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'win',
 'amp',
 'life',
 'cash',
 'yeah',
 'im',
 'tone',
 'really',
 'babe',
 'meet',
 'find',
 'miss',
 'morning',
 'uk',
 'last',
 'service',
 'thanks',
 'care',
 'com',
 'would',
 'anything',
 'year',
 'also',
 'lol',
 'nokia',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'urgent',
 'contact',
 'sent',


In [76]:
# total vocabulary size
model.corpus_count

5565

In [77]:
# printing the epochs
model.epochs

5

In [78]:
model.wv.similar_by_word('prize')

[('claim', 0.999395489692688),
 ('call', 0.9992626905441284),
 ('line', 0.9991713762283325),
 ('cash', 0.999157190322876),
 ('number', 0.999113142490387),
 ('show', 0.9990891814231873),
 ('draw', 0.9990622401237488),
 ('guaranteed', 0.9990119338035583),
 ('free', 0.9990065693855286),
 ('contact', 0.9989762902259827)]

In [79]:
model.wv['kid'].shape

(100,)

In [80]:
import numpy as np

In [81]:
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [82]:
!pip install tqdm



In [83]:
from tqdm import tqdm

In [84]:
words[73]

['performed']

In [85]:
type(model.wv.index_to_key)

list

In [86]:
# applying for entire sentences
X = []
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5565/5565 [00:03<00:00, 1449.66it/s]


In [87]:
type(X)

list

In [88]:
X

[array([-0.10639972,  0.27915564,  0.04957172,  0.00338919,  0.03297142,
        -0.37835136,  0.06178258,  0.50916135, -0.14628471, -0.11608765,
        -0.08752733, -0.38886654, -0.09269155,  0.04913737,  0.04883617,
        -0.23033957, -0.02660399, -0.38144   , -0.02186322, -0.41899756,
         0.16189168,  0.10181273,  0.11138576, -0.04945627, -0.15303837,
         0.03406858, -0.20634006, -0.2873579 , -0.2448364 ,  0.08834312,
         0.32005545,  0.01591227,  0.06933794, -0.14076218, -0.10576739,
         0.2396593 , -0.03922768, -0.23830044, -0.16124058, -0.48908734,
         0.03219137, -0.30249333, -0.01660334,  0.02433989,  0.21138667,
        -0.14850478, -0.1968509 ,  0.03178984,  0.12454583,  0.1343838 ,
         0.10612267, -0.24428964, -0.0143962 , -0.06963321, -0.21272163,
         0.0998437 ,  0.10519406, -0.00332478, -0.26477358, -0.04812576,
         0.03455847,  0.11084343, -0.12604196,  0.01922428, -0.3365245 ,
         0.23027456,  0.11572085,  0.24361034, -0.3

In [92]:
X_new = np.asarray(X, dtype="object")

In [93]:
X_new[0]

array([-0.10639972,  0.27915564,  0.04957172,  0.00338919,  0.03297142,
       -0.37835136,  0.06178258,  0.50916135, -0.14628471, -0.11608765,
       -0.08752733, -0.38886654, -0.09269155,  0.04913737,  0.04883617,
       -0.23033957, -0.02660399, -0.38144   , -0.02186322, -0.41899756,
        0.16189168,  0.10181273,  0.11138576, -0.04945627, -0.15303837,
        0.03406858, -0.20634006, -0.2873579 , -0.2448364 ,  0.08834312,
        0.32005545,  0.01591227,  0.06933794, -0.14076218, -0.10576739,
        0.2396593 , -0.03922768, -0.23830044, -0.16124058, -0.48908734,
        0.03219137, -0.30249333, -0.01660334,  0.02433989,  0.21138667,
       -0.14850478, -0.1968509 ,  0.03178984,  0.12454583,  0.1343838 ,
        0.10612267, -0.24428964, -0.0143962 , -0.06963321, -0.21272163,
        0.0998437 ,  0.10519406, -0.00332478, -0.26477358, -0.04812576,
        0.03455847,  0.11084343, -0.12604196,  0.01922428, -0.3365245 ,
        0.23027456,  0.11572085,  0.24361034, -0.33558497,  0.30

In [96]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [95]:
X_new[0].shape

(100,)

Next step:
* train test split
* model creation and fitting
* accuracy and classification_report