In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [5]:
df = pd.read_csv(
    "../DataSets/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"]
)
# Encoding Labels
df["label"] = df["label"].map({"ham": 0, "spam": 1})



In [50]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)     # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)           # remove punctuation & numbers
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) 
              for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["clean_message"] = df["message"].apply(clean_text)



In [None]:
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [53]:
print(type(corpus))
print(len(corpus))
print(corpus)
print("================================")
print(corpus[0])
print(df['clean_message'][0])
print("================================")
print(corpus[1])



<class 'list'>
5572
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
go jurong point crazy available bugis n great world la e buffet cine got amore wat
ok lar joking wif u oni


In [56]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
sent=corpus[0]
print(sent)
sent_token=sent_tokenize(sent)
print(sent_token)

print("==================================")

text ="Hello world. This is NLP. Sentence tokenization is useful!"
sentences = sent_tokenize(text)
print(text)
print(sentences)


go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat']
Hello world. This is NLP. Sentence tokenization is useful!
['Hello world.', 'This is NLP.', 'Sentence tokenization is useful!']


In [68]:
#print(simple_preprocess(text))
for i in range(10,12):
    words=simple_preprocess(corpus[i])
    print(corpus[i])
    print(words)
    print("===========================================")

i m gonna be home soon and i don t want to talk about this stuff anymore tonight k i ve cried enough today
['gonna', 'be', 'home', 'soon', 'and', 'don', 'want', 'to', 'talk', 'about', 'this', 'stuff', 'anymore', 'tonight', 've', 'cried', 'enough', 'today']
six chance to win cash from to pound txt csh and send to cost p day day tsandcs apply reply hl info
['six', 'chance', 'to', 'win', 'cash', 'from', 'to', 'pound', 'txt', 'csh', 'and', 'send', 'to', 'cost', 'day', 'day', 'tsandcs', 'apply', 'reply', 'hl', 'info']


In [72]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [73]:
print(len(words))
print("=====================")
print(words)

5569


In [74]:
import gensim
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words)

In [75]:
## To Get All the Vocabulary
model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'it',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'if',
 'will',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'day',
 'ok',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'she',
 'home',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'they',
 'later',
 'please',
 'any',
 'pls',
 'her',
 'ha',
 'did',
 'co',
 'msg',
 'been',
 'min',
 'an',
 'some',
 'dear',
 'night',
 'make',
 'who',
 'here',
 'message',
 'well',
 'say',
 'where',
 're',
 'thing',
 'much',
 'hope

In [71]:
model.corpus_count

21

In [76]:
model.wv.similar_by_word('only')

[('rate', 0.9992354512214661),
 ('live', 0.9989326596260071),
 ('hr', 0.9989250302314758),
 ('for', 0.9988971948623657),
 ('orange', 0.998816728591919),
 ('per', 0.9987828731536865),
 ('cost', 0.9987578392028809),
 ('com', 0.9987450838088989),
 ('on', 0.9987426996231079),
 ('pobox', 0.9986982941627502)]