In [133]:
import string
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [199]:
text = '''
i like dogs.
I like cats.
we like cats and dogs.
'''

In [200]:
# manual vectorization using bow
sentences = nltk.sent_tokenize(text)

trail = []
for i in range(len(sentences)):
    trail = [word.lower() for word in nltk.word_tokenize(sentences[i]) if word != '.' and word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(trail)
    trail = []

text = text.translate(str.maketrans("", "", string.punctuation))
all_words = re.split(r'\s+|\n', text)
all_words = [word.lower() for word in all_words if word and word.lower() not in set(stopwords.words('english'))]

bow = {word: all_words.count(word) for word in all_words}

sorted_set_representation = sorted(bow.items(), key=lambda item: item[1], reverse=True)
sorted_keys = [item[0] for item in sorted_set_representation]

matrix = np.zeros((len(sentences), len(sorted_keys)))
adjusted = []

for i in range(1,len(sentences)+1):
    adjusted = sorted(nltk.word_tokenize(sentences[i-1]), key=lambda word: bow.get(word, 0), reverse=True)
    matrix[i-1:i] = [1 if word in adjusted else 0 for word in sorted_keys]
    adjusted = []

matrix

array([[1., 1., 0.],
       [1., 0., 1.],
       [1., 1., 1.]])

Advantages:
- simple and intuitive;
- fixed size input, which is useful for ML algorithm training;
  
Disadvantage:
- sparse matrix is still there, which will lead to overfitting;
- ordering of the wordds is changing, thus the meaning of words is changing aswell;
- out of vocab words problem is still present;
- semantic information is not captured, since we are using 0 or 1;

## BOW using NLTK

In [284]:
import re
import os
import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

lem = WordNetLemmatizer()
# here we will define feature_size, which is number of words, in this case we are takingtop 250 words as the 'most relevant'
# also we are defning that we want to use the binary version of BOW
# another useful variable that is used in countvectorizer is n_gram, which specifies how far with n_grams we want to go
# this n_gram part is super important since now we start to look into the context of the sentence by adjusting the vector to have n_gram parts
# countvecorizer tuning is basically us trying to find new hyperparameters
vectorizer = CountVectorizer(max_features=500, binary=True, ngram_range=(2,3))
stop_words = set(stopwords.words('english'))

In [244]:
# reading the data set and configuring it to be correct
data_dir = 'datasets'
dataset_name = 'spam.csv'
full_path = os.path.join(data_dir,dataset_name)

data = pd.read_csv(full_path, sep = ',',
                    encoding='ISO-8859-1')

data = data.iloc[: , :2]
data = data.rename(columns={'v1': 'label', 'v2': 'message'})
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [247]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [248]:
corpus = []

for i in range(0, len(data)):
    # Remove non-alphabetic characters and lower the text
    review = re.sub('[^a-zA-Z]', ' ', data['message'][i])
    review = review.lower()
    review = review.split()

    # Perform POS tagging
    pos_tags = nltk.pos_tag(review)

    # Lemmatize words based on POS tags, exclude stopwords
    lemmatized_review = []
    for word, tag in pos_tags:
        if word not in stop_words:
            wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN  # Default to NOUN
            lemmatized_review.append(lem.lemmatize(word, pos=wordnet_pos))

    # Join processed words back into a sentence
    lemmatized_review = ' '.join(lemmatized_review)
    corpus.append(lemmatized_review)


In [249]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf live around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner value network customer select receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitle update late colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cry enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'sear

In [285]:
# lets create the bow model
X = vectorizer.fit_transform(corpus).toarray()

In [286]:
vectorizer.vocabulary_

{'free entry': np.int64(136),
 'rate apply': np.int64(335),
 'think go': np.int64(411),
 'per request': np.int64(297),
 'claim call': np.int64(67),
 'call claim': np.int64(29),
 'claim code': np.int64(68),
 'call claim code': np.int64(30),
 'update late': np.int64(438),
 'late colour': np.int64(213),
 'free call': np.int64(133),
 'call mobile': np.int64(43),
 'chance win': np.int64(65),
 'win cash': np.int64(483),
 'chance win cash': np.int64(66),
 'txt word': np.int64(430),
 'dont miss': np.int64(111),
 'let know': np.int64(217),
 'feel like': np.int64(126),
 'yeah get': np.int64(498),
 'reply yes': np.int64(343),
 'go home': np.int64(161),
 'anything lor': np.int64(5),
 'call reply': np.int64(49),
 'mobile free': np.int64(259),
 'free camcorder': np.int64(134),
 'please call': np.int64(305),
 'delivery tomorrow': np.int64(107),
 'lt gt': np.int64(234),
 'miss call': np.int64(256),
 'want go': np.int64(470),
 'first time': np.int64(132),
 'like lt': np.int64(220),
 'like lt gt': np.in

In [287]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])