In [1]:
document_corpus = ["these are beautiful flowers flowers",
                   "those are ugly cars cars",
                    "it is a fast car car",
                    "she has a cute dog dog",
                    "this pizza is delicious delicious"]

In [2]:
data_corpus = set()
for row in document_corpus:
    for word in row.split(" "):
        if word not in data_corpus:
            data_corpus.add(word)

data_corpus=sorted(data_corpus)

print(data_corpus)

['a', 'are', 'beautiful', 'car', 'cars', 'cute', 'delicious', 'dog', 'fast', 'flowers', 'has', 'is', 'it', 'pizza', 'she', 'these', 'this', 'those', 'ugly']


## Index Based Encoding :

In [3]:
res = len(max(document_corpus, key = len).split(" "))
print(res)

5


In [4]:
index_based_encoding=[]
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split)-1:
            row_encoding.append(data_corpus.index(split[i])+1)
        else:
            row_encoding.append(0)
    index_based_encoding.append(row_encoding)

print(index_based_encoding)

[[16, 2, 3, 10, 10], [18, 2, 19, 5, 5], [13, 12, 1, 9, 4], [15, 11, 1, 6, 8], [17, 14, 12, 7, 7]]


# Bag of Words
### 1. Binary BoW

In [6]:
one_hot_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        if word in split:
            row_encoding.append(1)
        else:
            row_encoding.append(0)
    one_hot_encoding.append(row_encoding)

print(one_hot_encoding)

[[0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0]]


### 2. BoW

In [7]:
one_hot_encoding = []
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for word in data_corpus:
        count = split.count(word)
        if word in split:
            row_encoding.append(count)
        else:
            row_encoding.append(count)
    one_hot_encoding.append(row_encoding)

print(one_hot_encoding)

[[0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0]]


## TF-IDF Encoding :

In [8]:
tf_dict = {}
i=0
for row in document_corpus:
    row_dict={}
    split = row.split(" ")
    for word in split:
        if word not in row_dict.keys():
            row_dict[word] = split.count(word)
    tf_dict[i] = row_dict
    i+=1

print(tf_dict)

{0: {'these': 1, 'are': 1, 'beautiful': 1, 'flowers': 2}, 1: {'those': 1, 'are': 1, 'ugly': 1, 'cars': 2}, 2: {'it': 1, 'is': 1, 'a': 1, 'fast': 1, 'car': 2}, 3: {'she': 1, 'has': 1, 'a': 1, 'cute': 1, 'dog': 2}, 4: {'this': 1, 'pizza': 1, 'is': 1, 'delicious': 2}}


In [9]:
import math
def calculate_tf(word, sentence_num):
    row_dict = tf_dict[int(sentence_num)]
    return row_dict[word]/sum(row_dict.values())

def calculate_idf(word):
    doc_num = 0
    for key, value in tf_dict.items():
        if word in value.keys():
            doc_num+=1
    return math.log(len(data_corpus)/doc_num+1)

def tf_idf(word, sentence_num):
    return round(calculate_tf(word, sentence_num) * calculate_idf(word),5)

In [12]:
tf_idf('flowers',0)

1.19829

## Scikit-Learn Implementation:

### BoW Encoding

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document_corpus)
print(vectorizer.get_feature_names_out())


['are' 'beautiful' 'car' 'cars' 'cute' 'delicious' 'dog' 'fast' 'flowers'
 'has' 'is' 'it' 'pizza' 'she' 'these' 'this' 'those' 'ugly']
