### One-Hot Encoding:

In [10]:
# Vocabulary to encode; the order of the words fixes the column order below.
vocabulary = ["I", "work", "at", "Fusemachines"]

# One row per vocabulary word: a column holds 1 when its word equals the
# row's word and 0 otherwise.
encoded_vocab = [
    [int(candidate == target) for candidate in vocabulary]
    for target in vocabulary
]

print(encoded_vocab)


[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]


### Index-Based Encoding:


In [11]:
# Define the corpus: one sentence per entry.
corpus = [
  'I like working at Fusemachines',
  'Fusemachines has a skilled team',
  'Good career opportunities are here'
]

# Tokenize every sentence on whitespace and keep each distinct word once,
# in first-seen order. dict.fromkeys de-duplicates while preserving
# insertion order, avoiding a linear list scan for every token.
vocabulary = list(dict.fromkeys(
    word for sentence in corpus for word in sentence.split()
))
print(vocabulary)

# Build a word-to-index mapping. Indices start at 1, so 0 never appears
# in an encoded sentence.
word_to_index = {word: idx for idx, word in enumerate(vocabulary, start=1)}
print(word_to_index)

# Encode each sentence as the list of indices of its words.
encoded_corpus = [
    [word_to_index[word] for word in sentence.split()]
    for sentence in corpus
]
print(encoded_corpus)

# Print the encoded corpus, one sentence per line.
for encoded_sentence in encoded_corpus:
    print(encoded_sentence)



['I', 'like', 'working', 'at', 'Fusemachines', 'has', 'a', 'skilled', 'team', 'Good', 'career', 'opportunities', 'are', 'here']
{'I': 1, 'like': 2, 'working': 3, 'at': 4, 'Fusemachines': 5, 'has': 6, 'a': 7, 'skilled': 8, 'team': 9, 'Good': 10, 'career': 11, 'opportunities': 12, 'are': 13, 'here': 14}
[[1, 2, 3, 4, 5], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]
[1, 2, 3, 4, 5]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]


### Bag of words (BOW)

In [13]:
from nltk.corpus import stopwords



# Load the English stop-word set once; it is the same for every sentence,
# so there is no need to rebuild it inside the loop.
stop_words = set(stopwords.words('english'))

# Initialize an empty vocabulary list
vocabulary = []

# Tokenize the sentences into words and build the vocabulary:
# unique non-stop-words in first-seen order.
for sentence in corpus:
    for word in sentence.split():
        # Stop-word matching is done on the lowercased word, but the word is
        # stored in the vocabulary with its original casing.
        if word.lower() not in stop_words and word not in vocabulary:
            vocabulary.append(word)

# Create a dictionary to map words to their (0-based) indices in the vocabulary
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# Function to compute Bag of Words (BoW) representation of a sentence
def bow(sentence, stop_words=None):
    '''
    Build the Bag of Words (BoW) count vector for a sentence.

    The sentence is split on whitespace, words whose lowercase form is a
    stop word are dropped, and every remaining word that is a key of the
    module-level ``word_to_index`` increments its slot in the vector.
    Words that are not in ``word_to_index`` are ignored.

    Parameters
    ----------
    sentence : str
        The text to vectorize.
    stop_words : collection of str, optional
        Lowercase words to exclude. When omitted, the NLTK English
        stop-word list is loaded, matching the previous behavior. Pass a
        precomputed set to avoid reloading the list on every call.

    Returns
    -------
    list of int
        A list of length ``len(vocabulary)`` with the count of each
        vocabulary word in the sentence.
    '''
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Initialize a vector of zeros with the same length as the vocabulary
    bow_vector = [0] * len(vocabulary)

    # Count word occurrences and update the BoW vector
    for word in sentence.split():
        if word.lower() in stop_words:
            continue
        # Lookup is case-sensitive: the word must match a vocabulary entry exactly.
        position = word_to_index.get(word)
        if position is not None:
            bow_vector[position] += 1
    return bow_vector

# Vectorize every sentence of the corpus with bow().
corpus_bow = list(map(bow, corpus))

# Show the vocabulary, then each sentence's BoW vector, numbered from 1.
print("Vocabulary:", vocabulary)
for sentence_number, sentence_vector in enumerate(corpus_bow, start=1):
    print(f"Sentence {sentence_number} BoW:", sentence_vector)

Vocabulary: ['like', 'working', 'Fusemachines', 'skilled', 'team', 'Good', 'career', 'opportunities']
Sentence 1 BoW: [1, 1, 1, 0, 0, 0, 0, 0]
Sentence 2 BoW: [0, 0, 1, 1, 1, 0, 0, 0]
Sentence 3 BoW: [0, 0, 0, 0, 0, 1, 1, 1]
