# Text Data Encoding

In [36]:
# Import Required Libraries
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.models import Word2Vec

In [37]:
# Sample Corpus
corpus = [
    "The cat sat on the mat.",
    "The dog played in the park.",
    "Cats and dogs are great pets."
]

### 1. One-Hot Encoding using Sklearn


In [38]:
vectorizer = CountVectorizer(binary=True)
one_hot_encoded = vectorizer.fit_transform(corpus).toarray()
print("One-Hot Encoded Matrix:")
print(one_hot_encoded)

One-Hot Encoded Matrix:
[[0 0 1 0 0 0 0 0 1 1 0 0 0 1 1]
 [0 0 0 0 1 0 0 1 0 0 1 0 1 0 1]
 [1 1 0 1 0 1 1 0 0 0 0 1 0 0 0]]


In [39]:
### Exercise: Make it yourself without using libraries
def tokenize(text):
    return text.lower().replace(".", "").replace(",", "").split()

def build_vocab(corpus):
    vocab = set()
    for sentence in corpus:
        vocab.update(tokenize(sentence))
    return sorted(vocab)  # Sorting to ensure consistent ordering

def one_hot_encoding(corpus):
    vocab = build_vocab(corpus)
    word_to_index = {word: idx for idx, word in enumerate(vocab)}

    one_hot_vectors = []
    for sentence in corpus:
        tokenized_sentence = tokenize(sentence)
        vector = [1 if word in tokenized_sentence else 0 for word in vocab]
        one_hot_vectors.append(vector)

    return vocab, one_hot_vectors

vocab, one_hot_vectors = one_hot_encoding(corpus)
print("Vocabulary:", vocab)
print("One-hot encoded vectors:")
for vector in one_hot_vectors:
    print(vector)


Vocabulary: ['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'great', 'in', 'mat', 'on', 'park', 'pets', 'played', 'sat', 'the']
One-hot encoded vectors:
[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1]
[1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]


### 2. Index-Based Encoding

In [40]:
word_to_index = {word: idx for idx, word in enumerate(set(" ".join(corpus).split()))}
index_encoded = [[word_to_index[word] for word in sentence.split()] for sentence in corpus]
print("Index-Based Encoding:")
print(index_encoded)

Index-Based Encoding:
[[7, 9, 14, 1, 2, 6], [7, 8, 15, 13, 2, 12], [4, 0, 5, 10, 11, 3]]


### 3. Bag of Words (using One Hot Encoding)

In [41]:
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(corpus).toarray()
print("\nBag of Words Matrix:")
print(bow_matrix)


Bag of Words Matrix:
[[0 0 1 0 0 0 0 0 1 1 0 0 0 1 2]
 [0 0 0 0 1 0 0 1 0 0 1 0 1 0 2]
 [1 1 0 1 0 1 1 0 0 0 0 1 0 0 0]]


### 4. TF-IDF

In [42]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).toarray()
print("\nTF-IDF Matrix:")
print(tfidf_matrix)


TF-IDF Matrix:
[[0.         0.         0.39798027 0.         0.         0.
  0.         0.         0.39798027 0.39798027 0.         0.
  0.         0.39798027 0.60534851]
 [0.         0.         0.         0.         0.39798027 0.
  0.         0.39798027 0.         0.         0.39798027 0.
  0.39798027 0.         0.60534851]
 [0.40824829 0.40824829 0.         0.40824829 0.         0.40824829
  0.40824829 0.         0.         0.         0.         0.40824829
  0.         0.         0.        ]]


In [43]:
### Exercise: Make it yourself without using libraries
import math

def compute_tf(corpus, vocab):
    """Computes term frequency (TF) for each document."""
    tf_list = []
    for sentence in corpus:
        tokenized_sentence = tokenize(sentence)
        tf_dict = {word: tokenized_sentence.count(word) / len(tokenized_sentence) for word in vocab}
        tf_list.append(tf_dict)
    return tf_list

def compute_idf(corpus, vocab):
    """Computes inverse document frequency (IDF)."""
    num_docs = len(corpus)
    idf_dict = {}
    for word in vocab:
        doc_count = sum(1 for sentence in corpus if word in tokenize(sentence))
        idf_dict[word] = math.log((num_docs + 1) / (doc_count + 1)) + 1  # Smoothing
    return idf_dict

def compute_tfidf(corpus):
    """Computes TF-IDF vectors for the corpus."""
    vocab = build_vocab(corpus)
    tf_list = compute_tf(corpus, vocab)
    idf_dict = compute_idf(corpus, vocab)

    tfidf_vectors = []
    for tf_dict in tf_list:
        tfidf_vectors.append([tf_dict[word] * idf_dict[word] for word in vocab])

    return vocab, tfidf_vectors

vocab, tfidf_vectors = compute_tfidf(corpus)
print("Vocabulary:", vocab, "\n")
print("TF-IDF encoded vectors:\n")
for vector in tfidf_vectors:
    print(vector, "\n")

Vocabulary: ['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'great', 'in', 'mat', 'on', 'park', 'pets', 'played', 'sat', 'the'] 

TF-IDF encoded vectors:

[0.0, 0.0, 0.2821911967599909, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2821911967599909, 0.2821911967599909, 0.0, 0.0, 0.0, 0.2821911967599909, 0.42922735748392693] 

[0.0, 0.0, 0.0, 0.0, 0.2821911967599909, 0.0, 0.0, 0.2821911967599909, 0.0, 0.0, 0.2821911967599909, 0.0, 0.2821911967599909, 0.0, 0.42922735748392693] 

[0.2821911967599909, 0.2821911967599909, 0.0, 0.2821911967599909, 0.0, 0.2821911967599909, 0.2821911967599909, 0.0, 0.0, 0.0, 0.0, 0.2821911967599909, 0.0, 0.0, 0.0] 



### 5. Word2Vec Implementation

In [44]:
# Tokenizing sentences
sentences = [sentence.lower().split() for sentence in corpus]

In [45]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=10, window=2, min_count=1, workers=4)

In [46]:
# Example: Get vector for 'cat'
print("\nWord2Vec Embedding for 'cat':")
print(word2vec_model.wv['cat'])

# Example: Similar words to 'dog'
print("\nWords similar to 'dog':")
print(word2vec_model.wv.most_similar('dog'))


Word2Vec Embedding for 'cat':
[ 0.07898068 -0.06989504 -0.09155865 -0.00355753 -0.03099841  0.07894317
  0.05938574 -0.01545663  0.01510963  0.01790041]

Words similar to 'dog':
[('great', 0.4705304801464081), ('the', 0.4318247437477112), ('and', 0.22384949028491974), ('are', 0.19903580844402313), ('mat.', 0.09823879599571228), ('park.', 0.05098552256822586), ('pets.', 0.03376540169119835), ('dogs', 0.026828058063983917), ('in', -0.08308950811624527), ('on', -0.09835172444581985)]


# Exercises

In [47]:
### Exercise (One-Hot Encoding - Sklearn)
"""
Task:
Use CountVectorizer with binary=True to apply one-hot encoding on the following sentences:

corpus = [
    "I love programming.",
    "Programming is fun.",
    "I love fun activities."
]
Print the resulting one-hot encoded matrix.
Print the vocabulary mapping.
"""
from sklearn.feature_extraction.text import CountVectorizer

# Corpus
corpus = [
    "I love programming.",
    "Programming is fun.",
    "I love fun activities."
]

# Initialize CountVectorizer with binary=True
vectorizer = CountVectorizer(binary=True)

# Fit and transform the corpus
one_hot_matrix = vectorizer.fit_transform(corpus).toarray()

# Get the vocabulary mapping
vocab = vectorizer.vocabulary_

# Print results
print("One-hot encoded matrix:")
print(one_hot_matrix)
print("\nVocabulary mapping:\n")
print(vocab)

One-hot encoded matrix:
[[0 0 0 1 1]
 [0 1 1 0 1]
 [1 1 0 1 0]]

Vocabulary mapping:

{'love': 3, 'programming': 4, 'is': 2, 'fun': 1, 'activities': 0}


In [48]:
### Exercise (TF-IDF - Sklearn + Manual Calculation)
"""
Task:

Compute the TF-IDF matrix using TfidfVectorizer for the following sentences:
corpus = [
    "Machine learning is fascinating.",
    "Deep learning is a subset of machine learning.",
    "Neural networks are used in deep learning."
]
Manually compute Term Frequency (TF) for the word "learning" in each document.
Manually compute Inverse Document Frequency (IDF) for "learning".
Compare the manually computed TF-IDF for "learning" with the value from Sklearn.
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import math

# Define the corpus
corpus = [
    "Machine learning is fascinating.",
    "Deep learning is a subset of machine learning.",
    "Neural networks are used in deep learning."
]

print("## Step 1: Compute TF-IDF matrix using TfidfVectorizer")

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Feature names:", vectorizer.get_feature_names_out())

print("\n## Step 2: Manually compute Term Frequency (TF) for 'learning'")

def compute_tf(word, doc):
    words = doc.lower().split()
    return words.count(word.lower()) / len(words)

tf_learning = [compute_tf("learning", doc) for doc in corpus]

for i, tf in enumerate(tf_learning):
    print(f"TF('learning') in document {i+1}: {tf:.4f}")

print("\n## Step 3: Manually compute Inverse Document Frequency (IDF) for 'learning'")

def compute_idf(word, corpus):
    N = len(corpus)
    n = sum(1 for doc in corpus if word.lower() in doc.lower())
    return (math.log(N + 1 / n + 1)) + 1

idf_learning = compute_idf("learning", corpus)
print(f"IDF('learning'): {idf_learning:.4f}")

print("\n## Step 4: Manually compute TF-IDF for 'learning'")

tfidf_learning_manual = [tf * idf_learning for tf in tf_learning]

for i, tfidf in enumerate(tfidf_learning_manual):
    print(f"TF-IDF('learning') in document {i+1}: {tfidf:.4f}")

print("\n## Step 5: Compare with sklearn's output")

feature_names = vectorizer.get_feature_names_out()
learning_index = list(feature_names).index('learning')
sklearn_tfidf_values = tfidf_matrix.toarray()[:, learning_index]

print("Sklearn TF-IDF values for 'learning':")
for i, value in enumerate(sklearn_tfidf_values):
    print(f"Document {i+1}: {value:.4f}")

print("Manual TF-IDF values:")
print(tfidf_learning_manual)
print("\nSklearn TF-IDF values:")
print(sklearn_tfidf_values)

print("\nDifferences:")
for i, (manual, sklearn) in enumerate(zip(tfidf_learning_manual, sklearn_tfidf_values)):
    print(f"Document {i+1}: {sklearn - manual:.4f}")

## Step 1: Compute TF-IDF matrix using TfidfVectorizer
TF-IDF matrix shape: (3, 12)
Feature names: ['are' 'deep' 'fascinating' 'in' 'is' 'learning' 'machine' 'networks'
 'neural' 'of' 'subset' 'used']

## Step 2: Manually compute Term Frequency (TF) for 'learning'
TF('learning') in document 1: 0.2500
TF('learning') in document 2: 0.1250
TF('learning') in document 3: 0.0000

## Step 3: Manually compute Inverse Document Frequency (IDF) for 'learning'
IDF('learning'): 2.4663

## Step 4: Manually compute TF-IDF for 'learning'
TF-IDF('learning') in document 1: 0.6166
TF-IDF('learning') in document 2: 0.3083
TF-IDF('learning') in document 3: 0.0000

## Step 5: Compare with sklearn's output
Sklearn TF-IDF values for 'learning':
Document 1: 0.3731
Document 2: 0.5215
Document 3: 0.2426
Manual TF-IDF values:
[0.6165842671983568, 0.3082921335991784, 0.0]

Sklearn TF-IDF values:
[0.37311881 0.52150095 0.2425937 ]

Differences:
Document 1: -0.2435
Document 2: 0.2132
Document 3: 0.2426


In [49]:
### Exercise (Word2Vec - Custom Training & Exploration)
"""
Task:

Train a Word2Vec model using the following corpus:
corpus = [
    "Artificial intelligence is transforming the world.",
    "Machine learning and deep learning are part of AI.",
    "Neural networks power many AI applications."
]
Find and print the vector representation of the word "AI".
Find and print the most similar words to "learning".
Generate a new sentence and infer its most relevant words based on the trained model.
"""
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np

print("## Step 1: Prepare the corpus and train the Word2Vec model")

corpus = [
    "Artificial intelligence is transforming the world.",
    "Machine learning and deep learning are part of AI.",
    "Neural networks power many AI applications."
]

# Preprocess the corpus
processed_corpus = [simple_preprocess(sentence) for sentence in corpus]

# Train the Word2Vec model
model = Word2Vec(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=4)


print("\n## Step 2: Find and print the vector representation of the word 'AI'")

try:
    ai_vector = model.wv['ai']
    print("Vector representation of 'AI':")
    print(ai_vector)
except KeyError:
    print("The word 'AI' is not in the vocabulary. Try 'artificial' or 'intelligence' instead.")
    print("Vector representation of 'artificial':", model.wv['artificial'])

print("\n## Step 3: Find and print the most similar words to 'learning'")

try:
    similar_words = model.wv.most_similar('learning', topn=3)
    print("Most similar words to 'learning':")
    for word, score in similar_words:
        print(f"{word}: {score:.4f}")
except KeyError:
    print("The word 'learning' is not in the vocabulary or doesn't have enough context.")

print("\n## Step 4: Generate a new sentence and infer its most relevant words")

new_sentence = "AI systems can process and analyze data quickly."
processed_sentence = simple_preprocess(new_sentence)

# Calculate the average vector for the sentence
sentence_vector = np.mean([model.wv[word] for word in processed_sentence if word in model.wv], axis=0)

# Find the most similar words to the sentence vector
most_relevant_words = model.wv.similar_by_vector(sentence_vector, topn=5)

print("Generated sentence:", new_sentence)
print("Most relevant words based on the trained model:")
for word, score in most_relevant_words:
    print(f"{word}: {score:.4f}")

## Step 1: Prepare the corpus and train the Word2Vec model

## Step 2: Find and print the vector representation of the word 'AI'
Vector representation of 'AI':
[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -