# Text Data Encoding

In [None]:
import gensim
# Import Required Libraries
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Sample corpus: three short English sentences that serve as input to the
# encoding demos in this notebook.
corpus = [
    "The cat sat on the mat.",
    "The dog played in the park.",
    "Cats and dogs are great pets.",
]

### 1. One-Hot Encoding using Sklearn


In [None]:
# Binary document-term matrix: with binary=True every non-zero count is
# reported as 1, so a cell only says whether the term occurs in the sentence.
vectorizer = CountVectorizer(binary=True)
one_hot_encoded = (
    vectorizer
    .fit_transform(corpus)  # learn the vocabulary and encode in one pass
    .toarray()              # sparse result -> dense NumPy array for printing
)
print("One-Hot Encoded Matrix:", one_hot_encoded, sep="\n")

In [None]:
### Exercise: Implement one-hot encoding yourself without using libraries


### 2. Index-Based Encoding

In [None]:
# Map every distinct whitespace-separated token to an integer index.
# The unique tokens are sorted before numbering: enumerating a bare set would
# assign indices in hash order, which for strings can differ from one
# interpreter run to the next, making the printed encoding unrepeatable.
# Tokens are taken verbatim, so case and punctuation are significant
# ("The" != "the", "mat." keeps its full stop).
word_to_index = {
    word: idx
    for idx, word in enumerate(sorted(set(" ".join(corpus).split())))
}
# Replace each token of each sentence with its vocabulary index.
index_encoded = [[word_to_index[word] for word in sentence.split()] for sentence in corpus]
print("Index-Based Encoding:")
print(index_encoded)

### 3. Bag of Words (Term Counts)

In [None]:
# Bag of Words: a default CountVectorizer reports how many times each
# vocabulary term occurs in each sentence.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(raw_documents=corpus).toarray()
print("\nBag of Words Matrix:", bow_matrix, sep="\n")

### 4. TF-IDF

In [None]:
# TF-IDF: fit the vectorizer on the corpus and convert the sparse result to a
# dense array so the weights can be printed directly.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = (
    tfidf_vectorizer
    .fit_transform(corpus)
    .toarray()
)
print("\nTF-IDF Matrix:", tfidf_matrix, sep="\n")

In [None]:
### Exercise: Implement TF-IDF yourself without using libraries


### 5. Word2Vec Implementation

In [None]:
# Tokenize each sentence for Word2Vec.
# simple_preprocess lowercases the text and keeps only word tokens, so the
# sentence-final words enter the vocabulary as "mat" / "park" / "pets" instead
# of "mat." / "park." / "pets." as a plain str.split() would leave them.
# NOTE: with its default settings it also discards tokens shorter than 2 or
# longer than 15 characters.
sentences = [gensim.utils.simple_preprocess(sentence) for sentence in corpus]

In [None]:
# Train a small Word2Vec model on the tokenized sentences.
# The seed is fixed and training is limited to one worker thread so that the
# vectors and neighbours printed afterwards are repeatable; multi-threaded
# training processes examples in a scheduling-dependent order.
# NOTE: gensim documents that reproducibility across separate interpreter
# launches additionally requires setting the PYTHONHASHSEED environment variable.
word2vec_model = Word2Vec(
    sentences,
    vector_size=10,  # dimensionality of each word vector
    window=2,        # maximum distance between the current and predicted word
    min_count=1,     # keep every word, even those that occur only once
    workers=1,       # single thread for deterministic training order
    seed=42,         # fixed seed for the random number generator
)

In [None]:
# Show the learned embedding vector for the token 'cat'.
print("\nWord2Vec Embedding for 'cat':")
print(word2vec_model.wv["cat"])

# Show the vocabulary entries the model ranks as closest to 'dog'.
print("\nWords similar to 'dog':")
print(word2vec_model.wv.most_similar(positive="dog"))

# Exercises

In [None]:
### Exercise (One-Hot Encoding - Sklearn)
"""
Task:
Use CountVectorizer with binary=True to apply one-hot encoding on the following sentences:

corpus = [
    "I love programming.",
    "Programming is fun.",
    "I love fun activities."
]
Print the resulting one-hot encoded matrix.
Print the vocabulary mapping.
"""


In [None]:
### Exercise (TF-IDF - Sklearn + Manual Calculation)
"""
Task:

Compute the TF-IDF matrix using TfidfVectorizer for the following sentences:
corpus = [
    "Machine learning is fascinating.",
    "Deep learning is a subset of machine learning.",
    "Neural networks are used in deep learning."
]
Manually compute Term Frequency (TF) for the word "learning" in each document.
Manually compute Inverse Document Frequency (IDF) for "learning".
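Compare the manually computed TF-IDF for "learning" with the value from Sklearn.
Note: by default TfidfVectorizer uses a smoothed IDF, ln((1 + n) / (1 + df)) + 1,
and L2-normalizes each document vector, so apply the same steps before comparing.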
"""


In [None]:
### Exercise (Word2Vec - Custom Training & Exploration)
"""
Task:

Train a Word2Vec model using the following corpus:
corpus = [
    "Artificial intelligence is transforming the world.",
    "Machine learning and deep learning are part of AI.",
    "Neural networks power many AI applications."
]
Find and print the vector representation of the word "AI" (look it up in the form your tokenizer produces, e.g. "ai" if you lowercase the text).
Find and print the most similar words to "learning".
Generate a new sentence and infer its most relevant words based on the trained model.
"""
