In [None]:
# One-hot encoding, i.e. converting each category (here: each word) into a binary vector that has a 1 at the category's index and 0 everywhere else.

In [None]:
# Steps to follow:

# Convert Text to lower case
# Tokenize the text
# Get unique words
# Get the integer/position of the words
# create a vector of each word by marking its position as 1 and rest as 0
# create a matrix of the found vectors.


##  Convert Using Sklearn

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Step 1: Convert text to lower case
text = "Can I eat the Pizza"
text = text.lower()

# Step 2: Tokenize the text (a plain whitespace split is enough for this sentence)
tokens = text.split()
print("Tokens:", tokens)

# Step 3: Get unique words
# Sorting makes the vocabulary order deterministic (iterating a bare set can differ
# between runs) and lines it up with the alphabetical indices produced in Step 4.
unique_words = sorted(set(tokens))
print("Unique Words:", unique_words)

# Step 4: Get the integer/position of the words
# Each token is mapped to the index of its word in the fitted vocabulary.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(tokens)
print("Word Index:", integer_encoded)

# Step 5: One-hot encoding using scikit-learn
# `sparse_output=False` returns a dense NumPy array. It replaces the old `sparse`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
one_hot_encoder = OneHotEncoder(sparse_output=False)
# The encoder expects 2-D input, so the tokens are reshaped into a single column
# (one row per token).
encoded_matrix = one_hot_encoder.fit_transform(np.array(tokens).reshape(-1, 1))
print("One-Hot Encoded Matrix:\n", encoded_matrix)

Tokens: ['can', 'i', 'eat', 'the', 'pizza']
Unique Words: ['can', 'eat', 'i', 'pizza', 'the']
Word Index: [0 2 1 4 3]
One-Hot Encoded Matrix:
 [[1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]]




In [None]:
# Note :

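# np.array(tokens).reshape(-1, 1) reshapes the 1-D list of tokens into a 2-D column array (one row per token, one column), which is the input shape OneHotEncoder expects, i.e.: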

# array([['can'],
#        ['i'],
#        ['eat'],
#        ['the'],
#        ['pizza']], dtype='<U5')

In [None]:
# Bag-of-Words with Python example

In [None]:
# Let’s look at an easy example to understand the concepts previously explained. We could be interested in analyzing the reviews about Game of Thrones:

# Review 1: Game of Thrones is an amazing tv series!

# Review 2: Game of Thrones is the best tv series!

# Review 3: Game of Thrones is so great

In [None]:
# Steps:

# Step 1: Convert text to lowercase

# step 2: CountVectorizer, for a matrix creation where each row represents a review

# Step 3: Tokenization (CountVectorizer handles tokenization)

# Step 4: Get the feature names (unique words)

# Output: a bag-of-words matrix in which each row represents a review and each column represents
# a unique word in the corpus. The values in the matrix represent the count of each word in the
# corresponding review.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: one review per list entry.
reviews = [
    "Game of Thrones is an amazing tv series!",
    "Game of Thrones is the best tv series!",
    "Game of Thrones is so great",
]

# Step 1: lower-case every review
reviews_lower = list(map(str.lower, reviews))

# Step 2: create the CountVectorizer; it will produce a matrix with one row per review
# and one column per unique word in the corpus
vectorizer = CountVectorizer()

# Step 3: tokenization is done by CountVectorizer itself while fitting;
# fit on the corpus and transform it in a single call
X = vectorizer.fit_transform(reviews_lower)

# Step 4: the fitted vectorizer exposes the learned vocabulary (unique words)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Unique Words):", feature_names)

# Show the heading and the dense count matrix, each starting on its own line
print("\nBag-of-Words Matrix:", X.toarray(), sep="\n")

Feature Names (Unique Words): ['amazing' 'an' 'best' 'game' 'great' 'is' 'of' 'series' 'so' 'the'
 'thrones' 'tv']

Bag-of-Words Matrix:
[[1 1 0 1 0 1 1 1 0 0 1 1]
 [0 0 1 1 0 1 1 1 0 1 1 1]
 [0 0 0 1 1 1 1 0 1 0 1 0]]


In [None]:
# Term Frequency Inverse Document Frequency (TFIDF)

# TFIDF = TF * IDF

# TF(term) = Number of times the term appears in document / total number of terms in the document

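# IDF(term) = log(total number of documents / Number of documents with term in it)

# Note: scikit-learn's TfidfVectorizer (with default settings) uses a slightly different variant:
# TF is the raw term count, IDF is smoothed as ln((1 + number of documents) / (1 + number of documents with the term)) + 1,
# and each document vector is then L2-normalized. Its scores therefore do not match the textbook formula above exactly.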

In [None]:
# Steps:
# Convert Text to Lower Case: (Handled by TfidfVectorizer)
# Remove Stop Words: Using the built-in English stop words list from scikit-learn.
# Tokenize and Vectorize: Using TfidfVectorizer to create TF-IDF vectors.
# Output Feature Names and TF-IDF Matrix: Print the feature names and the resulting TF-IDF matrix.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two sample documents; the long literals are split across lines purely for
# readability (adjacent string literals are concatenated into one string).
sentence1 = (
    "Go until jurong point, crazy.. Available only in bugis n great "
    "world la e buffet... Cine there got amore wat..."
)
sentence2 = (
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. "
    "Text FA to 87121 to receive entry question(std txt rate)T&C's apply "
    "08452810075over18's"
)
data = [sentence1, sentence2]

# Build the TF-IDF model with scikit-learn's built-in English stop-word list,
# then fit it on the corpus and transform the corpus in one step.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
X = tfidf_vectorizer.fit_transform(data)

# Vocabulary first, then the dense TF-IDF matrix (one row per sentence).
print(tfidf_vectorizer.get_feature_names_out())
print(X.toarray())

['08452810075over18' '2005' '21st' '87121' 'amore' 'apply' 'available'
 'buffet' 'bugis' 'cine' 'comp' 'crazy' 'cup' 'entry' 'fa' 'final' 'free'
 'got' 'great' 'jurong' 'la' 'point' 'question' 'rate' 'receive' 'std'
 'text' 'tkts' 'txt' 'wat' 'win' 'wkly' 'world']
[[0.         0.         0.         0.         0.2773501  0.
  0.2773501  0.2773501  0.2773501  0.2773501  0.         0.2773501
  0.         0.         0.         0.         0.         0.2773501
  0.2773501  0.2773501  0.2773501  0.2773501  0.         0.
  0.         0.         0.         0.         0.         0.2773501
  0.         0.         0.2773501 ]
 [0.19611614 0.19611614 0.19611614 0.19611614 0.         0.19611614
  0.         0.         0.         0.         0.19611614 0.
  0.19611614 0.39223227 0.39223227 0.19611614 0.19611614 0.
  0.         0.         0.         0.         0.19611614 0.19611614
  0.19611614 0.19611614 0.19611614 0.19611614 0.19611614 0.
  0.19611614 0.19611614 0.        ]]


In [None]:
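# 0.2773501 for 'available': The word "available" appears in sentence1 and has a certain importance, as indicated by its TF-IDF score.
# (All 13 remaining terms of sentence1 occur once and only in sentence1, so after L2 normalization each of them scores 1/sqrt(13) ≈ 0.2773501.)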
# 0.0 for '2005': The word "2005" does not appear in sentence1, hence its TF-IDF score is 0.