In [2]:
!pip install gensim
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Download tokenizer for Word2Vec preprocessing
nltk.download('punkt')

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Toy corpus: three short sentences that the vectorizers in this notebook are
# fitted on.
corpus = [
    "I love machine learning and deep learning",
    "I love coding in python",
    "Machine learning is fun and exciting",
]

# Echo the documents with 1-based labels (Doc 1 is corpus[0], and so on).
print("--- INPUT CORPUS ---")
numbered_docs = [f"Doc {i+1}: {doc}" for i, doc in enumerate(corpus)]
print("\n".join(numbered_docs))
print("\n" + "="*50 + "\n")

--- INPUT CORPUS ---
Doc 1: I love machine learning and deep learning
Doc 2: I love coding in python
Doc 3: Machine learning is fun and exciting




In [4]:
print("--- 1. BAG OF WORDS (BoW) ---")

# A. Raw counts
# Fit a CountVectorizer on the corpus. The result has one row per document and
# one column per vocabulary term; each cell is how often that term occurs in
# that document.
count_vect = CountVectorizer()
bow_matrix = count_vect.fit_transform(corpus)

# Wrap the dense counts in a DataFrame so every column carries its term label.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vect.get_feature_names_out(),
)
print("A. Raw Count Occurrence Matrix:")
print(bow_df)
print("\n")

# B. Normalized counts (term frequency)
# L1-normalizing each row makes it sum to 1, so documents of different lengths
# become comparable. A second vectorizer is fitted so this section is
# self-contained.
norm_count_vect = CountVectorizer()
bow_matrix_raw = norm_count_vect.fit_transform(corpus).toarray()

# Divide every count by its row total, i.e. by the number of tokens the
# vectorizer counted for that document (not necessarily the number of words in
# the sentence). keepdims=True keeps the totals as an (n_docs, 1) column so the
# division broadcasts across the vocabulary columns.
normalized_matrix = bow_matrix_raw / bow_matrix_raw.sum(axis=1, keepdims=True)

norm_bow_df = pd.DataFrame(
    data=normalized_matrix,
    columns=norm_count_vect.get_feature_names_out(),
)
print("B. Normalized Count Matrix (Term Frequency):")
print(norm_bow_df.round(2))  # two decimals keep the table readable
print("\n" + "="*50 + "\n")

--- 1. BAG OF WORDS (BoW) ---
A. Raw Count Occurrence Matrix:
   and  coding  deep  exciting  fun  in  is  learning  love  machine  python
0    1       0     1         0    0   0   0         2     1        1       0
1    0       1     0         0    0   1   0         0     1        0       1
2    1       0     0         1    1   0   1         1     0        1       0


B. Normalized Count Matrix (Term Frequency):
    and  coding  deep  exciting   fun    in    is  learning  love  machine  \
0  0.17    0.00  0.17      0.00  0.00  0.00  0.00      0.33  0.17     0.17   
1  0.00    0.25  0.00      0.00  0.00  0.25  0.00      0.00  0.25     0.00   
2  0.17    0.00  0.00      0.17  0.17  0.00  0.17      0.17  0.00     0.17   

   python  
0    0.00  
1    0.25  
2    0.00  




In [5]:
print("--- 2. TF-IDF ---")

# Fit a TfidfVectorizer on the corpus: tokenization and vocabulary building
# happen in the same call that produces the documents x terms weight matrix.
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(corpus)

# Label each column with its vocabulary term for display.
tfidf_terms = tfidf_vect.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)

print("TF-IDF Matrix:")
print(tfidf_df.round(2))  # two decimals keep the table readable
print("\n" + "="*50 + "\n")

--- 2. TF-IDF ---
TF-IDF Matrix:
    and  coding  deep  exciting   fun    in    is  learning  love  machine  \
0  0.34    0.00  0.45      0.00  0.00  0.00  0.00      0.68  0.34     0.34   
1  0.00    0.53  0.00      0.00  0.00  0.53  0.00      0.00  0.40     0.00   
2  0.35    0.00  0.00      0.46  0.46  0.00  0.46      0.35  0.00     0.35   

   python  
0    0.00  
1    0.53  
2    0.00  




In [7]:
# word_tokenize needs NLTK's 'punkt_tab' tokenizer data. It is fetched here so
# this cell does not rely on an earlier download having happened. (nltk itself
# is already imported in the first cell of the notebook.)
nltk.download('punkt_tab')

print("--- 3. WORD2VEC ---")

# Word2Vec expects a list of tokenized sentences (a list of lists of words).
# Lower-casing first makes "Machine" and "machine" share one vocabulary entry.
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
print(f"Tokenized Input for Word2Vec: {tokenized_corpus}\n")

# Train the Word2Vec model.
# vector_size=10: each word gets a 10-dimensional vector (usually 100-300 in real use)
# window=5:       context window size
# min_count=1:    drop words seen fewer than 1 time, i.e. keep every word
# workers=1:      a single training thread, so thread scheduling cannot change the result
# seed=1:         gensim's default seed, stated explicitly so the run is repeatable
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Look up the learned vector for one word (e.g., 'learning').
word_to_check = 'learning'
vector = model.wv[word_to_check]

# Report the dimensionality from the model itself so the label cannot drift
# away from the vector_size passed above.
print(f"Vector for word '{word_to_check}' (Size {model.wv.vector_size}):")
print(vector)

# Rank the other vocabulary words by cosine similarity to the chosen word.
# Note: with such a tiny dataset, similarity results won't be very meaningful yet.
print(f"\nMost similar words to '{word_to_check}':")
print(model.wv.most_similar(word_to_check))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


--- 3. WORD2VEC ---
Tokenized Input for Word2Vec: [['i', 'love', 'machine', 'learning', 'and', 'deep', 'learning'], ['i', 'love', 'coding', 'in', 'python'], ['machine', 'learning', 'is', 'fun', 'and', 'exciting']]

Vector for word 'learning' (Size 10):
[-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]

Most similar words to 'learning':
[('and', 0.5436005592346191), ('coding', 0.43182477355003357), ('python', 0.37924280762672424), ('machine', 0.3004249036312103), ('deep', 0.22743143141269684), ('love', 0.10494352877140045), ('is', -0.1311161071062088), ('fun', -0.18982969224452972), ('i', -0.22418655455112457), ('exciting', -0.2726021111011505)]
