In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import pandas as pd

# Toy corpus shared by every vectorisation example below.
corpus = [
    # Repeated words in mixed case ("Dog"/"DOG", "What"/"what", "Sat"/"sat").
    "The cat sat on the mat the What what Sat Dog DOG DOG Dog Dog",
    # Short sentence that shares "the", "dog", "sat" and "on" with the first.
    "The dog sat on the log",
    # Sentence that mentions both "cat" and "dog".
    "The cat and the dog are friends",
]

In [None]:
# Bag of words: learn the vocabulary from the corpus first, then count how
# often each vocabulary term occurs in every document.
vectorizer = CountVectorizer().fit(corpus)
bow_matrix = vectorizer.transform(corpus)

In [None]:
# Wrap the count matrix in a DataFrame whose columns are the vocabulary
# terms, so each column of counts is readable at a glance.
bow_terms = vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_terms)
print("Bag of Words (Raw Counts):", df_bow, sep="\n")

Bag of Words (Raw Counts):
   and  are  cat  dog  friends  log  mat  on  sat  the  what
0    0    0    1    5        0    0    1   1    2    3     2
1    0    0    0    1        0    1    0   1    1    2     0
2    1    1    1    1        1    0    0   0    0    2     0


In [None]:
# L1 normalisation: divide every document row by its total token count so
# that each row sums to 1 (relative term frequency).
vectorizer_norm = CountVectorizer()
counts = vectorizer_norm.fit_transform(corpus).toarray()
# keepdims=True keeps the per-row totals as a column vector, so the division
# below broadcasts across each row (row / sum of row).
row_totals = counts.sum(axis=1, keepdims=True)
normalized_counts = counts / row_totals

In [None]:

# Label the relative-frequency columns with their vocabulary terms and show
# the table rounded to two decimals.
norm_terms = vectorizer_norm.get_feature_names_out()
df_norm = pd.DataFrame(data=normalized_counts, columns=norm_terms)
print("\nNormalized Counts (Relative Frequency):", df_norm.round(2), sep="\n")


Normalized Counts (Relative Frequency):
    and   are   cat   dog  friends   log   mat    on   sat   the  what
0  0.00  0.00  0.07  0.33     0.00  0.00  0.07  0.07  0.13  0.20  0.13
1  0.00  0.00  0.00  0.17     0.00  0.17  0.00  0.17  0.17  0.33  0.00
2  0.14  0.14  0.14  0.14     0.14  0.00  0.00  0.00  0.00  0.29  0.00


In [None]:
# TF-IDF: learn the vocabulary and IDF weights from the corpus, then project
# the same documents into the weighted term space.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit(corpus).transform(corpus)

In [None]:
# Tabulate the TF-IDF weights with one column per vocabulary term and print
# them rounded to three decimals.
tfidf_terms = tfidf_vec.get_feature_names_out()
df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)
print("\nTF-IDF Matrix:", df_tfidf.round(3), sep="\n")


TF-IDF Matrix:
     and    are    cat    dog  friends    log    mat     on    sat    the  \
0  0.000  0.000  0.169  0.655    0.000  0.000  0.222  0.169  0.337  0.393   
1  0.000  0.000  0.000  0.299    0.000  0.506  0.000  0.385  0.385  0.598   
2  0.433  0.433  0.330  0.256    0.433  0.000  0.000  0.000  0.000  0.512   

    what  
0  0.444  
1  0.000  
2  0.000  


In [None]:
# Word2Vec expects each document as a list of tokens: lowercase every
# document, then split it on whitespace.
tokenized_corpus = list(map(str.split, map(str.lower, corpus)))

In [None]:
# Train a small Word2Vec model on the tokenised corpus.
# - vector_size=10: tiny embeddings are enough for a toy vocabulary.
# - min_count=1: keep every word, even those that occur only once.
# - seed=1 (gensim's documented default, made explicit) with workers=1:
#   multi-threaded training orders updates nondeterministically, so a single
#   worker is needed for vectors that are reproducible across runs.
#   NOTE(review): gensim also documents PYTHONHASHSEED as a requirement for
#   reproducibility across interpreter launches - confirm whether it must be
#   set for this environment.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [None]:
# Use one query word for the lookup, the printed labels and the similarity
# query, so the vector shown is always the vector of the word named in the
# heading (previously the vector for 'dog' was printed under a 'cat' label).
query_word = 'cat'
word_vector = w2v_model.wv[query_word]

print(f"\nWord2Vec Vector for '{query_word}':")
print(word_vector)

# Find most similar words
print(f"\nWords similar to '{query_word}':")
print(w2v_model.wv.most_similar(query_word))


Word2Vec Vector for 'cat':
[-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]

Words similar to 'cat':
[('log', 0.29140493273735046), ('and', 0.27556222677230835), ('on', 0.24952401220798492), ('mat', 0.22384946048259735), ('what', 0.09267307072877884), ('are', -0.02181689254939556), ('friends', -0.042645372450351715), ('sat', -0.1517048180103302), ('dog', -0.2726021111011505), ('the', -0.3820516765117645)]


In [None]:
# Use one query word for the lookup, the printed labels and the similarity
# query, so the vector shown is always the vector of the word named in the
# heading (previously the vector for 'on' was printed under a 'mat' label).
query_word = 'mat'
word_vector = w2v_model.wv[query_word]

print(f"\nWord2Vec Vector for '{query_word}':")
print(word_vector)

# Find most similar words
print(f"\nWords similar to '{query_word}':")
print(w2v_model.wv.most_similar(query_word))


Word2Vec Vector for 'mat':
[ 0.01631476  0.00189917  0.03473637  0.00217777  0.09618826  0.05060603
 -0.0891739  -0.0704156   0.00901456  0.06392534]

Words similar to 'mat':
[('friends', 0.2914133369922638), ('dog', 0.05541810393333435), ('cat', 0.042647670954465866), ('log', -0.021763404831290245), ('are', -0.1986347883939743), ('and', -0.21799445152282715), ('on', -0.2315719574689865), ('sat', -0.5145737528800964), ('the', -0.7287455797195435)]
