### Assignment 2


### Apply the bag-of-words approach (raw counts and normalized counts) and TF-IDF to the data, then create embeddings using Word2Vec


In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus shared by the cells below.
documents = [
    "I love machine learning",
    "Machine learning is fun",
    "I love deep learning",
]

# Learn the vocabulary first, then encode each document as term counts.
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(documents)
bow_matrix = vectorizer.transform(documents)

# Show the learned terms next to the dense document-term matrix.
print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Matrix:\n", bow_matrix.toarray())

Vocabulary: ['deep' 'fun' 'is' 'learning' 'love' 'machine']
BoW Matrix:
 [[0 0 0 1 1 1]
 [0 1 1 1 0 1]
 [1 0 0 1 1 0]]


In [2]:
import numpy as np

# Turn each document's raw term counts into relative frequencies.
bow_array = bow_matrix.toarray()
row_totals = bow_array.sum(axis=1, keepdims=True)

# A document with no in-vocabulary tokens has a row total of zero; a plain
# division would fill that row with NaN, so such rows are left as zeros.
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

print("Normalized BoW:\n", normalized_bow)

Normalized BoW:
 [[0.         0.         0.         0.33333333 0.33333333 0.33333333]
 [0.         0.25       0.25       0.25       0.         0.25      ]
 [0.33333333 0.         0.         0.33333333 0.33333333 0.        ]]


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF weighting on the corpus, then encode the same documents with it.
tfidf_vectorizer = TfidfVectorizer(lowercase=True)
tfidf_vectorizer.fit(documents)
tfidf_matrix = tfidf_vectorizer.transform(documents)

# Show the learned terms next to the dense TF-IDF matrix.
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

Vocabulary: ['deep' 'fun' 'is' 'learning' 'love' 'machine']
TF-IDF Matrix:
 [[0.         0.         0.         0.48133417 0.61980538 0.61980538]
 [0.         0.5844829  0.5844829  0.34520502 0.         0.44451431]
 [0.72033345 0.         0.         0.42544054 0.54783215 0.        ]]


In [5]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------- ----------------------------- 6.3/24.4 MB 31.8 MB/s eta 0:00:01
   --------------------- ------------------ 13.1/24.4 MB 31.4 MB/s eta 0:00:01
   -------------------------------- ------- 19.7/24.4 MB 31.9 MB/s eta 0:00:01
   -------------------------------------- - 23.6/24.4 MB 29.4 MB/s eta 0:00:01
   ---------------------------------------- 24.4/24.4 MB 27.4 MB/s  0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.4.0


In [6]:
from gensim.models import Word2Vec

# Tokenize sentences: lowercase each document and split it on whitespace
tokenized_docs = [doc.lower().split() for doc in documents]

# Train Word2Vec.
# Training is randomized, so a fixed seed is set and training is limited to a
# single worker thread to make the printed vectors repeatable between runs.
# NOTE(review): gensim's documentation also mentions PYTHONHASHSEED for
# reproducibility across interpreter launches - set it if exact repeatability
# between kernel restarts matters.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,
    window=2,
    min_count=1,
    seed=42,
    workers=1
)

# Get embedding for a word
print("Embedding for 'learning':\n", w2v_model.wv['learning'])

# Similar words
print("Similar to 'learning':\n", w2v_model.wv.most_similar('learning'))

Embedding for 'learning':
 [-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]
Similar to 'learning':
 [('love', 0.1267007291316986), ('machine', 0.042373016476631165), ('fun', 0.012442173436284065), ('i', -0.01447527389973402), ('is', -0.05974648892879486), ('deep', -0.11821285635232925)]

In [7]:
import numpy as np

def document_embedding(doc, model):
    """Average the word vectors of a tokenized document.

    Args:
        doc: Iterable of tokens (strings).
        model: Object whose ``wv`` attribute supports ``token in wv``,
            ``wv[token]`` and exposes ``wv.vector_size``.

    Returns:
        A 1-D array holding the element-wise mean of the vectors of all
        tokens found in ``model.wv``. If no token is found (including an
        empty ``doc``), a zero vector of length ``model.wv.vector_size``.
    """
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if not vectors:
        # np.mean over an empty list would return a scalar NaN (with a
        # RuntimeWarning); return a well-shaped zero vector instead.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed the first tokenized document and report the size of the resulting vector.
doc_vector = document_embedding(
    tokenized_docs[0],
    w2v_model,
)
print("Document Vector Shape:", doc_vector.shape)

Document Vector Shape: (50,)
