**Bag of Words Algorithm Implementation**

In [5]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.0 MB ? eta -:--:--
    -----------------------------------

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import math
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')

# Sample documents
docs = [
    "I love playing football",
    "Football is a great sport",
    "I love watching movies"
]

# Preprocessing and Tokenization
stop_words = set(stopwords.words('english'))
tokenized_docs = []
for doc in docs:
    words = word_tokenize(doc.lower())
    filtered = [w for w in words if w.isalpha() and w not in stop_words]
    tokenized_docs.append(filtered)

# 1. Bag of Words (Count)
all_words = [word for doc in tokenized_docs for word in doc]
word_counts = Counter(all_words)
print("BoW (Count):", word_counts)

# 2. Normalized Count (TF)
total_words = sum(word_counts.values())
normalized_bow = {word: count/total_words for word, count in word_counts.items()}
print("Normalized BoW:", normalized_bow)

# 3. TF-IDF
def compute_tf(doc):
    tf = Counter(doc)
    total = len(doc)
    return {word: freq/total for word, freq in tf.items()}

def compute_idf(docs):
    N = len(docs)
    idf = {}
    all_words = set(word for doc in docs for word in doc)
    for word in all_words:
        containing = sum(1 for doc in docs if word in doc)
        idf[word] = math.log(N / (1 + containing))
    return idf

def compute_tfidf(doc, idf):
    tf = compute_tf(doc)
    return {word: tf[word] * idf[word] for word in doc}

idf = compute_idf(tokenized_docs)
for i, doc in enumerate(tokenized_docs):
    tfidf = compute_tfidf(doc, idf)
    print(f"TF-IDF for doc {i+1}:", tfidf)

# 4. Word2Vec using gensim
model = Word2Vec(sentences=tokenized_docs, vector_size=50, window=2, min_count=1)
print("Word2Vec vector for 'football':", model.wv['football'])


BoW (Count): Counter({'love': 2, 'football': 2, 'playing': 1, 'great': 1, 'sport': 1, 'watching': 1, 'movies': 1})
Normalized BoW: {'love': 0.2222222222222222, 'playing': 0.1111111111111111, 'football': 0.2222222222222222, 'great': 0.1111111111111111, 'sport': 0.1111111111111111, 'watching': 0.1111111111111111, 'movies': 0.1111111111111111}
TF-IDF for doc 1: {'love': 0.0, 'playing': 0.13515503603605478, 'football': 0.0}
TF-IDF for doc 2: {'football': 0.0, 'great': 0.13515503603605478, 'sport': 0.13515503603605478}
TF-IDF for doc 3: {'love': 0.0, 'watching': 0.13515503603605478, 'movies': 0.13515503603605478}
Word2Vec vector for 'football': [-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
