In [None]:
# Name: Devashish Mayur Potnis
# Class: BE-AIML

In [1]:
pip install scikit-learn gensim nltk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short sentences that share most of their vocabulary
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn the vocabulary and build the document-term count matrix in one call
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(documents)

# Vocabulary terms, used to label the columns of the count matrix
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the counts so they print as a plain 2-D array
dense_array_bow = X_bow.toarray()

# Show the count matrix first, then the terms its columns refer to
print("Bag-of-Words Matrix:", dense_array_bow, sep="\n")
print("\nFeature Names:", feature_names, sep="\n")


Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [3]:
# Term frequency: scale each document's counts by that document's total number
# of counted tokens. keepdims=True leaves the totals as a column so the
# division broadcasts across every term column of the same row.
tokens_per_document = dense_array_bow.sum(axis=1, keepdims=True)
TF = dense_array_bow / tokens_per_document

# Show the length-normalised counts
print("\nNormalized Count Occurrence (TF):", TF, sep="\n")



Normalized Count Occurrence (TF):
[[0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.33333333 0.         0.16666667 0.         0.16666667
  0.16666667 0.         0.16666667]
 [0.16666667 0.         0.         0.16666667 0.16666667 0.
  0.16666667 0.16666667 0.16666667]
 [0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]]


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting using the transformer's default settings
tfidf_transformer = TfidfTransformer()

# Learn the IDF weights from the raw count matrix (X_bow, not the manually
# normalised TF array) and apply them in the same call
X_tfidf = tfidf_transformer.fit_transform(X_bow)

# Dense copy of the weighted matrix for printing
dense_array_tfidf = X_tfidf.toarray()

# Show the TF-IDF weights
print("\nTF-IDF Matrix:", dense_array_tfidf, sep="\n")



TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [5]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize depends on NLTK's Punkt tokenizer data. Older NLTK releases
# load it from the 'punkt' package, while newer releases (3.9 and later) look
# for 'punkt_tab' instead. The install above is unpinned, so fetch both to keep
# this cell working with whichever NLTK version is present.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

# Tokenize the lower-cased documents
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

# Train the Word2Vec model.
# workers=1 removes the thread-scheduling jitter that makes multi-worker runs
# produce different vectors each time; seed=1 is gensim's default, made explicit.
# NOTE: reproducibility across separate interpreter launches also requires a
# fixed PYTHONHASHSEED, which has to be set before the kernel starts.
word2vec_model = Word2Vec(
    sentences=tokenized_documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Example: look up the learned embedding for the word 'document'
embedding_document = word2vec_model.wv['document']
print("\nWord2Vec Embedding for 'document':")
print(embedding_document)



Word2Vec Embedding for 'document':
[-5.3761393e-04  2.3459077e-04  5.1012170e-03  9.0115219e-03
 -9.3035055e-03 -7.1186870e-03  6.4577162e-03  8.9744031e-03
 -5.0161965e-03 -3.7644049e-03  7.3809391e-03 -1.5342169e-03
 -4.5370674e-03  6.5543531e-03 -4.8609949e-03 -1.8136933e-03
  2.8776617e-03  9.8915887e-04 -8.2834894e-03 -9.4506554e-03
  7.3119737e-03  5.0714435e-03  6.7562792e-03  7.6230383e-04
  6.3530928e-03 -3.4065295e-03 -9.4848091e-04  5.7711215e-03
 -7.5222286e-03 -3.9373739e-03 -7.5092558e-03 -9.2885981e-04
  9.5392875e-03 -7.3166536e-03 -2.3360765e-03 -1.9363161e-03
  8.0779977e-03 -5.9297686e-03  4.5617318e-05 -4.7524953e-03
 -9.6023204e-03  5.0089518e-03 -8.7604597e-03 -4.3930719e-03
 -3.5214103e-05 -2.9548592e-04 -7.6621324e-03  9.6163880e-03
  4.9832016e-03  9.2352722e-03 -8.1572160e-03  4.4980138e-03
 -4.1374546e-03  8.2675234e-04  8.4996261e-03 -4.4643688e-03
  4.5164214e-03 -6.7876368e-03 -3.5471660e-03  9.3982853e-03
 -1.5784105e-03  3.2352045e-04 -4.1381051e-03 -7.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adwai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Bag-of-Words
# Definition: Represents each document as a vector of word counts over the corpus vocabulary, ignoring word order.
# Variations:
# Count Occurrence: Counts each word's occurrences.
# Normalized Count: Adjusts for document length.
# TF-IDF
# Definition: Weights a term's frequency within a document by its inverse document frequency, so that words common to many documents count for less.
# Components:
# TF (Term Frequency): Measures how often a term appears in a document.
# IDF (Inverse Document Frequency): Measures a term's rarity across a collection of documents.
# Word2Vec
# Definition: An embedding technique that maps words to dense vectors in a continuous vector space (100 dimensions in this notebook), capturing semantic relationships.
# Models:
# CBOW (Continuous Bag-of-Words): Predicts a word based on its context.
# Skip-gram: Predicts context words based on a given word.