# Text representation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with default settings. It is only
# instantiated here; the Bag of Words cell below fits it on the corpus.
count_vect = CountVectorizer()

## Dummy corpus

In [11]:
# Toy corpus used by every vectorizer below: four short sentences, one
# document per list entry. Note that the third sentence ends with a period.
processed_docs = [
    "Dog bites man",
    "Man bites dog",
    "Dog eats meat.",
    "Man eats food",
]

## One-hot encoding

## Bag of Words (BoW)

In [12]:
# Fit the vectorizer on the corpus and encode each document as a row of
# term counts, one column per vocabulary word.
bow_rep = count_vect.fit_transform(processed_docs)

# Word -> column-index mapping learned while fitting.
print(count_vect.vocabulary_)

# Dense count vector of the first document ("Dog bites man").
print(bow_rep[0].toarray())

# Encode an unseen sentence with the already-fitted vocabulary. Only "dog"
# is in the vocabulary, so it is the only word that gets counted.
temp = count_vect.transform(["dog and dog are friends"])
print(
    "Bow representation for 'dog and dog are friends':",
    temp.toarray(),
)

{'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
[[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


## Bag of N-grams (BoN)

In [13]:
# Bag-of-n-grams example: a count vectorizer whose features are unigrams,
# bigrams and trigrams.
# NOTE: this rebinds `count_vect` and `bow_rep` from the Bag of Words cell.
count_vect = CountVectorizer(ngram_range=(1, 3))

# Fit on the corpus and build the n-gram count matrix.
bow_rep = count_vect.fit_transform(processed_docs)

# Show the n-gram -> column-index mapping.
print("Our vocabulary: ", count_vect.vocabulary_)

# Encode a new text with this vocabulary. Only the unigram "dog" is in the
# vocabulary, so it is the only feature that gets counted.
temp = count_vect.transform(["dog and dog are friends"])
print(
    "Bow representation for 'dog and dog are friends':",
    temp.toarray(),
)

Our vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}
Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


## TF-IDF


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (default settings) on the corpus.
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# IDF weight of every vocabulary word; the order matches the feature names
# printed next.
print(tfidf.idf_)

# Every word in the vocabulary.
print(tfidf.get_feature_names_out())

# Encode an unseen sentence: only "dog" and "man" are in the vocabulary,
# so only those two columns are non-zero.
temp = tfidf.transform(["dog and man are friends"])
print(
    "Tfidf representation for 'dog and man are friends':\n",
    temp.toarray(),
)

[1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
['bites' 'dog' 'eats' 'food' 'man' 'meat']
Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]


# Advanced text embeddings

## Pre-trained word embeddings

In [1]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Identifier of the pre-trained embedding to fetch through gensim's
# downloader: 50-dimensional GloVe vectors (Wikipedia + Gigaword, going by
# the model name). A small model, so it is quick to download and well
# suited to experimentation.
model_name = "glove-wiki-gigaword-50"

print(f"Downloading pre-trained model: {model_name}...")
wv = api.load(model_name)
print("Download complete!")

# ===========================================================
# Querying the loaded word vectors
# ===========================================================

# --- 1. Nearest neighbours ---------------------------------
# `most_similar` yields a list of (word, similarity score) tuples.
print("\nWords most similar to 'car':")
similar_words = wv.most_similar("car", topn=5)
for word, score in similar_words:
    print(f"  {word}: {score:.4f}")

# --- 2. Odd one out ----------------------------------------
# Ask the model which word in the list does not belong with the rest.
print("\nWord that doesn't match in the list 'breakfast cereal dinner lunch':")
odd_one_out = wv.doesnt_match("breakfast cereal dinner lunch".split())
print(f"  The odd one out is: {odd_one_out}")

# --- 3. Word analogy ---------------------------------------
# king - man + woman: "woman" and "king" are the positive terms, "man" is
# the negative term; only the single best match is requested.
print("\nAnalogy: King - Man + Woman = ?")
analogy_result = wv.most_similar(
    positive=["woman", "king"],
    negative=["man"],
    topn=1,
)
for word, score in analogy_result:
    print(f"  The result is: {word} (score: {score:.4f})")

Downloading pre-trained model: glove-wiki-gigaword-50...
Download complete!

Words most similar to 'car':
  truck: 0.9209
  cars: 0.8870
  vehicle: 0.8834
  driver: 0.8464
  driving: 0.8384

Word that doesn't match in the list 'breakfast cereal dinner lunch':
  The odd one out is: cereal

Analogy: King - Man + Woman = ?
  The result is: queen (score: 0.8524)
