<a href="https://colab.research.google.com/github/febincf-mle/natural-language-processing/blob/main/NLP_02_text_representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install required Packages

In [None]:
# Install gensim (quiet output), then force-reinstall NumPy at exactly 1.26.4.
# NOTE(review): the NumPy pin is presumably there for binary compatibility with
# gensim's compiled extensions -- confirm before changing or removing it.
!pip install gensim --quiet
!pip install --force-reinstall numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.26.4


In [None]:
# Display the NumPy version the kernel actually imported, so it can be compared
# with the 1.26.4 pin requested in the install cell.
import numpy as np
np.__version__

'1.26.4'

## Import necessary Libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api

## Bag of Words (binary)

In [None]:
# Toy corpus: three short sentences that share a few words.
corpus = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]

# binary=True records only whether a token occurs in a document, not how often.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

# Vocabulary, a blank line, then the dense document-term matrix.
print(vectorizer.get_feature_names_out(), "", X.toarray(), sep="\n")

['amazing' 'better' 'cv' 'feels' 'is' 'it' 'like' 'love' 'much' 'nlp'
 'nowadays' 'so' 'than' 'that' 'topic' 'trending' 'very']

[[0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0]
 [1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1]]


## Bag of Words (Frequency)

In [None]:
# Same toy corpus as the binary example.
corpus = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]

# binary=False keeps raw occurrence counts (e.g. "it" appears twice in the
# first sentence and is counted as 2).
vectorizer = CountVectorizer(binary=False)
X = vectorizer.fit_transform(corpus)

# Vocabulary, a blank line, then the dense count matrix.
print(vectorizer.get_feature_names_out(), "", X.toarray(), sep="\n")

['amazing' 'better' 'cv' 'feels' 'is' 'it' 'like' 'love' 'much' 'nlp'
 'nowadays' 'so' 'than' 'that' 'topic' 'trending' 'very']

[[0 1 1 1 1 2 1 1 1 1 0 1 1 1 0 0 0]
 [1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1]]


## Bag of Words (ngram)

In [None]:
# Same toy corpus as the unigram examples.
corpus = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]

# ngram_range=(2, 2) makes every feature a bigram (two consecutive tokens)
# instead of a single word; counts are kept as raw frequencies.
vectorizer = CountVectorizer(binary=False, ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# Bigram vocabulary, a blank line, then the dense count matrix.
print(vectorizer.get_feature_names_out(), "", X.toarray(), sep="\n")

['better than' 'feels like' 'is amazing' 'is better' 'is very' 'it feels'
 'it is' 'like it' 'love nlp' 'much that' 'much trending' 'nlp is'
 'nlp so' 'so much' 'than cv' 'that it' 'topic nowadays' 'trending topic'
 'very much']

[[1 1 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 1]]


## TF-IDF Vectorization

In [None]:
# Same toy corpus as the bag-of-words examples.
corpus = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]

# Default TfidfVectorizer: unigram features weighted by TF-IDF rather than
# raw counts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary, a blank line, then the dense TF-IDF matrix.
print(vectorizer.get_feature_names_out(), "", X.toarray(), sep="\n")

['amazing' 'better' 'cv' 'feels' 'is' 'it' 'like' 'love' 'much' 'nlp'
 'nowadays' 'so' 'than' 'that' 'topic' 'trending' 'very']

[[0.         0.27445143 0.27445143 0.27445143 0.16209543 0.54890285
  0.27445143 0.27445143 0.20872738 0.16209543 0.         0.27445143
  0.27445143 0.27445143 0.         0.         0.        ]
 [0.76749457 0.         0.         0.         0.45329466 0.
  0.         0.         0.         0.45329466 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.25712876 0.
  0.         0.         0.3311001  0.25712876 0.43535684 0.
  0.         0.         0.43535684 0.43535684 0.43535684]]


## TF-IDF Vectorization (ngram)

In [None]:
# Same toy corpus as the bag-of-words examples.
corpus = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]

# TF-IDF weighting over bigram features only (two consecutive tokens).
vectorizer = TfidfVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# Bigram vocabulary, a blank line, then the dense TF-IDF matrix.
print(vectorizer.get_feature_names_out(), "", X.toarray(), sep="\n")

['better than' 'feels like' 'is amazing' 'is better' 'is very' 'it feels'
 'it is' 'like it' 'love nlp' 'much that' 'much trending' 'nlp is'
 'nlp so' 'so much' 'than cv' 'that it' 'topic nowadays' 'trending topic'
 'very much']

[[0.28867513 0.28867513 0.         0.28867513 0.         0.28867513
  0.28867513 0.28867513 0.28867513 0.28867513 0.         0.
  0.28867513 0.28867513 0.28867513 0.28867513 0.         0.
  0.        ]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.         0.         0.         0.         0.60534851
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.42339448 0.
  0.         0.         0.         0.         0.42339448 0.32200242
  0.         0.         0.         0.         0.42339448 0.42339448
  0.42339448]]


## Word2Vec (pre-trained)

In [None]:
# Load the pre-trained Google News vectors. For this dataset gensim's downloader
# returns the word vectors themselves (a KeyedVectors object), so words are
# looked up on `model` directly -- there is no `.wv` attribute to go through.
model = api.load("word2vec-google-news-300")
model['Ambulance']  # sanity lookup; raises KeyError if the word is missing

# Represent a sentence as the average of its word vectors.
sentence = "I love NLP so much that, it feels like it is better than CV"

# Strip surrounding punctuation ("that," -> "that") and skip tokens that are
# not in the pre-trained vocabulary; indexing an unknown token raises KeyError.
tokens = [word.strip(".,;:!?\"'()") for word in sentence.split()]
known_tokens = [token for token in tokens if token in model.key_to_index]
if not known_tokens:
    raise ValueError("no token of the sentence is in the pre-trained vocabulary")

# Kept as a one-element list holding the averaged sentence vector.
vectorized_sentence = [np.mean([model[token] for token in known_tokens], axis=0)]

## Word2Vec (Custom)

In [None]:
# Tiny training corpus. Word2Vec expects every sentence as a list of tokens, so
# each raw string is split on whitespace. A plain string left in the list would
# be iterated character by character, adding single letters to the vocabulary.
raw_sentences = [
    "I love NLP so much that, it feels like it is better than CV",
    "NLP is amazing",
    "NLP is very much a trending topic nowadays",
]
corpus = [raw_sentence.split() for raw_sentence in raw_sentences]

# NOTE(review): per the gensim docs, training with several worker threads is not
# bit-for-bit reproducible; use workers=1 (and a fixed seed) if exact vectors matter.
model = Word2Vec(
    vector_size=20,  # Dimensionality of each word vector
    window=5,  # Context window size
    min_count=1,  # Ignore words with frequency < min_count (1 keeps every word)
    workers=4,  # Number of worker threads used for training
    sg=1  # 1 = skip-gram
)

# Build the vocabulary from the tokenised corpus, then train for 10 epochs.
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=10)

# Learned 20-dimensional vector for the token "NLP".
model.wv['NLP']

array([ 0.03652146,  0.02537209,  0.03382076,  0.00384857,  0.03178381,
       -0.0170216 , -0.00467259,  0.0289336 , -0.03771108, -0.01966709,
       -0.03754148, -0.00472786,  0.04766761, -0.03665436, -0.01167728,
       -0.00967728,  0.04051415, -0.02964373,  0.00014709, -0.02382827],
      dtype=float32)