# WORD EMBEDDINGS 

## Frequency based

In [1]:
import nltk 
import numpy as np 
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /home/verykul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/verykul/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 1. One Hot Encoding 

In [3]:
text = 'I love data science. Machine learning is part of data science'

tokens = word_tokenize(text)

seen = set()

vocabulary = []
for word in tokens:
    if word not in seen:
        seen.add(word)
        vocabulary.append(word)

print(vocabulary)

['I', 'love', 'data', 'science', '.', 'Machine', 'learning', 'is', 'part', 'of']


## 2. Bag of words 

In [7]:
cv = CountVectorizer(lowercase=True)

In [8]:
corpus = [
    "I am happy, I am learning NLP.",
    "I am happy",
    "I am sad, I am not learning NLP",
    "I am sad",
]
X = cv.fit_transform(corpus)
print(f"voacbulary: {cv.get_feature_names_out()}")
print()
print(f" Resulting vector space: \n{X.toarray()}")

voacbulary: ['am' 'happy' 'learning' 'nlp' 'not' 'sad']

 Resulting vector space: 
[[2 1 1 1 0 0]
 [1 1 0 0 0 0]
 [2 0 1 1 1 1]
 [1 0 0 0 0 1]]


# 3. N-grams

In [9]:
n_grams = CountVectorizer(ngram_range=(2,2), stop_words=None)

In [10]:
corpus = [
    "I am happy, I am learning NLP.",
    "I am happy",
    "I am sad, I am not learning NLP",
    "I am sad",
]

X = n_grams.fit_transform(corpus)
print(f"voacbulary: {n_grams.get_feature_names_out()}")
print()
print(f" Resulting vector space: \n{X.toarray()}")

voacbulary: ['am happy' 'am learning' 'am not' 'am sad' 'happy am' 'learning nlp'
 'not learning' 'sad am']

 Resulting vector space: 
[[1 1 0 0 1 1 0 0]
 [1 0 0 0 0 0 0 0]
 [0 0 1 1 0 1 1 1]
 [0 0 0 1 0 0 0 0]]


In [11]:
corpus = """Compared with other programming languages, Python’s class mechanism adds classes with a minimum of new syntax and semantics. It is a mixture of the class mechanisms found in C++ and Modula-3.
Python classes provide all the standard features of Object Oriented Programming:
the class inheritance mechanism allows multiple base classes, a derived class can override any methods of its base class or classes, and a method can call the method of a base class with the same name.
Objects can contain arbitrary amounts and kinds of data.
As is true for modules, classes partake of the dynamic nature of Python: they are created at runtime, and can be modified further after creation.
"""

tokens = sent_tokenize(corpus)
X = n_grams.fit_transform(tokens)
print(f"Resulting vector space:\n{X.toarray()}")

Resulting vector space:
[[1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0
  0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 2 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 0 0 0
  0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0
  0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 1]
 [0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1
  1 0 0 0 0 1 0 