In [1]:

# load required modules

# natural language toolkit (https://www.nltk.org/)
import nltk  # probably have to install that first
# fetch the stop-word corpus that stopwords.words('english') reads further down
nltk.download('stopwords') # this might take some time
from nltk.corpus import stopwords

# some stuff from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# gensim for word2vec (https://radimrehurek.com/gensim/index.html)
from gensim import downloader # probably have to install that first

# some character set for punctuations
import string # punctuation set is accessed by string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Example sentences for the exercise.
# It helps to have at least two sentences that are similar in meaning:
# the first two both talk about water, the third is about something else.
# NOTE(review): "thursty" looks like a misspelling of "thirsty"; it is kept
# as-is here because the recorded outputs of later cells contain it.
sentence1, sentence2, sentence3 = (
    "I like to drink water infused with lemons.",
    "water is very good when you are thursty",
    "Did you buy a new car today?",
)

### Pre-processing

In [3]:
# Lowercase every sentence (str.lower) so that later steps see one casing only.
# Each name is rebound to its lowercased version.
sentence1, sentence2, sentence3 = map(
    str.lower, (sentence1, sentence2, sentence3)
)

In [4]:
# Stop words carry little meaning, so they will be removed from the sentences.
# Collect NLTK's English stop-word list into a set and display it.
stopWords = {*stopwords.words('english')}
print(stopWords)

{'as', 'had', 'each', 'again', 'down', "isn't", 'him', "didn't", 'no', 'while', 'does', 'below', 'to', "shouldn't", "wouldn't", "couldn't", 'you', 'when', 'now', 'them', "needn't", 'off', 'how', 'same', 'being', 'any', 'i', 'if', "it's", 've', 'mustn', 'can', "you're", 'be', "won't", 'their', 'your', 'mightn', 'there', 'm', 'before', 'in', 'needn', 'both', "weren't", "hasn't", 'did', 'just', 'through', 'isn', 'nor', 'until', 'have', 'she', 'too', 'which', 'himself', "hadn't", 'haven', 'or', 'than', 'that', "doesn't", 'hers', 'yours', 'its', 'because', 'didn', 'theirs', 'ain', 'where', "mustn't", 'hasn', 'me', 'ours', 'more', 'for', 'from', 'only', 'under', 'doesn', 'doing', 'ourselves', 'all', 'll', 'we', 'with', 'yourself', 'is', 'few', 'here', 'a', 'having', 'are', 'so', 'd', "should've", 'herself', 'it', "you'll", 'own', "you've", 'weren', 'but', 'itself', "you'd", 'ma', 'up', 'shouldn', 'between', 'and', "don't", 'was', 'most', 'o', "aren't", "mightn't", 'on', 'do', 'wouldn', 'by',

In [5]:
# split sentences into words and special characters
# you can do this with .split() or use a tokenizer, like from nltk.tokenize import word_tokenize
# remove stop words
# remove punctuation with string.punctuation
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data to be installed locally.
# Recent NLTK releases load it from the 'punkt_tab' resource, while older
# releases use 'punkt'. Download both so the tokenizing cell below does not
# fail with a LookupError on a fresh environment, whichever NLTK is installed.
nltk.download('punkt_tab')
# Kept as the last expression so the cell still displays the download status.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Translation table that deletes every character listed in string.punctuation
# (third argument of str.maketrans = characters to drop). Built once and
# applied to all three sentences.
strip_punctuation = str.maketrans('', '', string.punctuation)
sentence1, sentence2, sentence3 = (
    text.translate(strip_punctuation)
    for text in (sentence1, sentence2, sentence3)
)

In [7]:
# Turn each sentence string into a list of word tokens.
# From here on sentence1..3 are lists, not strings.
sentence1, sentence2, sentence3 = map(
    word_tokenize, (sentence1, sentence2, sentence3)
)

# Show the three token lists, one per line.
print(sentence1, sentence2, sentence3, sep="\n")

['i', 'like', 'to', 'drink', 'water', 'infused', 'with', 'lemons']
['water', 'is', 'very', 'good', 'when', 'you', 'are', 'thursty']
['did', 'you', 'buy', 'a', 'new', 'car', 'today']


In [8]:
# Stop words are not meaningful for the comparison, so they get filtered out
# in the next cell. This rebuilds the same English stop-word set as In [4]
# and displays it again.
stopWords = set()
stopWords.update(stopwords.words('english'))
print(stopWords)

{'as', 'had', 'each', 'again', 'down', "isn't", 'him', "didn't", 'no', 'while', 'does', 'below', 'to', "shouldn't", "wouldn't", "couldn't", 'you', 'when', 'now', 'them', "needn't", 'off', 'how', 'same', 'being', 'any', 'i', 'if', "it's", 've', 'mustn', 'can', "you're", 'be', "won't", 'their', 'your', 'mightn', 'there', 'm', 'before', 'in', 'needn', 'both', "weren't", "hasn't", 'did', 'just', 'through', 'isn', 'nor', 'until', 'have', 'she', 'too', 'which', 'himself', "hadn't", 'haven', 'or', 'than', 'that', "doesn't", 'hers', 'yours', 'its', 'because', 'didn', 'theirs', 'ain', 'where', "mustn't", 'hasn', 'me', 'ours', 'more', 'for', 'from', 'only', 'under', 'doesn', 'doing', 'ourselves', 'all', 'll', 'we', 'with', 'yourself', 'is', 'few', 'here', 'a', 'having', 'are', 'so', 'd', "should've", 'herself', 'it', "you'll", 'own', "you've", 'weren', 'but', 'itself', "you'd", 'ma', 'up', 'shouldn', 'between', 'and', "don't", 'was', 'most', 'o', "aren't", "mightn't", 'on', 'do', 'wouldn', 'by',

In [9]:
# Keep only the tokens that are not in the stop-word set.
# Token order within each sentence is preserved.
sentence1, sentence2, sentence3 = (
    [token for token in tokens if token not in stopWords]
    for tokens in (sentence1, sentence2, sentence3)
)

In [10]:
# perform stemming if you like
from nltk.stem import PorterStemmer # other algorithms are available
# create the stemmer object; nothing is stemmed in this cell yet
ps = PorterStemmer()
# then call ps.stem(word)

In [11]:
# apply to all sentences

### Sentence Level Vectorization

In [12]:
# use CountVectorizer 
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
# print vectors and words in vocabulary

In [13]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize.treebank import TreebankWordDetokenizer

# Join each token list back into a single string; the vectorizer cells below
# are fed these strings. One detokenizer instance serves all three sentences.
detokenizer = TreebankWordDetokenizer()
sentence1, sentence2, sentence3 = (
    detokenizer.detokenize(tokens)
    for tokens in (sentence1, sentence2, sentence3)
)

# Show the cleaned sentences, one per line.
print(sentence1, sentence2, sentence3, sep="\n")

like drink water infused lemons
water good thursty
buy new car today


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Bag-of-words: fit the vocabulary on the three cleaned sentences and count
# how often each vocabulary word occurs in each of them.
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform([sentence1, sentence2, sentence3])

# Dense copy of the counts, wrapped in a DataFrame whose columns are the
# vocabulary words (one row per sentence).
count_array = count_matrix.toarray()
vocabulary = coun_vect.get_feature_names_out()
df = pd.DataFrame(count_array, columns=vocabulary)
df

Unnamed: 0,buy,car,drink,good,infused,lemons,like,new,thursty,today,water
0,0,0,1,0,1,1,1,0,0,0,1
1,0,0,0,1,0,0,0,0,1,0,1
2,1,1,0,0,0,0,0,1,0,1,0


In [15]:
# check the similarity between the sentences with cosine similarity or euclidean distance
# for cosine_similarity you can use cosine_similarity(), for a single sentence you need to add .reshape(1,-1) to the vectors
# vectors can be obtained with the .toarray method


In [16]:
# Count vector of the first sentence (row 0 of the table) as a NumPy array.
df.iloc[0].to_numpy()

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1], dtype=int64)

In [17]:
import numpy as np

# Euclidean distance between the count vectors of every pair of sentences:
# square root of the summed squared differences.
# Labels use 1-based sentence numbers, row positions are 0-based.
for label, first, second in (("{1,2}:", 0, 1), ("{1,3}:", 0, 2), ("{2,3}:", 1, 2)):
    difference = df.iloc[first].to_numpy() - df.iloc[second].to_numpy()
    print(label, np.sqrt(np.sum(np.square(difference))))

{1,2}: 2.449489742783178
{1,3}: 3.0
{2,3}: 2.6457513110645907


In [18]:
# use ngrams
# can use CountVectorizer with option ngram_range=(xx,xx)
# check the similarity between the sentences with cosine similarity or euclidean distance

In [19]:
# Same counting approach, but on word bigrams only: ngram_range=(2, 2)
# restricts the vocabulary to sequences of exactly two consecutive words.
bigram_corpus = [sentence1, sentence2, sentence3]
vectorizer2 = CountVectorizer(ngram_range=(2, 2), analyzer='word')
X2 = vectorizer2.fit_transform(bigram_corpus)

# Display the bigram vocabulary that was learned.
vectorizer2.get_feature_names_out()

array(['buy new', 'car today', 'drink water', 'good thursty',
       'infused lemons', 'like drink', 'new car', 'water good',
       'water infused'], dtype=object)

In [20]:
# use tf-idf
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# check the similarity between the sentences with cosine similarity or euclidean distance


### Word Level Vectorization

In [21]:
# Ask the gensim downloader for its catalogue (names only) and print the
# names of the models it offers.
out = downloader.info(name_only=True)
available_models = out['models']
print(available_models)

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [22]:
# use one of the models with downloader.load()
# and show entries with .index_to_key[:20]

In [23]:
# use .most_similar(word, topn=5) for finding five most similar words in the model
# use .most_similar(negative=[words], topn=5) for finding five most dissimilar words in the model

In [24]:
# make word embeddings for the words in the initial sentences
