In [5]:
import os
import re

import numpy as np
import pandas as pd
import requests
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Constructing Bag of Words

In [4]:
# Two toy documents to turn into a bag-of-words matrix.
texts = [
    "Hello, this is a python course",
    "Hi, my bike broke this tuesday",
]

# Vectorizer with default settings; fitting learns the vocabulary and
# returns the sparse document-term count matrix in one step.
cv = CountVectorizer()
X = cv.fit_transform(texts)

# Densify the counts and label each column with its vocabulary term so the
# table is readable (one row per document).
df = pd.DataFrame(data=X.toarray(), columns=cv.get_feature_names_out())
df

Unnamed: 0,bike,broke,course,hello,hi,is,my,python,this,tuesday
0,0,0,1,1,0,1,0,1,1,0
1,1,1,0,0,1,0,1,0,1,1


### Constructing Bag of Words with stopwords and tokenizers

In [7]:
# Same toy documents as before, now vectorized with English stop words
# removed and NLTK's word_tokenize doing the tokenization.
texts = [
    "Hello, this is a python course",
    "Hi, my bike broke this tuesday",
]

# token_pattern=None: the default regex is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning. The vocabulary is unchanged.
# NOTE: word_tokenize keeps punctuation as tokens, which is why "," shows up
# as a feature column in the output below. It also needs NLTK's Punkt
# tokenizer data to be installed locally.
cv = CountVectorizer(
    stop_words='english',
    tokenizer=word_tokenize,
    token_pattern=None,
)

# Learn the vocabulary and build the sparse document-term count matrix.
X = cv.fit_transform(texts)

# One row per document, one column per retained token.
df = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())

df



Unnamed: 0,",",bike,broke,course,hello,hi,python,tuesday
0,1,0,0,1,1,0,1,0
1,1,1,1,0,0,1,0,1


### Computing cosine similarity between two different texts

In [11]:
# Location of the pipe-delimited ECB speeches CSV. Set the ECB_SPEECHES_CSV
# environment variable to point at your own copy; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
path = os.environ.get(
    "ECB_SPEECHES_CSV",
    "/Users/eduardo/Downloads/all_ECB_speeches.csv",
)

# Rows with a missing value in any column are dropped before vectorizing.
df = pd.read_csv(path, sep="|").dropna()

# One string per speech, taken from the 'contents' column.
text = df['contents'].tolist()

# `cv` is the vectorizer left in the kernel by an earlier cell, so that cell
# must be run first. Re-fitting replaces its vocabulary with one learned
# from the speeches and returns the sparse document-term count matrix.
X = cv.fit_transform(text)



#### Cosine similarity by hand

In [13]:
# Dense 1-D term-count vectors for the first two documents in X.
arr1 = X[0].toarray().ravel()
arr2 = X[1].toarray().ravel()

# cos(a, b) = (a . b) / (||a|| * ||b||)
# The dot product of a vector with itself is its squared Euclidean norm.
numerator = arr1 @ arr2
denominator = np.sqrt(arr1 @ arr1) * np.sqrt(arr2 @ arr2)
print(numerator / denominator)

#### Cosine similarity using scikit-learn's `cosine_similarity` function

In [14]:
# `cosine_similarity` is imported in the notebook's top import cell.
# Each argument is a dense 1 x n_features row (first and second document of
# X), so the printed result is a 1 x 1 matrix holding their similarity.
print(cosine_similarity(X[0].toarray(), X[1].toarray()))

0.8221334445162073
[[0.82213344]]


  (0, 81111)	4
  (0, 73955)	1
  (0, 43727)	2
  (0, 25938)	3
  (0, 67500)	1
  (0, 58064)	1
  (0, 39918)	4
  (0, 298)	152
  (0, 32334)	41
  (0, 43333)	3
  (0, 95985)	28
  (0, 63298)	3
  (0, 5231)	8
  (0, 67293)	1
  (0, 583)	113
  (0, 55137)	3
  (0, 85131)	1
  (0, 43933)	2
  (0, 79785)	1
  (0, 65684)	4
  (0, 9696)	10
  (0, 19972)	1
  (0, 76665)	15
  (0, 31067)	1
  (0, 21402)	1
  :	:
  (0, 19114)	2
  (0, 57180)	1
  (0, 75739)	1
  (0, 31147)	1
  (0, 76699)	1
  (0, 79704)	1
  (0, 8807)	1
  (0, 12773)	1
  (0, 48173)	1
  (0, 6200)	1
  (0, 79374)	1
  (0, 40010)	1
  (0, 81079)	1
  (0, 75789)	1
  (0, 47178)	1
  (0, 28748)	1
  (0, 29861)	1
  (0, 85856)	1
  (0, 45548)	1
  (0, 67269)	1
  (0, 81115)	1
  (0, 74775)	1
  (0, 13873)	1
  (0, 74118)	1
  (0, 56975)	1
