In [3]:
#!pip install wikipedia

In [4]:
import wikipedia

In [5]:
# Fetch the four Wikipedia articles that are compared in this notebook.
q1 = wikipedia.page('Deep Learning')
# Title spelling fixed ('Artifical' -> 'Artificial') so the lookup no longer
# relies on the library resolving a misspelled title to the intended page.
q2 = wikipedia.page('Artificial Intelligence')
q3 = wikipedia.page('Analytics')
q4 = wikipedia.page('Baseball')

In [6]:
# Whitespace-delimited word count of each article. The labels name the pages
# that were actually loaded (Deep Learning, AI, Analytics, Baseball).
print("DL \t",        len(q1.content.split()), "\n"
      "AI \t",        len(q2.content.split()), "\n"
      "Analytics \t", len(q3.content.split()), "\n"
      "Baseball \t",  len(q4.content.split()))

ML 	 8121 
AI 	 7795 
soccer 	 2302 
tennis 	 9526


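- The articles differ a lot in length: the third article (`Analytics`) has roughly 2,300 words, whereas `Deep Learning` (DL) has about 8,100, `AI` about 7,800, and the fourth article (`Baseball`) about 9,500.

- This means that if we do not normalize our vectors, two articles can end up `much further away` from each other just because one of them has `many` more words.

- Without normalization, DL will probably be closer to an article with a similar number of words, regardless of its topic.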

In [8]:
import numpy as np
import pandas as pd

from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import re

from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# import plotting libraries
from mpl_toolkits.mplot3d.axes3d import Axes3D

import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.preprocessing import normalize

In [11]:
# Bag-of-words vectorizer with default settings; it is fitted on the articles in a later cell.
cv = CountVectorizer()

In [13]:
# Learn the vocabulary from the text of all four articles.
cv = cv.fit([page.content for page in (q1, q2, q3, q4)])

# Number of distinct tokens in the learned vocabulary.
len(cv.get_feature_names_out())

5118

In [14]:
# Encode each article as a dense row of per-token counts over the fitted vocabulary.
X = cv.transform([page.content for page in (q1, q2, q3, q4)]).toarray()

In [15]:
# Display the dense count matrix built from the four articles (one row per article).
X

array([[5, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 1, 0]], dtype=int64)

In [17]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x, y : array-like
        Numeric vectors of the same length (NumPy arrays, lists or tuples).

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x - y) ** 2))``.
    """
    # Convert to float arrays first: plain lists/tuples then work, and
    # unsigned-integer inputs cannot wrap around when they are subtracted.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.sum((x - y) ** 2))

In [18]:
# Euclidean distance from the Deep Learning article (row 0 of X) to each other
# article. The labels name the pages that were actually loaded.
print("DL - AI \t",        euclidean_distance(X[0], X[1]), "\n"
      "DL - Analytics \t", euclidean_distance(X[0], X[2]), "\n"
      "DL - Baseball \t",  euclidean_distance(X[0], X[3]))

ML - AI 	 310.0209670328767 
ML - soccer 	 513.8550379241211 
ML - tennis 	 610.041801846398


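By Euclidean distance, Deep Learning is closest to AI (≈310), but these raw distances mix differences in topic with differences in article length, which makes them hard to interpret intuitively.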

So, what happens if we look at cosine similarity (thus normalising our vectors)?

In [20]:
def fn_cosine_similarity(x, y):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        Numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. If either vector has zero length
        (all components zero) the ratio is undefined, and 0.0 is returned
        instead of ``nan``.
    """
    norm_product = np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y))
    # Guard against 0/0, which would otherwise yield nan plus a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

In [21]:
# Cosine similarity between the Deep Learning article (row 0 of X) and each
# other article. The labels name the pages that were actually loaded.
print("DL - AI \t",        fn_cosine_similarity(X[0], X[1]), "\n"
      "DL - Analytics \t", fn_cosine_similarity(X[0], X[2]), "\n"
      "DL - Baseball \t",  fn_cosine_similarity(X[0], X[3]))

ML - AI 	 0.8955153014676447 
ML - soccer 	 0.7684475886416761 
ML - tennis 	 0.8063324463559948


Deep Learning is closest to AI (≈0.90)! Granted, it still seems pretty close to the other two articles (≈0.77 and ≈0.81) judging from these scores, but please note that word frequency is not that great of a representation for texts with such rich content.