In [1]:
import numpy as np

# Example 2.2 -- a toy corpus inspired by Shakespeare.
# Each string is treated as one document by the vectorizers in the cells below.
docs = [
    "Anthony and Cleopatra Brutus Caesar romans worser",
    "Caesar Anthony Brutus Calpurnia romans",
    "The worser",
    "Brutus Caesar romans worser",
    "Caesar worser",
    "Anthony Caesar",
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary incidence matrix: one row per document, one column per vocabulary term,
# with 1 where the term occurs in the document and 0 otherwise (binary=True).
# stop_words="english" filters common English words, which is why "and"/"the"
# do not show up among the feature names.
cv = CountVectorizer(stop_words="english", binary=True)
B = cv.fit(docs).transform(docs).toarray()
B

array([[1, 1, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 0]])

In [3]:
# Vocabulary learned by the vectorizer; entry i of this array names column i of B.
cv.get_feature_names_out()

array(['anthony', 'brutus', 'caesar', 'calpurnia', 'cleopatra', 'romans',
       'worser'], dtype=object)

In [4]:
# Boolean AND retrieval: which documents contain both the term in column 2
# ('caesar') and the term in column 5 ('romans')? Column indices are 0-based
# positions in the vocabulary returned by cv.get_feature_names_out().
# The result has one 0/1 entry per document.
B[:,2] & B[:,5]

array([1, 1, 0, 1, 0, 0])

In [5]:
# Term-frequency matrix: same layout as B (rows = documents, columns = terms),
# but entries are raw counts instead of 0/1 flags. No term is repeated within
# any of the toy documents, so the counts coincide with the binary matrix here.
cv = CountVectorizer(stop_words="english")
cv.fit(docs)
F = cv.transform(docs).toarray()
F

array([[1, 1, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 0]])

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted matrix (rows = documents, columns = terms). With the default
# settings every row is scaled to unit Euclidean length, e.g. the third row
# below is a single 1.0 for its only remaining term.
# NOTE(review): the original comment labelled this "TF-IDFa1" -- presumably a
# course-specific name for this weighting variant; confirm against the notes.
cv = TfidfVectorizer(stop_words="english")
F = cv.fit(docs).transform(docs).toarray()
F

array([[0.39626596, 0.39626596, 0.2932457 , 0.        , 0.57238026,
        0.39626596, 0.33956982],
       [0.4212992 , 0.4212992 , 0.31177086, 0.60853913, 0.        ,
        0.4212992 , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        ],
       [0.        , 0.5519934 , 0.40848751, 0.        , 0.        ,
        0.5519934 , 0.47301641],
       [0.        , 0.        , 0.65359543, 0.        , 0.        ,
        0.        , 0.75684411],
       [0.80383327, 0.        , 0.59485466, 0.        , 0.        ,
        0.        , 0.        ]])

In [7]:
# Map the query into the same TF-IDF space as the documents by reusing the
# fitted vectorizer (transform only -- the vocabulary and IDF weights are not refit).
q = cv.transform(["romans caesar"])
q = q.toarray()
# The Euclidean norm of the encoded query; the vectorizer normalises it like the rows of F.
np.linalg.norm(q)

1.0

In [8]:
# Load a numpy compressed data file with the text of some Bath webpages.
# np.load on an .npz archive returns a lazy NpzFile that holds the file open,
# so read the arrays inside a `with` block to make sure the handle is closed.
with np.load('BathPages.npz') as file:
    titles = file["titles"]
    docs = file["docs"]

# Fit a TF-IDF model on the pages; rows of F are documents, columns are terms.
cv = TfidfVectorizer(stop_words='english')
F = cv.fit_transform(docs).toarray()

# Encode the query with the same fitted vocabulary and IDF weights.
q = cv.transform(['Food']).toarray()

# If no query term is in the vocabulary, q is all zeros, every score is 0 and
# argmax would silently report document 0 as the "best" match.
assert q.any(), "query contains no terms from the fitted vocabulary"

# Rows of F and q are unit-length (TfidfVectorizer default), so F @ q.T holds
# the cosine similarity of each document to the query; the most relevant
# document has the max cosine similarity = min angle to the query.
i = np.argmax(F @ q.T)
print(titles[i])
print(docs[i])

Fountain Canteen
Skip to main content

  * Courses
  * Research
  * Enterprise
  * Sport
  * Departments
  * About

  * _From_ Food and Drink 

# Fountain Canteen

A food court below Parade serving fish and chips, Asian food, street food,
value meals and more.

  * Food and drink
  * View more locations in Food and Drink 

Seating in Fountain Canteen

# Address

2 West University of Bath Claverton Down Bath BA2 7AY United Kingdom

Find your way here using Google Maps

# GPS coordinates

Latitude 51.37946  
Longitude -2.32839

# Map for this location

  
View Larger Map

# Additional information

Everyone, including visitors, can use Fountain Canteen.

Fountain Canteen has seating for 350 people.

If you are a student you can pay here using your Eat and Drink credit.

There are two microwaves located next to the Value Corner, which you can use
to heat your own food.

## Opening times

View current opening times here.

## Food and drink

In Fountain Canteen, there's a Fish & Chip shop, H