In [52]:
import numpy as np

from vector_store import (
    VectorStore,
)

# Single store instance for the notebook: the sentence vectors built below are
# added to it, and it is queried for the closest match in the final cell.
vs = VectorStore()

In [53]:
# "documents"
# Toy corpus of four sentences. Each full sentence string is also used as the
# key under which its vector is kept in `sentence_vectors` and in the store.
sentences = [
    "The Florida State Seminoles are having a terrible season",
    "The Florida Gators football program represents the University of Florida",
    "The Miami hurricanes are leading the ACC",
    "Florida is home to eighteen college football teams.",
]

In [54]:
# Naive tokenization: lowercase, then split on whitespace only.
# Punctuation is NOT stripped, so a word followed by "." or "?" becomes its
# own distinct token (e.g. "teams." is a different token from "teams").
vocabulary = set().union(*(text.lower().split() for text in sentences))

# Give every token a fixed column index, assigned in alphabetical order.
word_to_index = {
    token: position for position, token in enumerate(sorted(vocabulary))
}

In [55]:
# Number of distinct tokens; every bag-of-words vector below has this length.
len(vocabulary)

25

In [56]:
# Inspect the token -> column index mapping (alphabetical order). Because the
# tokenizer only splits on whitespace, punctuation stays attached to tokens.
word_to_index

{'a': 0,
 'acc': 1,
 'are': 2,
 'college': 3,
 'eighteen': 4,
 'florida': 5,
 'football': 6,
 'gators': 7,
 'having': 8,
 'home': 9,
 'hurricanes': 10,
 'is': 11,
 'leading': 12,
 'miami': 13,
 'of': 14,
 'program': 15,
 'represents': 16,
 'season': 17,
 'seminoles': 18,
 'state': 19,
 'teams.': 20,
 'terrible': 21,
 'the': 22,
 'to': 23,
 'university': 24}

In [57]:
# Bag-of-words vectorization: each sentence becomes a vector of token counts,
# with one slot per vocabulary entry (slot positions come from word_to_index).
sentence_vectors = {}
for sentence in sentences:
    tokens = sentence.lower().split()
    vector = np.zeros(len(vocabulary))

    # Every occurrence of a token bumps that token's slot by one, so a word
    # appearing twice in a sentence ends up with a count of 2.
    for column in (word_to_index[token] for token in tokens):
        vector[column] += 1

    sentence_vectors[sentence] = vector

# Second pass: register every (sentence, vector) pair with the vector store,
# using the sentence text itself as the identifier.
for sentence, vector in sentence_vectors.items():
    vs.add_vector(sentence, vector)

In [58]:
# Spot-check: display the count vector of the i-th sentence (here the first).
i = 0
sentence_vectors[sentences[i]]

array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 0., 1., 1., 0., 0.])

In [59]:
# Encode the query in the same bag-of-words space as the stored sentences.
query_sentence = "How many college football teams are in Florida?"

query_tokens = query_sentence.lower().split()
query_vector = np.zeros(len(vocabulary))

for token in query_tokens:
    # Tokens that are not in the vocabulary have no slot in the vector and are
    # skipped. This includes tokens carrying punctuation, since the tokenizer
    # only splits on whitespace.
    if token not in word_to_index:
        continue
    query_vector[word_to_index[token]] += 1


In [60]:
# Only three slots are non-zero ("are", "college", "football"). The remaining
# query tokens, including "teams" (the vocabulary only has "teams.") and
# "florida?" (with its question mark), are not in the vocabulary.
query_vector

array([0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [61]:
# Ask the store for the single best match to the query vector.
# NOTE(review): the score is presumably cosine similarity -- 2 / (sqrt(3) * sqrt(8))
# is ~0.408 for the matching sentence -- confirm against the VectorStore implementation.
vs.find_similar_vectors(query_vector, num_results=1)

[('Florida is home to eighteen college football teams.', 0.40824829046386296)]