In [None]:
!pip3 install transformers

In [None]:
!pip3 install faiss-cpu

In [None]:
!pip3 install -U scikit-learn scipy matplotlib

In [None]:
!pip3 install networkx==3.1
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')

We'll need to load our player data from our scraped JSON file that stores MLB player data:

In [117]:
import json

json_file_path = 'cs242-project/myfirstproject/myfirstproject/mlb_data_adjusted_bios.json'

# load the JSON data from file
with open(json_file_path, 'r') as file:
    data = json.load(file)

# filter to include only dictionaries with non-empty 'biographical_information'
valid_entries = [entry for entry in data if 'biographical_information' in entry and entry['biographical_information'].strip()]

# create the sentences list from the filtered valid_entries list
sentences = [entry['biographical_information'] for entry in valid_entries]

Now we need to tokenize the above sentences and prepare them for input to the BERT model. We'll create a dictionary with two keys: `input_ids` and `attention_mask`. These will be populated with the tokenized versions of the sentences.

In [118]:
# initialize dictionary to store tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=512,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

The next section will prepare the tokenized sentences for processing with the BERT model by reformatting the lists of tensors into single tensors.

In [119]:
# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

Below, the BERT model will process the tokenized inputs:

In [120]:
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

This will extract the embeddings from the `last_hidden_state` tensor from the model's outputs to the `embeddings` variable. `last_hidden_state` is a tensor containing the final layer's hidden states for each token in each input sequence.

In [121]:
embeddings = outputs.last_hidden_state
embeddings.shape

torch.Size([14, 512, 384])

In [122]:
embeddings

tensor([[[-1.9333e-01,  2.9218e-01, -3.1649e-02,  ..., -6.4065e-01,
           3.5040e-01,  1.7890e-01],
         [-2.8097e-01,  1.7903e-01,  3.7085e-01,  ..., -6.2551e-01,
           7.4291e-01,  6.9920e-01],
         [-3.7459e-01,  2.1804e-01,  3.3417e-01,  ..., -2.5003e-01,
           5.9015e-02, -4.2579e-01],
         ...,
         [-2.9276e-01,  3.5609e-02,  1.1313e-01,  ..., -4.1203e-01,
           4.9604e-01,  3.3489e-01],
         [-4.9367e-02,  1.4166e-01,  2.6685e-01,  ..., -5.0167e-01,
           6.2489e-01,  3.1618e-01],
         [-1.9331e-01,  2.9219e-01, -3.1652e-02,  ..., -6.4064e-01,
           3.5043e-01,  1.7888e-01]],

        [[-3.5371e-01, -1.2444e-02,  1.5510e-01,  ..., -4.7077e-01,
           2.7036e-01,  8.0258e-01],
         [-1.0474e-01, -1.9709e-02,  3.0545e-01,  ..., -5.2183e-02,
           6.7888e-01,  8.8091e-01],
         [ 3.6925e-02, -5.8432e-02,  4.1652e-01,  ..., -2.8672e-01,
           1.3656e+00, -1.9288e-01],
         ...,
         [-3.5351e-01, -1

After we have produced our dense vectors embeddings, we need to perform a mean pooling operation to create a single vector encoding (the sentence embedding). To do this mean pooling operation, we will need to multiply each value in our embeddings tensor by its respective attention_mask value — so that we ignore non-real tokens.

In [123]:
# resize our attention_mask tensor:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([14, 512])

In [124]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([14, 512, 384])

Each vector above represents a single token attention mask - each token now has a vector of size 768 representing it's attention_mask status. Then we multiply the two tensors to apply the attention mask:

In [125]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([14, 512, 384])

"Mean Pooling" starts

In [126]:
# Then we sum the remained of the embeddings along axis 1, because we want to reduce the 512 tokens to 1 dimension
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([14, 384])

we want to count only those values that we want to give attention
then divide by the sum to get the mean

In [127]:
# clamp returns the same tensor with a range given, clamp is used to replace the zeros to a very minimal value
# to avoid divide by zero error
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([14, 384])

Finally, we calculate the mean as the sum of the embedding activations summed divided by the number of values that should be given attention in each position `summed_mask`:

In [128]:
mean_pooled = summed / summed_mask

`mean_pooled` is the final "dense representation" of the sentences, note that mean_pooled contains all representations for all sentences together

In [129]:
mean_pooled

tensor([[-0.1845,  0.3163,  0.2080,  ..., -0.6100,  0.4914,  0.1752],
        [-0.2176,  0.0125,  0.3889,  ..., -0.4580,  0.4477,  0.6827],
        [-0.3200,  0.2935, -0.0938,  ..., -0.5611,  0.1503,  0.4513],
        ...,
        [-0.4644, -0.0700,  0.4939,  ..., -0.4268,  0.3867,  0.2270],
        [-0.1985, -0.1751, -0.6110,  ...,  0.2191,  0.7615,  0.3476],
        [-0.1451,  0.0103, -0.0597,  ...,  0.1133,  0.3767,  0.8671]])

## Cosine Similarity

In [130]:
def convert_to_embedding(query):
    tokens = {'input_ids': [], 'attention_mask': []}
    new_tokens = tokenizer.encode_plus(query, max_length=512,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    attention_mask = tokens['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask
    
    return mean_pooled[0] # assuming query is a single sentence
    

In [131]:
from sklearn.metrics.pairwise import cosine_similarity

In [178]:
query = "Yelich"
query_embedding = convert_to_embedding(query)

In [179]:
mean_pooled.shape

torch.Size([14, 384])

In [180]:
cos = torch.nn.CosineSimilarity()
sim = cos(query_embedding, mean_pooled)
sim

tensor([0.5504, 0.4686, 0.4785, 0.4296, 0.5287, 0.5336, 0.6901, 0.6901, 0.4773,
        0.5255, 0.5255, 0.5352, 0.4463, 0.5299])

# FAISS

In [181]:
import faiss                   # make faiss available
index = faiss.IndexFlatIP(384) # build the index
print(index.is_trained)
index.add(mean_pooled)         # add vectors to the index
print(index.ntotal)

True
14


In [182]:
mean_pooled.shape

torch.Size([14, 384])

In [183]:
query_embedding.shape

torch.Size([384])

In [184]:
D, I = index.search(query_embedding[None, :], 1)

In [185]:
D

array([[46.861103]], dtype=float32)

In [186]:
I

array([[6]])

In [187]:
faiss.write_index(index,"sample_code.index")

In [188]:
index_loaded = faiss.read_index("sample_code.index")

In [189]:
D, I = index_loaded.search(query_embedding[None, :], 4)

In [190]:
D

array([[46.861103, 46.861103, 37.467735, 36.620792]], dtype=float32)

In [191]:
I

array([[7, 6, 0, 4]])

Let's print out the results of the highest scoring document as a result of our query:

In [192]:
most_similar_document_index = I[0][0]  # access the first (and only) element of I since we searched for top 1 document
print(I[0][0])
most_similar_document = valid_entries[most_similar_document_index]["biographical_information"]
print("Most similar document based on the query:", most_similar_document)

7
Most similar document based on the query: Christian Yelich made his major league debut in 2013 . He hit .455 as a high school senior , with 14 doubles and 9 home runs in 77 at-bats ; he also stole 27 bases . Baseball America named him a second-team high school All-American infielder , behind first-teamers Sean Coyle , Nick Delmonico , Manny Machado and Chris Hawkins . He was the first first baseman taken in the 2010 amateur draft . He went 23rd overall , to the Florida Marlins ; the scout was Tim McDonnell . He was converted to the outfield after signing with the Marlins ( for a $ 1.7 million bonus ) and began his pro career in 2010 . He showed he belonged right away , going 3 for 3 with a triple , two runs and two RBI in his pro debut . He saw limited action for the GCL Marlins ( 9 for 24 , 2B , 3B , 2 BB , SB ) and Greensboro Grasshoppers ( 8 for 23 , 2 2B , BB ) that summer . A teenager in a full-season league in 2011 , he more than held his own for Greensboro , hitting .312/.388/