# Magic BERT
#### Investigating the similarity of MtG flavor text with a large language model

We will be using the [DistilBERT](https://huggingface.co/docs/transformers/en/model_doc/distilbert) model from HuggingFace's Transformers package. DistilBERT is a relatively lightweight transformer model that still performs well at most benchmarking tasks. 

This notebook is based off of the [NLP Course](https://huggingface.co/learn/nlp-course/en) from HuggingFace and [this Geeks for Geeks post](https://www.geeksforgeeks.org/sentence-similarity-using-bert-transformer/).

WORK IN PROGRESS
See this sentence transformer documentation for next steps:
https://huggingface.co/sentence-transformers/msmarco-distilbert-base-v4

In [3]:
import numpy as np
import torch

# Create Model Object
from transformers import DistilBertModel
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [34]:
# Load card data
import pickle
datapath = 'card_data/CardData.pkl'
with open(datapath,'rb') as file:    
    data = pickle.load(file)
data = data.loc[:,['name','type_line','set','mana_cost','colors','flavor_text']].reset_index(drop=True)

In [40]:
# Test out tokenizing some text
test_flavor = data.loc[42,'flavor_text']
print(data.loc[42,'name'])
print(test_flavor)

Spark Spray :
it's the only kind of shower goblins will tolerate.


In [68]:
# Use HuggingFace's AutoTokenizer to select the tokenizer used for our pre-trained DistilBERT model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokens = tokenizer.tokenize(test_flavor)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)

['it', "'", 's', 'the', 'only', 'kind', 'of', 'shower', 'goblin', '##s', 'will', 'tolerate', '.']
[2009, 1005, 1055, 1996, 2069, 2785, 1997, 6457, 22639, 2015, 2097, 19242, 1012]


In [76]:
# Format as a multidimensional tensor for use by the transformer
tokenized_inputs = tokenizer(test_flavor, return_tensors="pt")
print(tokenized_inputs)
print(type(tokens))

{'input_ids': tensor([[  101,  2009,  1005,  1055,  1996,  2069,  2785,  1997,  6457, 22639,
          2015,  2097, 19242,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
<class 'list'>


In [90]:
tokens = tokenizer(test_flavor, padding=True, truncation=True, return_tensors="pt")
print(type(tokens))
print(tokens)
output = model(**tokens)
print(output)
print(type(output))
# print(output.last_hidden_state[:,0,:])

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': tensor([[  101,  2009,  1005,  1055,  1996,  2069,  2785,  1997,  6457, 22639,
          2015,  2097, 19242,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
BaseModelOutput(last_hidden_state=tensor([[[ 0.0616,  0.2137, -0.0266,  ..., -0.0677,  0.1408,  0.3962],
         [ 0.0283, -0.0310, -0.1482,  ...,  0.1277, -0.0508,  0.3766],
         [ 0.1092,  0.1527,  0.4453,  ..., -0.1133, -0.0174,  0.0917],
         ...,
         [ 0.1770,  0.6853,  0.3198,  ...,  0.1025, -0.4718,  0.3971],
         [ 0.8807,  0.3451, -0.3388,  ...,  0.0228, -0.5681, -0.4490],
         [ 0.8425,  0.7682,  0.2190,  ..., -0.0927, -0.4687, -0.0884]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)
<class 'transformers.modeling_outputs.BaseModelOutput'>


In [133]:
# Write two similar sentences and measure their similarity:
sentence1 = "Goblins sure do smell bad."
sentence2 = "I'm glad I don't smell as bad as a goblin!"
sequences = [sentence1,sentence2]
tokens = tokenizer(sequences,padding=True,return_tensors='pt')
print(tokens)

{'input_ids': tensor([[  101, 22639,  2015,  2469,  2079,  5437,  2919,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  1005,  1049,  5580,  1045,  2123,  1005,  1056,  5437,
          2004,  2919,  2004,  1037, 22639,   999,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [125]:
with torch.no_grad():
    outputs = model(**tokens)
    out1 = outputs.last_hidden_state[0,0,:]
    out2 = outputs.last_hidden_state[1,0,:]

from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity needs 2D arrays, so reshape our vectors accordingly:
out1 = out1.reshape(1,-1)
out2 = out2.reshape(1,-1)
similarity_score = cosine_similarity(out1,out2)
print(f"Similarity score = {similarity_score[0]}")

Similarity score = [0.9646025]


In [141]:
# Now let's compare some dissimilar statements:
sentence3 = "The forest is the most beautiful distillment of nature's bounty"
sentence4 = "Progress is forged on the anvil of pain"
sequences = [sentence1,sentence2,sentence3,sentence4]
tokens = tokenizer(sequences,padding=True,return_tensors='pt')
print(tokens)

{'input_ids': tensor([[  101, 22639,  2015,  2469,  2079,  5437,  2919,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  1005,  1049,  5580,  1045,  2123,  1005,  1056,  5437,
          2004,  2919,  2004,  1037, 22639,   999,   102],
        [  101,  1996,  3224,  2003,  1996,  2087,  3376,  4487, 16643,  3363,
          3672,  1997,  3267,  1005,  1055, 17284,   102],
        [  101,  5082,  2003, 16158,  2006,  1996,  2019, 14762,  1997,  3255,
           102,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


In [142]:
with torch.no_grad():
    outputs = model(**tokens)
    out3 = outputs.last_hidden_state[0,0,:].reshape(1,-1)
    out4 = outputs.last_hidden_state[1,0,:].reshape(1,-1)
similarity_score = cosine_similarity(out3,out4)
print(f"Similarity score = {similarity_score[0]}")

Similarity score = [0.9646025]


In [143]:
cosine_similarity(outputs.last_hidden_state[:,0,:])

array([[0.99999994, 0.9646025 , 0.88829386, 0.9054321 ],
       [0.9646025 , 1.0000002 , 0.8974154 , 0.9165001 ],
       [0.88829386, 0.8974154 , 1.        , 0.9296369 ],
       [0.9054321 , 0.9165001 , 0.9296369 , 0.9999999 ]], dtype=float32)