<a href="https://colab.research.google.com/github/davidisinta/AI/blob/main/law_llms2_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordPiece tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [5]:
!pip install datasets evaluate transformers[sentencepiece]



In [6]:
corpus = []

with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

In [9]:
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
alphabet

print(alphabet)

['"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [10]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

print(len(vocab))

136


In [11]:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

In [12]:
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

In [13]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [14]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

In [15]:
vocab_size = 5000

print(len(vocab))

while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)


print(len(vocab))

136
5000


In [16]:
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '##HE', '##PM', '##LJ', '##UM', '##LUM', '##OLUM', '##US', '##ST', '##OR', '##STR', '##LR', '##RD', '##ER', '##LRA', '##USA', '##AA', '##CU', '##RCU', '##RDC', '##ERC', '##CT', 'NRDC', 'OF',

In [17]:
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

In [18]:
tokenize("This is a test, this is very nice")

['This',
 'i',
 '##s',
 'a',
 't',
 '##e',
 '##s',
 '##t',
 ',',
 'th',
 '##i',
 '##s',
 'i',
 '##s',
 'v',
 '##e',
 '##r',
 '##y',
 'n',
 '##ic',
 '##e']

In [19]:
with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    text = file.read()


court_of_appeal_tokens = tokenize(text)

i = 0
for token in court_of_appeal_tokens:
    if i < 10:
        print(token)
        i += 1


Unit
##e
##d
S
##t
##a
##t
##e
##s
Court


In [20]:
def populate_corpus(file_path):
  corpus = []
  with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)
  print(corpus)

  return corpus

In [21]:
def analyze_word_freqs(corpus):
  word_freqs = defaultdict(int)
  for text in corpus:
      words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
      new_words = [word for word, offset in words_with_offsets]
      for word in new_words:
          word_freqs[word] += 1
  print(word_freqs)
  return word_freqs


In [22]:
def generate_alphabet(word_freqs):
  alphabet = []
  for word in word_freqs.keys():
      if word[0] not in alphabet:
          alphabet.append(word[0])
      for letter in word[1:]:
          if f"##{letter}" not in alphabet:
              alphabet.append(f"##{letter}")

  alphabet.sort()
  alphabet
  print(alphabet)
  return alphabet

In [23]:
def generate_vocab(alphabet):
  vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
  print(len(vocab))
  return vocab

In [24]:
def generate_splits(word_freqs):
  splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}
  return splits

In [25]:
def train_model(vocab, vocab_size, splits):

  while len(vocab) < vocab_size:
      scores = compute_pair_scores(splits)
      best_pair, max_score = "", None
      for pair, score in scores.items():
          if max_score is None or max_score < score:
              best_pair = pair
              max_score = score
      splits = merge_pair(*best_pair, splits)
      new_token = (
          best_pair[0] + best_pair[1][2:]
          if best_pair[1].startswith("##")
          else best_pair[0] + best_pair[1]
      )
      vocab.append(new_token)

  return vocab

In [26]:
#training and utilization of wizard of oz
corpus = populate_corpus("wizard_of_oz.txt")
word_freqs = analyze_word_freqs(corpus)
alphabet = generate_alphabet(word_freqs)
vocab = generate_vocab(alphabet)
splits = generate_splits(word_freqs)
wizard_oz_model = train_model(vocab, 5000, splits)

['The Project Gutenberg eBook of The Wonderful Wizard of Oz', 'This ebook is for the use of anyone anywhere in the United States and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. You may copy it, give it away or re-use it under the terms', 'of the Project Gutenberg License included with this ebook or online', 'at www.gutenberg.org. If you are not located in the United States,', 'you will have to check the laws of the country where you are located', 'before using this eBook.', 'Title: The Wonderful Wizard of Oz', 'Author: L. Frank Baum', 'Release date: February 1, 1993 [eBook #55]', 'Most recently updated: December 29, 2024', 'Language: English', '*** START OF THE PROJECT GUTENBERG EBOOK THE WONDERFUL WIZARD OF OZ ***', '[Illustration]', 'The Wonderful Wizard of Oz', 'by L. Frank Baum', 'This book is dedicated to my good friend & comrade', 'My Wife', 'L.F.B.', 'Contents', 'Introduction', 'Chapter I. The Cyclone', 'Chapter II. The Council with

In [27]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as file:
    text = file.read()

wizard_of_oz_tokens = tokenize(text)

print(wizard_of_oz_tokens)

['Th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'eBook', 'of', 'Th', '##e', 'Wond', '##e', '##rful', 'Wizard', 'of', 'Oz', 'This', 'ebook', 'is', 'for', 'th', '##e', 'us', '##e', 'of', 'anyon', '##e', 'anywh', '##e', '##r', '##e', 'in', 'th', '##e', 'Unit', '##e', '##d', 'Stat', '##e', '##s', 'and', 'most', 'oth', '##e', '##r', 'part', '##s', 'of', 'th', '##e', 'world', 'a', '##t', 'no', 'cost', 'and', 'with', 'almost', 'no', 'r', '##e', '##strictions', 'whatso', '##e', '##v', '##e', '##r', '.', 'You', 'may', 'copy', 'it', ',', 'giv', '##e', 'it', 'away', 'or', 'r', '##e', '-', 'us', '##e', 'it', 'und', '##e', '##r', 'th', '##e', 't', '##e', '##r', '##m', '##s', 'of', 'th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'Lic', '##e', '##n', '##s', '##e', 'includ', '##e', '##d', 'with', 'this', 'ebook', 'or', 'onlin', '##e', 'a', '##t', 'www', '.', 'gut', '##e', '##nb', '##e', '##r', '##g', '.', 'org', '.', 'If', 'you', 'a', '##r

In [28]:
print(type(wizard_of_oz_tokens))

<class 'list'>


In [29]:
print(len(wizard_of_oz_tokens))

95456


In [30]:
print(len(court_of_appeal_tokens))

47927


In [31]:
for i in range(10):
  print(wizard_of_oz_tokens[i])

Th
##e
Proj
##e
##ct
Gut
##e
##nb
##e
##r


In [32]:
unique_tokens_oz = set(wizard_of_oz_tokens)
print(len(unique_tokens_oz))

2351


In [33]:
unique_tokens_court = set(court_of_appeal_tokens)
print(len(unique_tokens_court))

2309


In [34]:
common_tokens = unique_tokens_oz & unique_tokens_court
print(f"Tokens in both sets ({len(common_tokens)}):")
i = 0
for token in common_tokens:
    if i < 10:
        print(token)
        i += 1

Tokens in both sets (621):
Unl
To
809
making
limitation
50
physical
801
using
polic


In [35]:
# Tokens only in Wizard of Oz
only_in_oz = unique_tokens_oz - unique_tokens_court
print(f"\nTokens only in Wizard of Oz ({len(only_in_oz)}):")
# for token in only_in_oz:
#     print(token)


Tokens only in Wizard of Oz (1730):


In [36]:
# Tokens only in Court of Appeal
only_in_court = unique_tokens_court - unique_tokens_oz
print(f"\nTokens only in Court of Appeal ({len(only_in_court)}):")
# for token in only_in_court:
#     print(token)


Tokens only in Court of Appeal (1688):


Beginning of PSET 2!!

Now We want to Make use of CBOW on both legal and Wizard of Oz tokens

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define CBOW model
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        # Embedding layer: Maps words to dense vectors
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        # Linear layer: Maps the summed embeddings to vocab space for prediction
        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, context):
        """
        Forward pass of CBOW model
        :param context: Tensor of word indices forming the context
        :return: Logits for vocabulary words
        """
        # Lookup embeddings for context words and sum them up
        context_embeds = self.embeddings(context).sum(dim=1)  # Shape: (batch_size, embed_size)
        # Pass the summed embeddings through the linear layer
        output = self.linear(context_embeds)  # Shape: (batch_size, vocab_size)
        return output


In [38]:
def tokensToVector(model_name, tokens, epochs = 500):
  # Define the context size (number of words before and after target word)
  context_size = 10

  # Load and preprocess data
  tokens = list(tokens)

  #vocab is the unique words, since we pass in the unique tokens
  # they are the same
  vocab = tokens

  # Mapping words to indices
  word_to_index = {word: i for i, word in enumerate(vocab)}

  # Prepare training data
  data = []
  for i in range(context_size, len(tokens) - context_size):
      # Select 10 words before and 10 words after the target word
      context = [word_to_index[word] for word in (tokens[i - context_size:i] + tokens[i + 1:i + context_size + 1])]

      # Target word is the word in the middle
      target = word_to_index[tokens[i]]
      data.append((torch.tensor(context), torch.tensor(target)))

  # Hyperparameters
  vocab_size = len(vocab)
  embed_size = 10
  learning_rate = 0.01

  if model_name == "cbow":
    # Initialize CBOW model
    cbow_model = CBOWModel(vocab_size, embed_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        for context, target in data:
            optimizer.zero_grad()
            output = cbow_model(context.unsqueeze(0))
            loss = criterion(output, target.unsqueeze(0))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss}")

    print("\nWord Embeddings after Training:")
    sample_words = list(vocab)[:5]

    for word in sample_words:
        word_index = word_to_index[word]
        embedding = cbow_model.embeddings(torch.tensor([word_index])).detach().numpy()
        print(f"Embedding for '{word}': {embedding}")


  return word_to_index, vocab, vocab_size, cbow_model.embeddings


In [39]:
cbow_wizardof_oz_result = tokensToVector("cbow", unique_tokens_oz, epochs = 300)

Epoch 1, Loss: 23897.12065601349
Epoch 2, Loss: 18865.728188991547
Epoch 3, Loss: 15717.363018989563
Epoch 4, Loss: 13532.189851760864
Epoch 5, Loss: 11915.749710083008
Epoch 6, Loss: 10660.318276762962
Epoch 7, Loss: 9648.694600701332
Epoch 8, Loss: 8811.397227406502
Epoch 9, Loss: 8104.337235093117
Epoch 10, Loss: 7497.973289489746
Epoch 11, Loss: 6971.536922514439
Epoch 12, Loss: 6509.677994787693
Epoch 13, Loss: 6100.567340552807
Epoch 14, Loss: 5734.891860842705
Epoch 15, Loss: 5405.280753314495
Epoch 16, Loss: 5105.870178818703
Epoch 17, Loss: 4831.963195443153
Epoch 18, Loss: 4579.7962809205055
Epoch 19, Loss: 4346.3568686544895
Epoch 20, Loss: 4129.224882602692
Epoch 21, Loss: 3926.438566237688
Epoch 22, Loss: 3736.391959667206
Epoch 23, Loss: 3557.758839428425
Epoch 24, Loss: 3389.437775403261
Epoch 25, Loss: 3230.5062970519066
Epoch 26, Loss: 3080.1882348656654
Epoch 27, Loss: 2937.822212576866
Epoch 28, Loss: 2802.8401940762997
Epoch 29, Loss: 2674.749083250761
Epoch 30, Los

In [40]:
cbow_court_result = tokensToVector("cbow", unique_tokens_court, epochs = 300)

Epoch 1, Loss: 23360.079651594162
Epoch 2, Loss: 18625.76502609253
Epoch 3, Loss: 15616.489369869232
Epoch 4, Loss: 13493.675408363342
Epoch 5, Loss: 11904.538829088211
Epoch 6, Loss: 10660.944940567017
Epoch 7, Loss: 9653.101948738098
Epoch 8, Loss: 8814.601304650307
Epoch 9, Loss: 8103.236809253693
Epoch 10, Loss: 7490.639658093452
Epoch 11, Loss: 6956.800292134285
Epoch 12, Loss: 6486.981072068214
Epoch 13, Loss: 6069.919672369957
Epoch 14, Loss: 5696.729587674141
Epoch 15, Loss: 5360.272729277611
Epoch 16, Loss: 5054.797064065933
Epoch 17, Loss: 4775.67677795887
Epoch 18, Loss: 4519.188277840614
Epoch 19, Loss: 4282.318504810333
Epoch 20, Loss: 4062.607581079006
Epoch 21, Loss: 3858.026792138815
Epoch 22, Loss: 3666.8837189376354
Epoch 23, Loss: 3487.7549990713596
Epoch 24, Loss: 3319.4369193911552
Epoch 25, Loss: 3160.9106660485268
Epoch 26, Loss: 3011.3124030828476
Epoch 27, Loss: 2869.907562792301
Epoch 28, Loss: 2736.0674277842045
Epoch 29, Loss: 2609.2493368387222
Epoch 30, Lo

In [41]:
import torch
import numpy as np

In [42]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


Calculate Cosine Similarity of different tokens

In [43]:
def compute_cosine_similarity(vocab, word_index_map, embeddings):
  vocab_list = list(word_index_map.keys())
  num_words = len(vocab_list)
  print(f"num of words {num_words}")


  # Initialize similarity matrix
  similarity_matrix = np.zeros((num_words, num_words))

  # Compute pairwise similarity
  for i in range(num_words):
      for j in range(num_words):
          if i == j:
              similarity_matrix[i, j] = 1.0
          else:
              vec1 = embeddings(torch.tensor([word_index_map[vocab_list[i]]])).detach().numpy().flatten()
              vec2 = embeddings(torch.tensor([word_index_map[vocab_list[j]]])).detach().numpy().flatten()
              similarity_matrix[i, j] = cosine_similarity(vec1, vec2)

  # Print similarity matrix for first 5 occuring tokens
  print("\nWord Similarity Matrix (Top 5 Words):")
  for i in range(min(5, num_words)):
      print(f"{vocab_list[i]:<12}:", " ".join(f"{similarity_matrix[i, j]:.2f}" for j in range(min(5, num_words))))

  return similarity_matrix


In [44]:
cosine_input = cbow_wizardof_oz_result
word_to_index_map = cosine_input[0]
vocab = cosine_input[1]
vocab_size = cosine_input[2]
embeddings = cosine_input[3]

wizard_of_oz_similarity_matrix = compute_cosine_similarity(vocab, word_to_index_map, embeddings)

num of words 2351

Word Similarity Matrix (Top 5 Words):
thoughtful  : 1.00 0.31 -0.02 0.32 -0.39
rob         : 0.31 1.00 0.07 -0.33 0.14
strips      : -0.02 0.07 1.00 0.29 0.52
cornfi      : 0.32 -0.33 0.29 1.00 0.18
bottom      : -0.39 0.14 0.52 0.18 1.00


In [45]:
cosine_input2 = cbow_court_result
word_to_index_map2 = cosine_input2[0]
vocab2 = cosine_input2[1]
vocab_size2 = cosine_input2[2]
embeddings2 = cosine_input2[3]

print(vocab2)
print(word_to_index_map2)
print(vocab_size2)
print(len(vocab2))

court_similarity_matrix = compute_cosine_similarity(vocab2, word_to_index_map2, embeddings2)

['553', 'tation', '621', 'Honig', 'polic', 'pow', '##ncy', '##gulat', 'constitution', 'fi', 'stitutional', 'calibrat', '24', 'willing', '001', '##nial', 'two', 'Th', 'blud', 'compos', '121', '##aj', 'tration', 'Court', 'indulg', 'Transmission', 'manag', 'employ', 'Sp', 'complaint', 'withhold', 'suffici', 'infliction', 'Judi', 'still', 'particular', 'v', 'Hous', 'applicability', 'Evid', '20', 'participat', 'Mich', 'Jacob', 'sanctions', 'full', 'envision', 'quir', '##sponding', 'altog', 'Hamiltonian', 'aggrandiz', '##chanism', 'improv', 'CA4', 'principl', '##thing', '654', 'discussing', 'explicit', 'supra', '1971', '768', '7th', '1507', '1038', '##gal', '43', 'Law', 'claim', '196', 'mosph', 'unc', 'political', 'Harriss', 'f', 'absurd', 'pr', 'socially', 'unconn', '38', 'explanation', '##gativ', '32', '##coming', 'passag', '481', 'occasional', '##mis', 'scant', 'front', 'Val', 'proposition', 'Pr', 'tabl', 'taking', '##ously', 'controv', 'al', '##parabl', 'Christ', '402', 'conspiring', 'so

In [46]:
!pip install gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt') # Download the tokenizer models if not already downloaded

tokenized_corpus = unique_tokens_oz

skipgram_model = Word2Vec(sentences=[tokenized_corpus],
						vector_size=100, # Dimensionality of the word vectors
						window=5,		 # Maximum distance between the current and predicted word within a sentence
						sg=1,			 # Skip-Gram model (1 for Skip-Gram, 0 for CBOW)
						min_count=1,	 # Ignores all words with a total frequency lower than this
						workers=4)	 # Number of CPU cores to use for training the model

# Training
skipgram_model.train([tokenized_corpus], total_examples=1, epochs=10)
skipgram_model.save("skipgram_model.model")
loaded_model = Word2Vec.load("skipgram_model.model")
vector_representation = loaded_model.wv['word']
print("Vector representation of 'word':", vector_representation)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Vector representation of 'word': [ 8.2368497e-03 -8.8071348e-03  8.7069450e-03 -2.8058644e-03
  1.2951762e-03 -1.0196010e-02 -9.8477844e-03  7.9350844e-03
 -4.7674384e-03  7.8753317e-03 -1.6648249e-03  5.3632343e-03
 -5.5192113e-03  3.1825856e-03 -9.6821720e-03  7.7819577e-03
  9.6528744e-03  6.1155921e-03 -7.3172650e-03  8.3201146e-03
  4.5563886e-04 -1.0733882e-02 -4.2400113e-03  1.1654615e-03
  6.0512970e-04 -8.3995182e-03 -4.5882021e-03 -1.5760470e-03
  1.4405140e-04 -6.5565901e-03  7.6375245e-03 -4.5728716e-03
 -2.3335274e-03  4.1756085e-03  4.8737265e-03  1.0770585e-04
  3.6397253e-04  3.1986199e-03  1.5739360e-04  3.3117477e-03
 -5.2760420e-03 -4.2750253e-05 -8.0293734e-03  4.9606655e-03
  1.5349000e-03 -7.6633063e-03  2.7381578e-03 -6.0321144e-03
  6.2992348e-04 -3.6681262e-03  8.4674871e-03  7.8716697e-03
 -4.3236688e-03 -6.1473185e-03 -1.9297421e-03  7.8607732e-03
  1.7807382e-03  1.2756066e-03 -2.3521446e-03  2.5587785e-04
  6.2825734e-04  1.1816077e-03  6.9248392e-03  7.909

In [47]:
!pip install gensim nltk torch

import torch
import torch.nn as nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [48]:

def train_word_embeddings(tokens, vector_size=10, window=5, min_count=1, epochs=200, workers=4):
    """
    Trains a word embedding model using CBOW or Skip-Gram.

    """

    # Ensure tokens are in list format
    if isinstance(tokens, str):
        tokens = word_tokenize(tokens)

    # Create vocabulary and word-to-index mapping
    vocab = set(tokens)
    word_to_index = {word: i for i, word in enumerate(vocab)}

    # Train Word2Vec model
    model = Word2Vec(sentences=[tokens], vector_size=vector_size, window=window, sg=1,
                     min_count=min_count, workers=workers)

    # Train model
    model.train([tokens], total_examples=1, epochs=epochs)

    # Retrieve trained embeddings as a PyTorch tensor
    vocab_size = len(vocab)
    embeddings = torch.tensor([model.wv[word] for word in vocab], dtype=torch.float)

    return word_to_index, vocab, vocab_size, embeddings



In [49]:
word_to_index3, vocab3, vocab_size3, embeddings3 = train_word_embeddings(unique_tokens_oz)

  embeddings = torch.tensor([model.wv[word] for word in vocab], dtype=torch.float)


In [50]:
word_to_index4, vocab4, vocab_size4, embeddings4 = train_word_embeddings(unique_tokens_court)



In [51]:
import torch
import torch.nn.functional as F

def compute_cosine_similarity2(vocab, word_index_map, embeddings):
    """
    Compute cosine similarity between different tokens.
    """
    vocab_list = list(word_index_map.keys())
    num_words = len(vocab_list)
    print(f"Number of words: {num_words}")

    # Initialize similarity matrix
    similarity_matrix = torch.zeros((num_words, num_words))

    # Compute pairwise similarity
    for i in range(num_words):
        for j in range(num_words):
            if i == j:
                similarity_matrix[i, j] = 1.0
            else:
                # Retrieve word indices
                idx1 = word_index_map[vocab_list[i]]
                idx2 = word_index_map[vocab_list[j]]

                # Get word embeddings
                vec1 = embeddings[idx1]
                vec2 = embeddings[idx2]

                # Compute cosine similarity
                similarity_matrix[i, j] = F.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0))

    # Print similarity matrix for the first 5 occurring tokens
    print("\nWord Similarity Matrix (Top 5 Words):")
    for i in range(min(5, num_words)):
        print(f"{vocab_list[i]:<12}:", " ".join(f"{similarity_matrix[i, j]:.2f}" for j in range(min(5, num_words))))

    return similarity_matrix


In [52]:
res = compute_cosine_similarity2(vocab3, word_to_index3, embeddings3)

Number of words: 2351

Word Similarity Matrix (Top 5 Words):
thoughtful  : 1.00 1.00 1.00 1.00 0.99
rob         : 1.00 1.00 1.00 1.00 1.00
strips      : 1.00 1.00 1.00 1.00 1.00
cornfi      : 1.00 1.00 1.00 1.00 1.00
bottom      : 0.99 1.00 1.00 1.00 1.00


In [53]:
res2 = compute_cosine_similarity2(vocab4, word_to_index4, embeddings4)

Number of words: 2309

Word Similarity Matrix (Top 5 Words):
553         : 1.00 1.00 1.00 0.99 0.98
tation      : 1.00 1.00 1.00 1.00 0.99
621         : 1.00 1.00 1.00 1.00 0.99
Honig       : 0.99 1.00 1.00 1.00 1.00
polic       : 0.98 0.99 0.99 1.00 1.00


In [54]:
import numpy as np
import torch

def find_discrepant_pairs(similarity_matrix_skipgram, similarity_matrix_cbow, index_to_word, skipgram_threshold=0.8, cbow_threshold=-0.4):
    """
    Identify pairs of words that are highly similar in one embedding (e.g., Skipgram) but not in the other (e.g., CBOW).
    """
    discrepant_pairs = []

    # Iterate through all pairs of words
    for i in range(similarity_matrix_skipgram.shape[0]):
        for j in range(i + 1, similarity_matrix_skipgram.shape[1]):
            skipgram_sim = similarity_matrix_skipgram[i, j]
            cbow_sim = similarity_matrix_cbow[i, j].item()  # Convert tensor to float

            # Check if the pair meets the specified conditions
            if skipgram_sim >= skipgram_threshold and cbow_sim <= cbow_threshold:
                word1 = index_to_word.get(i, f"Word_{i}")
                word2 = index_to_word.get(j, f"Word_{j}")
                discrepant_pairs.append((word1, word2, skipgram_sim, cbow_sim))

    return discrepant_pairs


In [55]:
# Create index_to_word by reversing the word_to_index dictionary
index_to_word3 = {index: word for word, index in word_to_index4.items()}


discrepant_pairs = find_discrepant_pairs(res, wizard_of_oz_similarity_matrix, index_to_word3)
print("Discrepant Pairs:")
for pair in discrepant_pairs:
    print(f"Words: {pair[0]} and {pair[1]}, Skipgram Similarity: {pair[2]:.2f}, CBOW Similarity: {pair[3]:.2f}")


Discrepant Pairs:
Words: 553 and fi, Skipgram Similarity: 0.95, CBOW Similarity: -0.43
Words: 553 and stitutional, Skipgram Similarity: 0.93, CBOW Similarity: -0.42
Words: 553 and egr, Skipgram Similarity: 0.80, CBOW Similarity: -0.41
Words: 553 and sanction, Skipgram Similarity: 0.80, CBOW Similarity: -0.58
Words: 553 and scholarly, Skipgram Similarity: 0.81, CBOW Similarity: -0.43
Words: 553 and 845, Skipgram Similarity: 0.80, CBOW Similarity: -0.80
Words: 553 and excluding, Skipgram Similarity: 0.90, CBOW Similarity: -0.65
Words: 553 and On, Skipgram Similarity: 0.85, CBOW Similarity: -0.60
Words: 553 and ##iv, Skipgram Similarity: 0.84, CBOW Similarity: -0.59
Words: tation and fi, Skipgram Similarity: 0.95, CBOW Similarity: -0.57
Words: tation and ##nial, Skipgram Similarity: 0.80, CBOW Similarity: -0.41
Words: tation and scholarly, Skipgram Similarity: 0.82, CBOW Similarity: -0.53
Words: tation and 845, Skipgram Similarity: 0.81, CBOW Similarity: -0.55
Words: tation and excluding,

In [56]:
# Create index_to_word by reversing the word_to_index dictionary
index_to_word4 = {index: word for word, index in word_to_index4.items()}

discrepant_pairs = find_discrepant_pairs(res2, court_similarity_matrix, index_to_word4)
print("Discrepant Pairs:")
for pair in discrepant_pairs:
    print(f"Words: {pair[0]} and {pair[1]}, Skipgram Similarity: {pair[2]:.2f}, CBOW Similarity: {pair[3]:.2f}")

Discrepant Pairs:
Words: 553 and submission, Skipgram Similarity: 0.87, CBOW Similarity: -0.42
Words: 553 and 12, Skipgram Similarity: 0.84, CBOW Similarity: -0.58
Words: 553 and authorizing, Skipgram Similarity: 0.83, CBOW Similarity: -0.59
Words: 553 and ##alth, Skipgram Similarity: 0.83, CBOW Similarity: -0.40
Words: 553 and pass, Skipgram Similarity: 0.85, CBOW Similarity: -0.54
Words: 553 and Cl, Skipgram Similarity: 0.85, CBOW Similarity: -0.51
Words: 553 and 636, Skipgram Similarity: 0.87, CBOW Similarity: -0.58
Words: tation and pow, Skipgram Similarity: 0.98, CBOW Similarity: -0.40
Words: tation and constitution, Skipgram Similarity: 0.88, CBOW Similarity: -0.52
Words: tation and fi, Skipgram Similarity: 0.83, CBOW Similarity: -0.53
Words: tation and 12, Skipgram Similarity: 0.85, CBOW Similarity: -0.63
Words: tation and footnot, Skipgram Similarity: 0.81, CBOW Similarity: -0.41
Words: 621 and constitution, Skipgram Similarity: 0.88, CBOW Similarity: -0.57
Words: 621 and 217, 

In [57]:
import numpy as np

def find_max_disparity_pair(similarity_matrix_skipgram, similarity_matrix_cbow, index_to_word):
    """
    Identify the pair of words with the largest disparity between two similarity matrices.
    """
    # Compute the absolute difference between the two similarity matrices
    disparity_matrix = np.abs(similarity_matrix_skipgram - similarity_matrix_cbow)

    # Find the indices of the maximum disparity
    max_disparity_index = np.unravel_index(np.argmax(disparity_matrix), disparity_matrix.shape)

    # Retrieve the words corresponding to these indices
    word1 = index_to_word.get(max_disparity_index[0], f"Word_{max_disparity_index[0]}")
    word2 = index_to_word.get(max_disparity_index[1], f"Word_{max_disparity_index[1]}")

    # Get the similarity scores for this pair in both models
    skipgram_sim = similarity_matrix_skipgram[max_disparity_index]
    cbow_sim = similarity_matrix_cbow[max_disparity_index]

    return (word1, word2, skipgram_sim, cbow_sim)


In [60]:
# Find the pair with the largest disparity
word1, word2, skipgram_sim, cbow_sim = find_max_disparity_pair(res, wizard_of_oz_similarity_matrix, index_to_word3)

print(f"Words: {word1} and {word2}")
print(f"Skipgram Similarity: {skipgram_sim:.2f}")
print(f"CBOW Similarity: {cbow_sim:.2f}")


Words: disqualification and 87
Skipgram Similarity: 1.00
CBOW Similarity: -0.86


In [59]:
# Find the pair with the largest disparity
word1, word2, skipgram_sim, cbow_sim = find_max_disparity_pair(res2, court_similarity_matrix, index_to_word4)

print(f"Words: {word1} and {word2}")
print(f"Skipgram Similarity: {skipgram_sim:.2f}")
print(f"CBOW Similarity: {cbow_sim:.2f}")

Words: embroil and Mont
Skipgram Similarity: 0.95
CBOW Similarity: -0.88
