# Experiemnt Notebook

In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [8]:
import sentencepiece as spm
import torch

# Path to your dataset
dataset_path = "amazon_review.txt"

# Path to save the SentencePiece model
model_path = "amazon_review_model"

# Define SentencePiece parameters
vocab_size = 300  # You can adjust this based on your dataset and requirements
model_type = "unigram"  # You can use other model types like "bpe", "char", etc.

# Train SentencePiece model
spm.SentencePieceTrainer.train(input=dataset_path, model_prefix=model_path, vocab_size=vocab_size, model_type=model_type)

# Load trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(f"{model_path}.model")

# Tokenize a sentence using SentencePiece
sentence = "This is a sample review from the Amazon dataset."
tokens = sp.encode_as_pieces(sentence)
print(tokens)

# Convert tokens to IDs for neural network input
token_ids = sp.encode_as_ids(sentence)
print(token_ids)

# Convert token IDs to tensors for PyTorch
token_tensors = torch.tensor(token_ids)
print(token_tensors)


['▁This', '▁', 'is', '▁a', '▁s', 'a', 'mpl', 'e', '▁re', 'v', 'i', 'e', 'w', '▁fr', 'om', '▁the', '▁A', 'm', 'a', 'z', 'on', '▁', 'd', 'at', 'as', 'e', 't', '.']
[231, 3, 70, 13, 18, 22, 249, 16, 77, 166, 49, 16, 42, 51, 59, 5, 67, 117, 22, 0, 119, 3, 6, 15, 47, 16, 8, 11]
tensor([231,   3,  70,  13,  18,  22, 249,  16,  77, 166,  49,  16,  42,  51,
         59,   5,  67, 117,  22,   0, 119,   3,   6,  15,  47,  16,   8,  11])


In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m53.3 MB/s[0m eta [36m0:00:0

In [11]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(f"{model_path}.model")

# Load pretrained BERT model and tokenizer
bert_model_name = "bert-base-uncased"
num_labels=2
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_labels)

# Your SentencePiece tokenized sentence
sentence = "This is a sample review from the Amazon dataset."
tokens = sp.encode_as_pieces(sentence)

# Convert SentencePiece tokens to BERT tokens
bert_tokens = tokenizer.convert_tokens_to_ids(tokens)

# Convert to PyTorch tensors
input_ids = torch.tensor([bert_tokens])
attention_mask = torch.ones(input_ids.shape)

# Forward pass through BERT model
outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits

# Perform classification on logits
predicted_class = torch.argmax(logits, dim=1)

print("Predicted class:", predicted_class.item())


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: 0


In [15]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5839, -0.1824]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
tokens

['▁This',
 '▁',
 'is',
 '▁a',
 '▁s',
 'a',
 'mpl',
 'e',
 '▁re',
 'v',
 'i',
 'e',
 'w',
 '▁fr',
 'om',
 '▁the',
 '▁A',
 'm',
 'a',
 'z',
 'on',
 '▁',
 'd',
 'at',
 'as',
 'e',
 't',
 '.']

In [13]:
bert_tokens

[100,
 100,
 2003,
 100,
 100,
 1037,
 100,
 1041,
 100,
 1058,
 1045,
 1041,
 1059,
 100,
 18168,
 100,
 100,
 1049,
 1037,
 1062,
 2006,
 100,
 1040,
 2012,
 2004,
 1041,
 1056,
 1012]

In [14]:
attention_mask

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# Download LAMBADA from https://zenodo.org/record/2630551 and place in examples/lambada-dataset
!curl -O https://zenodo.org/record/2630551/files/lambada-dataset.tar.gz
!tar -xzf lambada-dataset.tar.gz --one-top-level
!ls lambada-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  319M  100  319M    0     0  15.6M      0  0:00:20  0:00:20 --:--:-- 18.4M
lambada_control_test_data_plain_text.txt  lambada-vocab-2.txt
lambada_development_plain_text.txt	  readme-up.txt
lambada_test_plain_text.txt		  train-novels.tar


In [12]:
import os
import pandas as pd

registry_path = os.path.join("..", "evals", "registry")
os.makedirs(os.path.join(registry_path, "data", "lambada"), exist_ok=True)



In [14]:

!ls ../evals/registry

data


In [None]:
registry_path

'../evals/registry'

In [None]:
def create_chat_prompt(text):
    return [
        {"role": "system", "content": "Please complete the passages with the correct next word."},
        {"role": "user", "content": text}
    ]

df = pd.read_csv('lambada-dataset/lambada_test_plain_text.txt', sep="\t", names=["text"])
df["text"] = df["text"].str.split(" ")
df["input"], df["ideal"] = df["text"].str[:-1].str.join(" ").apply(create_chat_prompt), df["text"].str[-1]
df = df[["input", "ideal"]]
df.to_json(os.path.join(registry_path, "data/lambada/samples.jsonl"), orient="records", lines=True)
display(df.head())

Unnamed: 0,input,ideal
0,"[{'role': 'system', 'content': 'Please complet...",signs
1,"[{'role': 'system', 'content': 'Please complet...",shane
2,"[{'role': 'system', 'content': 'Please complet...",insurance
3,"[{'role': 'system', 'content': 'Please complet...",helen
4,"[{'role': 'system', 'content': 'Please complet...",chains


In [None]:



eval_yaml = """
lambada:
  id: lambada.test.v1
  metrics: [accuracy]
lambada.test.v1:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: lambada/samples.jsonl
""".strip()
with open(os.path.join(registry_path, "evals", "lambada.yaml"), "w") as f:
    f.write(eval_yaml)

FileNotFoundError: ignored

In [15]:
eval_yaml

'lambada:\n  id: lambada.test.v1\n  metrics: [accuracy]\nlambada.test.v1:\n  class: evals.elsuite.basic.match:Match\n  args:\n    samples_jsonl: lambada/samples.jsonl'

In [16]:
!oaieval gpt-3.5-turbo lambada --max_samples 20

/bin/bash: line 1: oaieval: command not found


In [17]:
!pip install oaieval

[31mERROR: Could not find a version that satisfies the requirement oaieval (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for oaieval[0m[31m
[0m

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer
url = "microsoft/BiomedVLP-BioViL-T"
tokenizer = AutoTokenizer.from_pretrained(url, trust_remote_code=True)
model = AutoModel.from_pretrained(url, trust_remote_code=True)

# Input text prompts describing findings.
# The order of prompts is adjusted to capture the spectrum from absence of a finding to its temporal progression.
text_prompts = ["No pleural effusion or pneumothorax is seen.",
                "There is no pneumothorax or pleural effusion.",
                "The extent of the pleural effusion is reduced.",
                "The extent of the pleural effusion remains constant.",
                "Interval enlargement of pleural effusion."]

# Tokenize and compute the sentence embeddings
with torch.no_grad():
    tokenizer_output = tokenizer.batch_encode_plus(batch_text_or_text_pairs=text_prompts,
                                                   add_special_tokens=True,
                                                   padding='longest',
                                                   return_tensors='pt')
    embeddings = model.get_projected_text_embeddings(input_ids=tokenizer_output.input_ids,
                                                 attention_mask=tokenizer_output.attention_mask)

    # Compute the cosine similarity of sentence embeddings obtained from input text prompts.
    sim = torch.mm(embeddings, embeddings.t())
