# Install packages

In [None]:
!pip install transformers==4.32.0 accelerate tiktoken einops scipy transformers_stream_generator==0.0.4 peft deepspeed dataset SentencePiece

Collecting transformers==4.32.0
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers_stream_generator==0.0.4
  Downloading transformers-stream-generator-0.0.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m


# Loading dataset

In [None]:
from datasets import load_dataset

# Load the "econometrics" configuration of the "Stevross/mmlu" dataset.
# Later cells read the "test" split and its 'question', 'choices' and
# 'answer' columns.
dataset = load_dataset("Stevross/mmlu", "econometrics")

# Define utilities

In [None]:
from torch.utils.data import Dataset
# Question Answering
class KeyDatasetQA(Dataset):
    """Expose a column-indexable dataset as ``(question, choices_text)`` pairs.

    Args:
        dataset: Object supporting ``len(dataset)`` and ``dataset[column][i]``.
        question: Name of the column read for the question.
        choices: Name of the column read for the list of answer choices.
    """

    def __init__(self, dataset: Dataset, question: str, choices: str):
        self.dataset, self.question, self.choices = dataset, question, choices

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """Return the i-th question and its choices rendered as ``"0: a 1: b "``."""
        # NOTE(review): column-first indexing (`dataset[col][i]`) may materialize
        # the whole column on every access for Hugging Face datasets - confirm.
        options = self.dataset[self.choices][i]
        rendered = ''.join(
            f'{str(idx)}: {text} ' for idx, text in enumerate(options)
        )
        return self.dataset[self.question][i], rendered

In [None]:
from torch.utils.data import Dataset
# Text generation
class KeyDatasetTG(Dataset):
    """Expose a column-indexable dataset as single-string generation prompts.

    Each item is the question followed by every choice rendered as
    ``" <index>: <choice> "`` and terminated with ``". The answer is "``.

    Args:
        dataset: Object supporting ``len(dataset)`` and ``dataset[column][i]``.
        question: Name of the column read for the question.
        choices: Name of the column read for the list of answer choices.
    """

    def __init__(self, dataset: Dataset, question: str, choices: str):
        self.dataset, self.question, self.choices = dataset, question, choices

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """Build and return the prompt string for row ``i``."""
        # NOTE(review): column-first indexing (`dataset[col][i]`) may materialize
        # the whole column on every access for Hugging Face datasets - confirm.
        stem = self.dataset[self.question][i]
        options = self.dataset[self.choices][i]
        listing = ''.join(
            f' {str(idx)}: {text} ' for idx, text in enumerate(options)
        )
        return stem + listing + ". The answer is "

In [None]:
# Return: accuracy
def evaluate(answers: list, model_answers: list):
  """Return the fraction of model answers that equal the reference answers.

  Each reference answer is converted with ``str`` before the comparison, so
  non-string labels can be compared against string predictions.

  Args:
    answers: Reference labels; needs at least ``len(model_answers)`` items.
      Extra trailing items are ignored.
    model_answers: Predicted labels, compared position by position.

  Returns:
    ``correct / len(model_answers)``, or ``0.0`` when ``model_answers`` is
    empty (previously a ZeroDivisionError).

  Raises:
    IndexError: If ``answers`` is shorter than ``model_answers``.
  """
  total = len(model_answers)
  if total == 0:
    return 0.0
  if len(answers) < total:
    # Keep the original failure mode instead of silently truncating via zip.
    raise IndexError("answers is shorter than model_answers")
  correct = sum(
    str(gold) == predicted for gold, predicted in zip(answers, model_answers)
  )
  return correct / total

# Models

## vihangd/shearedplats-2.7b-v2

In [None]:
from transformers import pipeline

# Text-generation pipeline for "vihangd/shearedplats-2.7b-v2", capped at one
# newly generated token per prompt (max_new_tokens = 1).
# NOTE(review): unlike the smartyplats pipeline further down, no device_map is
# passed here - confirm which device this is expected to run on.
shearedplats = pipeline('text-generation', "vihangd/shearedplats-2.7b-v2", max_new_tokens = 1)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Smoke test: one hand-written prompt ending in "The answer is " so the single
# generated token is appended directly after it.
question = "Which one of the following is the most appropriate definition of a 99% confidence interval? 0: 99% of the time in repeated samples, the interval would contain the true value of the parameter, 1: 99% of the time in repeated samples, the interval would contain the estimated value of the parameter, 2: 99% of the time in repeated samples, the null hypothesis will be rejected, 3: 99% of the time in repeated samples, the null hypothesis will not be rejected when it was false .The answer is "
shearedplats(question)

[{'generated_text': 'Which one of the following is the most appropriate definition of a 99% confidence interval? 0: 99% of the time in repeated samples, the interval would contain the true value of the parameter, 1: 99% of the time in repeated samples, the interval would contain the estimated value of the parameter, 2: 99% of the time in repeated samples, the null hypothesis will be rejected, 3: 99% of the time in repeated samples, the null hypothesis will not be rejected when it was false .The answer is 2'}]

In [None]:
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [None]:
# Run every test-split prompt through shearedplats, keeping the raw pipeline
# output and the final character of the generated text as the predicted label.
# NOTE(review): taking only the last character gives a blank prediction when
# the generated text ends in whitespace - confirm this is acceptable.
prompts = KeyDatasetTG(dataset["test"], 'question', 'choices')
model_out, model_answer = [], []
for out in tqdm(shearedplats(prompts)):
  last_char = out[0]['generated_text'][-1]
  model_out.append(out)
  model_answer.append(last_char)
  print(last_char)

  0%|          | 0/114 [00:00<?, ?it/s]

2
1
2
0
2
2
1
0
1
2
1
1
2
2
1
2
2
1
1
3
0
2
2
1
2
2
1
1
2
0
2
2
2
0
2
1
2
1
1
2
1
2
1
0
1
1
2
1
1
2
0
1
2
2
2
1
2
2
2
2
1
0
2
3
1
2
1
2
1
2
2
2
1
2
0
2
2
2
1
3
1
1
3
0
2
1
1
2
0
1
2
0
0
2
2
2
3
2
0
0


0
1
2
2
1
0
2
0
1
3
2
1
2


In [None]:
# Accuracy of `model_answer` against the test split's 'answer' column.
gold_labels = dataset['test']['answer']
evaluate(gold_labels, model_answer)

0.2543859649122807

## vihangd/smartyplats-3b-v2

In [None]:
# SentencePiece is also listed in the install cell at the top of the notebook.
# NOTE(review): presumably required by the slow tokenizer (use_fast = False)
# loaded in the next cell - confirm, and consider keeping all installs in the
# top cell only.
!pip install SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99


In [None]:
from transformers import pipeline

# Text-generation pipeline for "vihangd/smartyplats-3b-v2", capped at one newly
# generated token per prompt (max_new_tokens = 1). device_map="auto" leaves
# device placement to the library; use_fast = False requests the slow tokenizer.
smartyplats = pipeline('text-generation', "vihangd/smartyplats-3b-v2", device_map="auto", use_fast = False, max_new_tokens = 1)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]



In [None]:
# Smoke test: the same hand-written prompt used for the previous model, ending
# in "The answer is " so the single generated token is appended after it.
question = "Which one of the following is the most appropriate definition of a 99% confidence interval? 0: 99% of the time in repeated samples, the interval would contain the true value of the parameter, 1: 99% of the time in repeated samples, the interval would contain the estimated value of the parameter, 2: 99% of the time in repeated samples, the null hypothesis will be rejected, 3: 99% of the time in repeated samples, the null hypothesis will not be rejected when it was false .The answer is "
smartyplats(question)

[{'generated_text': 'Which one of the following is the most appropriate definition of a 99% confidence interval? 0: 99% of the time in repeated samples, the interval would contain the true value of the parameter, 1: 99% of the time in repeated samples, the interval would contain the estimated value of the parameter, 2: 99% of the time in repeated samples, the null hypothesis will be rejected, 3: 99% of the time in repeated samples, the null hypothesis will not be rejected when it was false .The answer is 2'}]

In [None]:
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [None]:
# Run every test-split prompt through smartyplats, keeping the raw pipeline
# output and the final character of the generated text as the predicted label.
# NOTE(review): taking only the last character gives a blank prediction when
# the generated text ends in whitespace - confirm this is acceptable.
prompts = KeyDatasetTG(dataset["test"], 'question', 'choices')
model_out, model_answer = [], []
for out in tqdm(smartyplats(prompts)):
  last_char = out[0]['generated_text'][-1]
  model_out.append(out)
  model_answer.append(last_char)
  print(last_char)

  0%|          | 0/114 [00:00<?, ?it/s]

2
3
2
2
2
2
2
2
2
2
3
3
2
1
2
3
2
1
1
2
2
2
3
2
1
2
2
1
3
2
2
3
2
2
3
2
3
1
2
1
3
3
2
2
2
1
4
2
2
3
2
1
1
3
3
2
2
2
2
1
2
1
3
2
2
2
2
3
3
3
3
3
2
2
2
1
3
2
2
1
3
3
2
2
2
1
1
2
2
2
3
2
3
3
3
2
2
3
2
0
3
1
2
3
2
2
2
3
2
3
2
3
2
3


In [None]:
# Return: accuracy
def evaluate(answers: list, model_answers: list):
  """Return the fraction of model answers that equal the reference answers.

  NOTE: this re-defines the `evaluate` function from the "Define utilities"
  section and shadows it from this cell onward.

  Each reference answer is converted with ``str`` before the comparison, so
  non-string labels can be compared against string predictions.

  Args:
    answers: Reference labels; needs at least ``len(model_answers)`` items.
      Extra trailing items are ignored.
    model_answers: Predicted labels, compared position by position.

  Returns:
    ``correct / len(model_answers)``, or ``0.0`` when ``model_answers`` is
    empty (previously a ZeroDivisionError).

  Raises:
    IndexError: If ``answers`` is shorter than ``model_answers``.
  """
  total = len(model_answers)
  if total == 0:
    return 0.0
  if len(answers) < total:
    # Keep the original failure mode instead of silently truncating via zip.
    raise IndexError("answers is shorter than model_answers")
  correct = sum(
    str(gold) == predicted for gold, predicted in zip(answers, model_answers)
  )
  return correct / total

In [None]:
# Accuracy of `model_answer` against the test split's 'answer' column.
gold_labels = dataset['test']['answer']
evaluate(gold_labels, model_answer)

0.23684210526315788

## timpal0l/mdeberta-v3-base-squad2

In [None]:
from transformers import pipeline

# Extractive question-answering pipeline for "timpal0l/mdeberta-v3-base-squad2".
# NOTE(review): `max_length = 1` does not appear to limit the answer span - the
# answers printed by the loop below run to dozens of characters. If a length
# cap was intended, `max_answer_len` may be the parameter to use - confirm.
qa_model = pipeline("question-answering", "timpal0l/mdeberta-v3-base-squad2", max_length = 1)

config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

{'score': 0.00012132546544307843, 'start': 0, 'end': 1, 'answer': '0'}

In [None]:
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [None]:
# Feed each test question to the QA model with the rendered choices string as
# its context, printing the raw result dict for every row.
qa_pairs = KeyDatasetQA(dataset["test"], 'question', 'choices')
for q, c in qa_pairs:
  out = qa_model(q, c)
  print(out)

{'score': 0.0001011548301903531, 'start': 0, 'end': 2, 'answer': '0:'}
{'score': 0.15939778089523315, 'start': 325, 'end': 367, 'answer': ' PP tests have good power in small samples'}
{'score': 0.4679064154624939, 'start': 115, 'end': 138, 'answer': ' symmetrical about zero'}
{'score': 0.0012031022924929857, 'start': 0, 'end': 2, 'answer': '0:'}
{'score': 6.35853098174266e-07, 'start': 103, 'end': 161, 'answer': ' In theory, the sample could be larger than the population'}
{'score': 0.001884917262941599, 'start': 286, 'end': 340, 'answer': ' null hypothesis of zero autocorrelation coefficients.'}
{'score': 2.0826997570111416e-05, 'start': 33, 'end': 91, 'answer': ' Includes as few variables as possible to explain the data'}
{'score': 4.8342636560505525e-09, 'start': 242, 'end': 253, 'answer': ' fat-tailed'}
{'score': 0.005365314427763224, 'start': 0, 'end': 24, 'answer': '0: (ii) and (iv) only 1:'}
{'score': 0.15942007303237915, 'start': 0, 'end': 15, 'answer': '0: Less than -1'}
{'sco