# Zero Scrolls
The goal of this notebook is to explore the ZeroScrolls dataset.  
There are two versions:
- the public version
- the Long Context (LC) team's version


In [1]:
import json
import os
import sys
from datetime import datetime
import random

import numpy as np
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import set_seed as hf_set_seed

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Later cells move tokenized inputs and the Hyena model onto this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [2]:
!pip install sentencepiece accelerate


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/d9/92/2d3aecf9f4a192968035880be3e2fc8b48d541c7128f7c936f430d6f96da/accelerate-0.23.0-py3-none-any.whl.metadata
  Downloading accelerate-0.23.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece, accelerate
[0mSuccessfully installed accelerate-0.23.0

In [None]:
# to run the hyena chckpoint, you need to install dropout_add_layer_norm from flash attention module
# see this issue : https://github.com/HazyResearch/hyena-dna/issues/5

cd flash-attention/csrc/layer_norm
!pip install .
cd ../../..

In [None]:
# cd flash-attention/csrc/fused_dense_lib
# ! pip install .

## ZeroScrolls public
Benchmark [Homepage](https://www.zero.scrolls-benchmark.com/) and [github](https://github.com/tau-nlp/zero_scrolls/tree/main)  


This section assumes the files have been downloaded and are located in the `datasets` folder.

In [None]:
zc_data_path='./datasets/zero-scrolls/public/'
# os.listdir returns entries in arbitrary order; sort them so that positional
# indexing into zc_tasks selects the same task on every machine and every run.
zc_tasks = sorted(os.listdir(zc_data_path))
zc_tasks

In [None]:
# Select one task by its position in zc_tasks (the index 4 is hard-coded).
zc_task=zc_tasks[4]
# Option 1 (disabled): load the splits from the local jsonl files
# data = load_dataset('json',data_files={'test':os.path.join(zc_data_path,zc_task,'test.jsonl'),'validation':os.path.join(zc_data_path,zc_task,'validation.jsonl')}) 
# Option 2 (active): download the task from Hugging Face
data = load_dataset("tau/zero_scrolls", zc_task)
# Show which splits were loaded.
data.keys()

In [None]:
# Report the number of examples in each split, test split first.
for split_name in ('test', 'validation'):
    print(len(data[split_name]))

In [None]:
# Keep the first validation example; later cells print it and feed it to the models.
example=data['validation'][0]
example

In [None]:
# Show the prompt text first, then the expected output of the validation example.
for field in ('input', 'output'):
    print(example[field])

In [None]:
# First example of the test split, to compare against the validation example.
tst_example = data['test'][0]
tst_example

In [None]:
# Print the raw prompt text of the test example.
print(tst_example['input'])

We see that the test set includes 500 samples and the validation set includes 20 samples.  
The difference between them is that the test set doesn't include the expected `output` (the value is `None`).

Next, let's see how to generate a prediction for the example, based on the code provided by ZeroScrolls (`run_hf_model.py`).

In [None]:
# Input-token budget per supported checkpoint; the value is looked up by model
# name and passed to process_model_input as max_tokens.
# Every checkpoint listed here shares the same 8192-token limit.
model_to_max_input_tokens = dict.fromkeys(
    [
        "google/flan-t5-xxl",
        "google/flan-t5-xl",
        "google/flan-t5-large",
        "google/flan-t5-base",
        "google/flan-t5-small",
        "google/flan-ul2",
        "bigscience/T0pp",
    ],
    8192,
)

def trim_doc_keeping_suffix(tokenizer, tokenized_input_full, example, suffix_index, max_tokens, device):
    """Trim an over-long tokenized input while keeping its trailing suffix intact.

    The text from ``suffix_index`` to the end of ``example['input']`` is
    re-tokenized together with ``example['truncation_seperator']`` and appended
    to the leading tokens of ``tokenized_input_full``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``input_ids``.
        tokenized_input_full: 2-D tensor of token ids for the whole input; it is
            sliced and concatenated along dim 1.
            NOTE(review): assumed to already live on ``device`` - confirm with callers.
        example: Mapping with ``'input'`` and ``'truncation_seperator'`` strings.
        suffix_index: Character offset in ``example['input']`` where the suffix starts.
        max_tokens: Token budget for the combined result.
        device: Device the separator/suffix token ids are moved to.

    Returns:
        Tensor with the leading document tokens followed by the separator and
        suffix tokens. When separator + suffix alone use up ``max_tokens`` or
        more, no document tokens are kept and the result is just separator +
        suffix (which may then exceed ``max_tokens``).
    """
    seperator_and_suffix = f"{example['truncation_seperator'].strip()}\n\n{example['input'][suffix_index:].strip()}\n"
    tokenized_seperator_and_suffix = tokenizer(seperator_and_suffix, return_tensors="pt").input_ids.to(device)
    # Clamp at zero: a negative slice end would count from the END of the
    # document and keep almost all of it instead of none of it.
    doc_budget = max(0, max_tokens - tokenized_seperator_and_suffix.shape[1])
    tokenized_input_trimmed = tokenized_input_full[:, :doc_budget]
    return torch.cat([tokenized_input_trimmed, tokenized_seperator_and_suffix], dim=1)


def process_model_input(tokenizer, example, max_tokens, device):
    """Tokenize an example, truncating the document part if it is too long.

    If the full tokenized input fits in ``max_tokens`` it is returned as is.
    Otherwise the text before ``example['query_start_index']`` is tokenized and
    cut, and the truncation separator plus the query are appended after it.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``input_ids``.
        example: Mapping with ``'input'``, ``'truncation_seperator'`` and
            ``'query_start_index'`` (character offset of the query in ``'input'``).
        max_tokens: Token budget for the returned input.
        device: Device the token id tensors are moved to.

    Returns:
        Tensor of token ids, concatenated along dim 1. When separator + query
        alone use up ``max_tokens`` or more, no document tokens are kept and the
        result is just separator + query (which may then exceed ``max_tokens``).
    """
    tokenized_input_full = tokenizer(example["input"], return_tensors="pt").input_ids.to(device)
    if tokenized_input_full.shape[1] <= max_tokens:
        return tokenized_input_full

    query_start = example['query_start_index']
    seperator_and_query_text = example['truncation_seperator'] + example["input"][query_start:]
    tokenized_seperator_and_query = tokenizer(seperator_and_query_text, return_tensors="pt").input_ids.to(device)
    input_without_query = example['input'][:query_start]
    tokenized_input_without_query = tokenizer(input_without_query, return_tensors="pt").input_ids.to(device)

    # Clamp at zero: a negative slice end would count from the END of the
    # document and keep almost all of it instead of none of it.
    doc_budget = max(0, max_tokens - tokenized_seperator_and_query.shape[1])
    tokenized_input_without_query = tokenized_input_without_query[:, :doc_budget]

    return torch.cat([tokenized_input_without_query, tokenized_seperator_and_query], dim=1)


In [None]:
# Load the tokenizer and weights of a small public checkpoint.
model_name='google/flan-t5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Token budget for this checkpoint, taken from the lookup table.
max_input_length = model_to_max_input_tokens[model_name]
# eval() returns the module itself, so it can be chained onto the load call.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32,
).eval()



In [None]:
# Re-display the validation example before it is fed to the model.
example

In [None]:
# Process an example: tokenize it (truncating if needed), generate, then decode.
model_input = process_model_input(tokenizer, example, max_input_length, device)
# do_sample=False disables sampling, so the sampling-only arguments
# (top_p, top_k, temperature) would have no effect and are not passed.
prediction_token_ids = model.generate(model_input,
                                      max_length=1024,
                                      do_sample=False)
predicted_text = tokenizer.decode(prediction_token_ids[0], skip_special_tokens=True)



In [None]:
# Shape of the tokenized input tensor that was passed to generate().
model_input.shape

In [None]:
# Shape of the token ids returned by generate().
prediction_token_ids.shape

In [None]:
# Decoded prediction (special tokens were skipped during decoding).
predicted_text

### Using GPT-style (decoder only) model
In this section we'll try to use another model based on GPT.
 

In [None]:
# TODO

### Using Hyena Checkpoint (134m) 
Now let's try to load the Hyena checkpoint and see if we can feed the example into it:

In [None]:
from src.models.sequence.long_conv_lm import ConvLMHeadModel
from transformers import GPT2Tokenizer
import yaml 

In [None]:
def load_hyena_model(model_cfg, ckpt_path):
    """Build a Hyena language model from a YAML config and load its checkpoint.

    Args:
        model_cfg: Path to a YAML file providing a ``model_config`` mapping
            (passed as keyword arguments to ``ConvLMHeadModel``) and a
            ``tokenizer_name`` entry.
        ckpt_path: Path to a checkpoint readable by ``torch.load`` that holds
            the state dict for the model.

    Returns:
        Tuple ``(model, tokenizer, config)``. ``tokenizer`` is the pretrained
        ``"gpt2"`` tokenizer when ``config['tokenizer_name'] == 'gpt2'`` and
        ``None`` otherwise; ``config`` is the parsed YAML content.
    """
    # Context manager so the config file handle is closed after parsing.
    # NOTE(review): yaml.safe_load would be the stricter choice if the config
    # only contains plain YAML types - confirm before switching.
    with open(model_cfg, 'r') as cfg_file:
        config = yaml.load(cfg_file, Loader=yaml.FullLoader)

    model = ConvLMHeadModel(**config['model_config'])
    # NOTE(review): torch.load relies on pickle; only load checkpoints from a
    # trusted source. Weights are mapped to CPU here; callers move the model.
    state_dict = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(state_dict)

    if config['tokenizer_name'] == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    else:
        # No tokenizer is available for other names; callers must handle None.
        tokenizer = None
    return model, tokenizer, config


In [None]:
# Evaluation config and checkpoint of the Hyena model.
# Both paths are relative - presumably to the repository root; confirm the working directory.
model_cfg_file = 'configs/evals/hyena_small_150b.yaml'
ckpt_path='checkpoints/hyena_small_150b_tok.ckpt'



In [None]:
# Inspect the raw evaluation config; the context manager closes the file handle.
with open(model_cfg_file, 'r') as cfg_file:
    config = yaml.load(cfg_file, Loader=yaml.FullLoader)
config

In [None]:
# Build the Hyena model and tokenizer and re-read the config.
hmodel, htokenizer,config = load_hyena_model(model_cfg_file, ckpt_path)
# NOTE: this overwrites the max_input_length that was set for the T5 model earlier.
max_input_length = config['model_config']['layer']['l_max']

# The checkpoint was loaded on CPU; move the model to the selected device.
hmodel = hmodel.to(device)
# eval() switches training-only behaviour (e.g. dropout) to inference behaviour,
# so keep it for evaluation even if this model turns out to have no such layers.
hmodel = hmodel.eval()


In [None]:
# Config as returned by load_hyena_model.
config

In [None]:
# Tokenize the same validation example with the Hyena tokenizer and length limit.
# NOTE: this reuses the name model_input from the T5 section, and htokenizer is
# None when the config's tokenizer_name is not 'gpt2'.
model_input = process_model_input(htokenizer, example, max_input_length, device)
model_input.shape

In [None]:
# Inference only: run the forward pass under no_grad so no autograd graph is
# kept, which lowers memory use.
# NOTE(review): not verified whether this resolves the kernel crash noted below.
with torch.no_grad():
    out = hmodel(model_input)
out

Note: the kernel crashed when trying to run the inference.  
To debug, check the format that is fed to the model with the Lambada dataset and compare it to what you get here.


## ZeroScrolls Long Context Team version
The data was downloaded from a link provided to us by the LC team. It is saved in the `datasets` folder.
As for the scripts for evaluation, you can use the `zero_sc_eval` branch from NeMo; the scripts to run the benchmark are in the LC team's GitLab repo.  

In the following, I'll try to run their benchmark on the public model (flan-t5-small) we used above.  

They first generate the predictions, save them to a file, and then use the script to get the scores.