In [7]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.cross_encoder import CrossEncoder

In [2]:
model = SentenceTransformer('EleutherAI/pythia-70m')

Downloading (…)055d2/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)3691f055d2/README.md:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading (…)91f055d2/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)055d2/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

No sentence-transformers model found with name /Users/g.salazar.2/.cache/torch/sentence_transformers/EleutherAI_pythia-70m. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/g.salazar.2/.cache/torch/sentence_transformers/EleutherAI_pythia-70m were not used when initializing GPTNeoXModel: ['embed_out.weight']
- This IS expected if you are initializing GPTNeoXModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0

In [6]:
from torch.utils.data import Dataset, IterableDataset

class PairDataset:
    def __init__(self, filepath):
        self.filepath = filepath
        self.examples = []

    def __iter__(self):
        print("open", self.filepath)
        with gzip.open(self.filepath, 'rt') as fIn:
            for line in fIn:
                example = self.get_example(json.loads(line))
                if example is not None:
                    self.examples.append(example)
                    yield example

        while True:
            random.shuffle(self.examples)
            for ex in self.examples:
                yield ex
                
    def get_example(self, raw_example):
        if isinstance(raw_example, dict):
            return InputExample(texts=[raw_example['query'], random.choice(raw_example['pos'])], label=1)
        else:
            return InputExample(texts=[raw_example[0], raw_example[1]], label=1)


class RedditTitleDataset(PairDataset):
    def get_example(self, raw_example):
        return [self.clean_title(raw_example['title']), raw_example['body']]


    def clean_title(self, text):
        text = text.replace("&amp;", "&").strip()
        if text.startswith("["):
            text = re.sub("^\[[a-zA-Z0-9]+\]", "", text).strip()

        if text.endswith("]"):
            text = re.sub("\[[a-zA-Z0-9\.]+\]$", "", text).strip()

        if text.startswith("/r"):
            text = re.sub("^/[a-zA-Z0-9/]+[;,: \-]+", "", text).strip()

        return text


class StackExchangeTitleBodyDataset(PairDataset):
    def get_example(self, raw_example):
        return raw_example['texts']


class MultiDataset(IterableDataset):
    def __init__(self, filepaths, num_samples):
        self.num_samples = num_samples
        self.datasets = []
        self.data_iterators = []

        for filepath in filepaths:
            if 'reddit_title_text' in filepath:
                dataset = RedditTitleDataset(filepath)
            elif 'stackexchange_archive/jsonl' in filepath:
                dataset = StackExchangeTitleBodyDataset(filepath)
            else:
                dataset = PairDataset(filepath)
            self.datasets.append(dataset)
            self.data_iterators.append(iter(dataset))

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        while True:
            for dataset in self.data_iterators:
                yield next(dataset)

            random.shuffle(self.data_iterators)

    def delete_examples_cache(self):
        for dataset in self.datasets:
            dataset.examples = []

In [65]:
long(random.randint(0, 3))

NameError: name 'long' is not defined

In [40]:
import gzip
import json
import random


def get_example(raw_example):
    if isinstance(raw_example, dict):
        return InputExample(texts=[raw_example['query'], random.choice(raw_example['pos'])], label=random.randint(0, 3))
    else:
        return InputExample(texts=[raw_example[0], raw_example[1]], label=random.randint(0, 3))
        
def load_pair_dataset(filepath):
    examples=[]
    with gzip.open(filepath, 'rt') as fIn:
            for line in fIn:
                example = get_example(json.loads(line))
                examples.append(example)
    return examples

full_set = load_pair_dataset("/Users/g.salazar.2/git/trec_dh/gustavo/gooaq_pairs.jsonl.gz")

In [42]:
len(full_set)

3012496

In [41]:
from sklearn.model_selection import train_test_split
(train_set, test_set) = train_test_split(full_set, test_size=0.33, random_state=42, shuffle=False)

In [43]:
len(train_set), len(test_set)

(2018372, 994124)

In [74]:
import logging
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
import math
from sentence_transformers import LoggingHandler, util
import torch.nn
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

train_batch_size = 4
num_epochs = 1
num_labels = 4
max_length = 512
evaluation_steps = 100
lr = 7e-6

train_dataloader = DataLoader(train_set, shuffle=True, batch_size=train_batch_size)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

default_activation_function = torch.nn.Identity()

model = CrossEncoder('EleutherAI/pythia-70m', num_labels=num_labels, 
                     tokenizer_args={'pad_token': '[PAD]'}, 
                     default_activation_function=default_activation_function)
# evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_set, name='trec-ev')
evaluator = CERerankingEvaluator(test_set, name='train-eval')

# Configure the training

logger.info("Warmup-steps: {}".format(warmup_steps))
loss_fct=torch.nn.L1Loss()

model.config.pad_token_id = model.tokenizer.pad_token_id
# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup_steps,
          optimizer_params={'lr': lr},
          output_path="model_saved")



Some weights of the model checkpoint at EleutherAI/pythia-70m were not used when initializing GPTNeoXForSequenceClassification: ['embed_out.weight']
- This IS expected if you are initializing GPTNeoXForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated w

2023-06-04 03:04:36 - Use pytorch device: cpu
2023-06-04 03:04:36 - Warmup-steps: 50460


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/504593 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


2023-06-04 03:05:45 - CERerankingEvaluator: Evaluating the model on train-eval dataset in epoch 0 after 100 steps:


TypeError: 'InputExample' object is not subscriptable