In [1]:
# !pip install transformers
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# !pip install sentence-transformers
# !pip install datasets

In [14]:
import numpy as np
import pandas as pd
import ast
import warnings
import scipy
import matplotlib.pyplot as plt
import spacy
import difflib
import tqdm
import json
import pickle
import logging
import itertools

logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)

from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [6]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
model = AutoModelForCausalLM.from_pretrained("google-bert/bert-large-uncased")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [4]:
dfn = pd.read_csv("lmsys-chatbot-arena/aux_files/dfn.csv")
unique_ids = dfn.id.unique()
tagger = spacy.load("en_core_web_lg")
dfn.head(3)

Unnamed: 0,id,prompt,model_a_answer,model_b_answer,winner
0,30192,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",A
1,53567,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,B
2,53567,How can I get both of them as quick as possibl...,If you want to get both a marriage license and...,"In California, here are the general steps to o...",B


In [None]:
def feature_extract(txt , limit , tokenizer , tagger , target_tags = ["NOUN" , "VERB", "PROPN"]):
    reconstructed_str = ""
    doc = tagger(txt)
    txt_len = len(tokenizer(txt).encodings[0].tokens)

    if txt_len > limit:
        for token in doc:
            if token.pos_ in target_tags:
                if len(reconstructed_str) < 1:
                    reconstructed_str = token.text
                else:
                    reconstructed_str += " " + token.text
            if (token.pos_ == "SYM"):
                if len(reconstructed_str) < 1:
                    reconstructed_str = token.text
                else:
                    reconstructed_str += token.text
        return reconstructed_str
    else:
        return reconstructed_str

In [13]:
id_list = []
id_strings = []

for _id in tqdm.tqdm(unique_ids):
    id_df = dfn[dfn.id == _id]
    id_list.append(_id)
    id_str = ""
    
    for row in id_df.iterrows():
        prompt = str(row[1]['prompt'])
        
        model_a_answer = str(row[1]['model_a_answer'])
        model_b_answer = str(row[1]['model_b_answer'])
        winner = row[1]['winner']

        if winner == "A":
            id_str += prompt + "\n" + model_a_answer +'\n'
        if winner == "B":
            id_str += prompt + "\n" + model_b_answer +'\n'
            
    id_strings.append(id_str)

100%|███████████████████████████████████| 56759/56759 [00:06<00:00, 8864.94it/s]


In [53]:
tokenized_id_strs = []
for e in tqdm.tqdm(id_strings):
    tokenized_id_strs.append(tokenizer([e]))

100%|███████████████████████████████████| 56759/56759 [00:24<00:00, 2274.43it/s]


In [58]:
block_size = 256
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
grouped_texts = []
group_texts(tokenized_id_strs[0])