<a href="https://colab.research.google.com/github/daichisaito-cs/LMSYS/blob/main/LMSYS_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# @title Config
class Config:
    """Central experiment settings, read elsewhere as ``Config.<attribute>``."""

    # --- experiment identity -------------------------------------------
    # Used as the experiment directory name and, on Colab, the submission file name.
    name = "baseline-001"
    only_inference = False

    # --- model / optimisation hyper-parameters -------------------------
    model_name = "roberta-base"
    learning_rate = 1e-5
    max_length = 256
    epochs = 8
    batch_size = 16

    # --- cross-validation ----------------------------------------------
    n_fold = 5
    # Every fold index is selected: [0, 1, ..., n_fold - 1].
    trn_fold = list(range(n_fold))
    seed = 2022
    target_col = "target"
    debug = False

    # --- Colab environment ---------------------------------------------
    upload_from_colab = True
    # JSON file holding the Kaggle API "username" and "key".
    api_path = "/content/drive/Shareddrives/inSane/Workspace/kaggle/kaggle.json"
    # Root folder under which Input/, Output/ and Submission/ are created.
    drive_path = "/content/drive/Shareddrives/inSane/Workspace/kaggle/LMSYS_Challenge"

    # --- Kaggle environment --------------------------------------------
    # When set, its "model" and "preds" sub-folders are copied into the working dir.
    kaggle_dataset_path = None

In [2]:
# @title Libraries
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras import backend as K

In [3]:
# @title Utils

class Logger:
    """Timestamped logger that writes to the console and to ``<path>/Experiment.log``.

    Reference: https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """

    def __init__(self, path):
        """Attach a console handler and a file handler to the logger named ``path``.

        ``logging.getLogger`` returns the same object for the same name, so
        re-running the cell reuses the existing logger. The handlers are built
        only when none are attached yet: constructing the ``FileHandler``
        up front would open ``Experiment.log`` again on every instantiation
        and leave that extra file handle dangling.

        Args:
            path: Directory that receives ``Experiment.log``; also used as the
                logger name.
        """
        self.general_logger = logging.getLogger(path)
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current timestamp."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def seed_everything(seed=42):
    """Seed the random number generators configured by this helper.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``,
    NumPy's global generator and TensorFlow. PyTorch is not seeded here.

    Reference: https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [4]:
# @title Colab settings
COLAB = "google.colab" in sys.modules

if COLAB:
    print("This environment is Google Colab")

    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive')

    # import library
    !pip install --quiet transformers
    !pip install --quiet iterative-stratification
    !pip install --quiet tensorflow-addons

    !pip install -q -U bitsandbytes
    !pip install -q -U transformers
    # !pip install transformers==4.40.0
    !pip install -q -U tokenizers
    !pip install -q -U peft

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f)
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

    # set dirs
    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP)
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT, "train.csv.zip")):
        # load dataset
        ! kaggle competitions download -c lmsys-chatbot-arena -p $INPUT

    # utils
    logger = Logger(OUTPUT_EXP)

else:
    print("This environment is Kaggle Kernel")

    # set dirs
    # INPUT = "../input/lmsys-chatbot-arena"
    INPUT = "../input"
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")

    # libraries
    !pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
    !pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/
    !pip install -q -U tokenizers --no-index --find-links ../input/llm-detect-pip/
    !pip install -q -U peft --no-index --find-links ../input/llm-detect-pip/

    # copy dirs
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    # utils
    logger = Logger(EXP)

This environment is Google Colab
Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hlmsys-chatbot-arena.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# @title Other settings
# utils
# Suppress every Python warning from here on (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Use seaborn's "whitegrid" theme for any plots.
sns.set(style='whitegrid')
# Seed the random number generators with the configured experiment seed.
seed_everything(seed=Config.seed)

# 2nd import
from transformers import AutoTokenizer, TFAutoModel, WarmUp
import tensorflow_addons as tfa

# libraries

In [8]:
from threading import Thread
import gc
import os
import io
import json
import random
import pickle
import zipfile
import datetime
import time

import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from IPython.display import display
import torch.nn.functional as F
import tokenizers
!pip list | grep torch

torch                            2.3.0+cu121
torchaudio                       2.3.0+cu121
torchsummary                     1.5.1
torchtext                        0.18.0
torchvision                      0.18.0+cu121


In [10]:
# Turn off the memory-efficient and flash scaled-dot-product-attention kernels.
# NOTE(review): presumably because the target GPU does not support them - confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Base Llama-3 8B checkpoint directory; its location under INPUT differs between Colab and Kaggle.
MODEL_NAME = os.path.join(INPUT, 'llama-3-8b-chat-hf') if COLAB else os.path.join(INPUT, 'llama-3/transformers/8b-chat-hf/1')
# Fine-tuned weights, loaded into the PEFT-wrapped model further down.
WEIGHTS_PATH = os.path.join(INPUT, 'lmsys-model/model')

# Earlier hard-coded variant of the two paths above, kept for reference:
# if COLAB:
#     MODEL_NAME = '/content/drive/Shareddrives/inSane/Workspace/kaggle/LMSYS_Challenge/Input/llama-3-8b-chat-hf'
#     WEIGHTS_PATH = '/content/drive/Shareddrives/inSane/Workspace/kaggle/LMSYS_Challenge/lmsys-model/model'
# else:
#     MODEL_NAME = '/kaggle/input/llama-3/transformers/8b-chat-hf/1'
#     WEIGHTS_PATH = '/kaggle/input/lmsys-model/model'
# Token length every sample is padded/truncated to by the tokenizer call.
MAX_LENGTH = 1284
# Default number of rows per forward pass in inference().
BATCH_SIZE = 8
# Device the tokenised tensors are moved to; a CUDA device is required.
DEVICE = torch.device("cuda")

# Prepare data

In [12]:
# Read the three competition tables from INPUT/lmsys-chatbot-arena.
_competition_dir = os.path.join(INPUT, "lmsys-chatbot-arena")
train, test, sample_sub = (
    pd.read_csv(os.path.join(_competition_dir, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [14]:
# concatenate strings in list
def process(input_str):
    """Flatten a string-encoded list of quoted strings into one space-joined string.

    Purely textual: the surrounding ``[``/``]`` characters are stripped, the
    remainder is split on the exact separator ``","`` and each piece has its
    leading/trailing double quotes removed. No JSON decoding takes place, so
    escape sequences and unquoted items (e.g. ``null``) are left as they are.

    Args:
        input_str: Text such as ``'["first","second"]'``.

    Returns:
        The pieces joined with single spaces.
    """
    inner = input_str.strip('[]')
    return ' '.join(piece.strip('"') for piece in inner.split('","'))

# Flatten the list-encoded text columns in place.
# Note: this overwrites the raw columns of `test`, so later cells see the flattened text.
for text_col in ('prompt', 'response_a', 'response_b'):
    test.loc[:, text_col] = test[text_col].apply(process)

display(sample_sub)
display(test.head(5))

# Prepare text for model: prompt, then response A, a separator, then response B.
test['text'] = (
    'User prompt: ' + test['prompt']
    + '\n\nModel A :\n' + test['response_a']
    + '\n\n--------\n\nModel B:\n' + test['response_b']
)
print(test['text'][0])

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...


User prompt: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

Model A :
You have two oranges today.

--------

Model B:
You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.


# Tokenize

In [15]:
# Tokenizer that ships alongside the fine-tuned weights.
tokenizer = AutoTokenizer.from_pretrained(os.path.join(INPUT, 'lmsys-model/tokenizer'))

# Every sample is padded/truncated to exactly MAX_LENGTH tokens.
tokens = tokenizer(
    test['text'].tolist(),
    padding='max_length',
    max_length=MAX_LENGTH,
    truncation=True,
    return_tensors='pt',
)

# NOTE(review): these tensors are moved to DEVICE and copied straight back to
# the host below; the device copies stay referenced by these two names.
INPUT_IDS = tokens['input_ids'].to(DEVICE, dtype=torch.int32)
ATTENTION_MASKS = tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

# Bring the tensors back to the host as nested Python lists (one list per sample).
input_ids_cpu = INPUT_IDS.cpu().tolist()
attention_masks_cpu = ATTENTION_MASKS.cpu().tolist()

# One row per test sample; each cell holds that sample's token list.
data = pd.DataFrame({
    'INPUT_IDS': input_ids_cpu,
    'ATTENTION_MASKS': attention_masks_cpu,
})
data[:2]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Unnamed: 0,INPUT_IDS,ATTENTION_MASKS
0,"[1502, 10137, 25, 358, 617, 2380, 85138, 3432,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[1502, 10137, 25, 1472, 527, 264, 69030, 304, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Load llama model

In [16]:
# BitsAndBytes configuration: load the weights in 8-bit.
# `bnb_8bit_compute_dtype` / `bnb_8bit_use_double_quant` are not BitsAndBytesConfig
# options (only the `bnb_4bit_*` variants exist); they were reported as
# "Unused kwargs" and ignored, so they are dropped here.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load base model on GPU 0
device0 = torch.device('cuda:0')

# Three output labels, matching the three submission columns
# (winner_model_a / winner_model_b / winner_tie).
base_model_0 = LlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map='cuda:0')
# Tell the model which token id is padding, taken from the tokenizer.
base_model_0.config.pad_token_id = tokenizer.pad_token_id

Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/inSane/Workspace/kaggle/LMSYS_Challenge/Input/llama-3-8b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load weights

In [17]:
# LoRA adapter configuration (inference only).
# NOTE(review): rank, alpha and target modules presumably have to match the
# adapter stored at WEIGHTS_PATH - confirm against the training notebook.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=True,
    r=16,
    lora_alpha=32,
    lora_dropout=0.10,
    bias='none',
    target_modules=['o_proj', 'v_proj'],
)

In [18]:
# Get peft: wrap the quantised base model with the LoRA layers from `peft_config`.
model_0 = get_peft_model(base_model_0, peft_config).to(device0)

# Load weights.
# NOTE(security): torch.load unpickles the file - only point WEIGHTS_PATH at a
# checkpoint you trust.
state_dict = torch.load(WEIGHTS_PATH)
# strict=False tolerates keys that are absent from the checkpoint, but it also
# silently skips checkpoint keys that match nothing in the model. Surface those,
# because a mismatched checkpoint would otherwise leave the freshly initialised
# layers untouched and the predictions meaningless.
load_result = model_0.load_state_dict(state_dict, strict=False)
if load_result.unexpected_keys:
    print(f"WARNING: {len(load_result.unexpected_keys)} checkpoint keys were not loaded, "
          f"e.g. {load_result.unexpected_keys[:5]}")
del state_dict  # drop the extra copy of the checkpoint tensors
model_0.eval()

# Trainable Parameters
model_0.print_trainable_parameters()

trainable params: 12,288 || all params: 7,511,764,992 || trainable%: 0.0002


In [19]:
gc.collect()

60

# Inference

In [20]:
def inference(df, model, device, batch_size=BATCH_SIZE):
    """Run the classifier over pre-tokenised rows and attach class probabilities.

    Args:
        df: DataFrame with ``INPUT_IDS`` and ``ATTENTION_MASKS`` columns, each
            cell holding one list of ints (all lists the same length, since
            they are stacked into a single tensor).
        model: Model called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``.logits`` with at least three classes on the
            last axis.
        device: Device each batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        The same ``df`` object, modified in place: the columns
        ``winner_model_a``, ``winner_model_b`` and ``winner_tie`` receive the
        softmax probabilities of classes 0, 1 and 2 respectively.
    """
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)

    class_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    per_class = [[] for _ in class_columns]

    model.eval()

    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)

        with torch.no_grad(), autocast():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )

        # The model in this notebook is loaded in float16 and run under
        # autocast, so the logits can arrive in half precision. Upcast before
        # the softmax so the probabilities are computed and stored in float32.
        probabilities = torch.softmax(outputs.logits.float(), dim=-1).cpu().numpy()

        for class_idx, bucket in enumerate(per_class):
            bucket.extend(probabilities[:, class_idx])

    for column, values in zip(class_columns, per_class):
        df[column] = values

    # Hand cached GPU blocks back to the driver once all batches are done.
    torch.cuda.empty_cache()

    return df

In [25]:
# Submission columns, in the order the class probabilities are written.
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

st = time.time()

# Use a single device: the GPU when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model to that device.
model_0 = model_0.to(device)

# Run inference over the whole tokenised set in one call
# (inference() also writes the probability columns into `data` in place).
results = inference(data, model_0, device)

elapsed = time.time() - st
print(f"Processing complete. Total time: {elapsed}")

sample_sub[TARGETS] = results[TARGETS]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Processing complete. Total time: 99.08912372589111


In [26]:
# Llama class probabilities as an array, columns ordered as TARGETS.
# They are read from `data`, which inference() filled in place.
llama_preds = data[TARGETS].values

# LGBM + tfidf

In [None]:
# TAG = 'lmsys-chatbot-arena'
# RUNPOD = os.path.exists('/workspace/')
# KAGGLE = not RUNPOD
# if KAGGLE:
#     print('kaggle')

In [27]:
try:
    import pandas as pd
except:
    !pip install -q kaggle
    !pip install -q pandas matplotlib scipy joblib scikit-learn lightgbm
    !pip install -q protobuf
    !pip install -q numba

In [None]:
# DATA = '/data/' if RUNPOD else 'data/' \
#         if not os.path.exists('/kaggle/') \
#             else '/kaggle/input/{}/'.format(TAG)

# if RUNPOD:
#     if not os.path.exists('~/.kaggle/kaggle.json'):
#         !mkdir -p ~/.kaggle
#         !cp /workspace/kaggle.json ~/.kaggle/kaggle.json
#         !chmod 600 /root/.kaggle/kaggle.json

#     if not os.path.exists('/workspace/' + TAG + '.zip'):
#         !kaggle competitions download $TAG -p /workspace/

#     if not os.path.exists('/data/'):
#         import zipfile
#         zipfile.ZipFile('/workspace/' + TAG + '.zip').extractall('/data/')

In [28]:
# Switches for the LightGBM stage.
# 'fold' = -1 selects the "full data" mode (no hold-out fold).
# Debug alternative, currently disabled: drop 'fold' and set 'subsample' to 30.
params = {
    'fold': -1,
    'n_epochs': 1,
    'n_lgb': 1,
    'model': 'microsoft/deberta-v3-small',
}

In [29]:
# params = {}
# Constants derived from `params`, with defaults for missing keys.
N_FOLDS = int(params.get('n_folds', 3))   # number of stratified folds
FOLD = int(params.get('fold', 0))         # index of the hold-out fold
FULL = params.get('fold', 0) < 0          # negative fold => use every row, no hold-out
SEED = int(params.get('seed', 3))         # shuffling seed for the fold split
SS = int(params.get('subsample', 1))      # keep every SS-th row when > 1

print(N_FOLDS, FOLD, SEED, SS)

3 -1 3 1


In [30]:
from sklearn.model_selection import StratifiedKFold

def get_folds(train):
    """Return the stratified ``(train_idx, test_idx)`` pair for every fold.

    The stratification label of a row is the name of whichever of the last
    three columns of ``train`` holds the row-wise maximum. The split uses
    ``N_FOLDS`` shuffled folds seeded with ``SEED``.

    Args:
        train: DataFrame whose last three columns carry the targets.

    Returns:
        A list with one ``(train_indices, test_indices)`` tuple per fold.
    """
    labels = train.iloc[:, -3:].idxmax(1)
    splitter = StratifiedKFold(N_FOLDS, random_state=SEED, shuffle=True)
    placeholder_features = np.zeros(len(train))
    return list(splitter.split(X=placeholder_features, y=labels))

# Pick the row indices for this run: everything in FULL mode, otherwise the
# requested stratified fold.
if FULL:
    train_ids, test_ids = [list(range(len(train))), []]
else:
    train_ids, test_ids = get_folds(train)[FOLD]

# Optional subsampling: keep every SS-th index of both splits.
if SS > 1:
    train_ids, test_ids = train_ids[::SS], test_ids[::SS]

print(len(train_ids), len(test_ids))
# The two splits must never share a row.
assert set(train_ids) & set(test_ids) == set()

57477 0


In [31]:
# Stage switches for the LightGBM part of the notebook.
TRAIN = False  # feeds TRAIN_LGB / INFER_LGB in a later cell
INFER = True   # gates the LightGBM prediction loop
SAVE = False   # not referenced by any cell visible here

In [32]:
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Master switch for the LightGBM model.
LGB = True
# Train LightGBM only when training is on, LGB is enabled and at least one model is requested.
TRAIN_LGB = TRAIN and LGB and params.get('n_lgb', 1) > 0
# Inference-only mode: LGB enabled while training is off.
INFER_LGB = not TRAIN and LGB

In [42]:
# Load the two fitted vectorizers used by get_features().
# Context managers make sure both files are closed after reading.
# NOTE(security): pickle.load executes code embedded in the file - only load
# artifacts from a dataset you trust.
with open(os.path.join(INPUT, 'lsys-models-4/') + 'cvec.pkl', 'rb') as fh:
    cvec = pickle.load(fh)
with open(os.path.join(INPUT, 'lsys-models-4/') + 'ccvec.pkl', 'rb') as fh:
    ccvec = pickle.load(fh)

In [43]:
def symlog(x):
    """Signed log transform ``sign(x) * log1p(|x|)``, returned as float32.

    Args:
        x: Array-like of numbers.

    Returns:
        float32 result with the same shape as ``x``.
    """
    magnitude = np.log1p(np.abs(x))
    signed = np.sign(x) * magnitude
    return signed.astype(np.float32)

def dense(x):
    """Densify a sparse matrix and apply ``symlog`` to it.

    Args:
        x: Object offering ``astype`` and ``todense`` (e.g. a SciPy sparse matrix).

    Returns:
        A float32 ``numpy.ndarray`` holding the symlog-transformed values.
    """
    as_float32 = x.astype(np.float32)
    return symlog(np.asarray(as_float32.todense()))

def get_features(df):
    """Build the dense feature matrix fed to the LightGBM models.

    The result is the horizontal concatenation of:

    1. ``A - B`` and ``|A - B|``, where ``A`` / ``B`` are the symlog-scaled
       outputs of ``cvec`` and ``ccvec`` for ``response_a`` / ``response_b``.
    2. Fifteen pattern counts (five patterns times ``prompt``, ``response_a``
       and ``response_b``) plus the character length and the whitespace token
       count of ``response_b``; every one of these appears twice, once as a
       square root and once as ``log1p``.

    The feature layout is kept exactly as the pickled models were used with
    it in this notebook; do not add, drop or reorder columns without
    retraining.

    Args:
        df: DataFrame with string columns ``prompt``, ``response_a`` and
            ``response_b``.

    Returns:
        2-D ``numpy.ndarray`` with one row per row of ``df``.
    """
    vectorizers = [cvec, ccvec]
    # The prompt is not vectorised: the earlier `pfeat` matrix was computed
    # but never used, costing two transforms per batch.
    afeat = np.hstack([dense(vec.transform(df['response_a'])) for vec in vectorizers])
    bfeat = np.hstack([dense(vec.transform(df['response_b'])) for vec in vectorizers])

    diff = afeat - bfeat
    v = np.hstack([diff, np.abs(diff)])

    # Best-effort scaling: when the batch is smaller than `train`, divide by
    # the number of entries in `all_vote_models`. That name is not defined in
    # the cells shown here, in which case the features stay unscaled.
    try:
        v = v / (len(all_vote_models) if len(df) < len(train) else 1)
    except NameError:
        pass

    # Pattern counts. Series.str.count treats the pattern as a regular
    # expression, so '.' counts every character except newlines.
    extras = [
        df[col].str.count(pattern).values
        for pattern in ['\n', '\n\n', '.', ' ', '","']
        for col in ['prompt', 'response_a', 'response_b']
    ]

    # Length features exist for response_b only: the original code ran these
    # two lines after the loops, using the loop variable's final value.
    extras.append(df['response_b'].str.len())
    extras.append(df['response_b'].str.split().apply(len))

    extras = np.stack(extras, axis=1)
    extras = np.hstack([extras ** 0.5, np.log1p(extras)])
    return np.hstack([v, extras])

In [44]:
# Load the pickled list of LightGBM models; the context manager closes the file.
# NOTE(security): pickle.load executes code embedded in the file - only load
# artifacts from a dataset you trust.
with open(os.path.join(INPUT, 'lsys-models-4/') + 'lgb_models.pkl', 'rb') as fh:
    lgb_models = pickle.load(fh)

In [45]:
# LightGBM inference: featurise `test` in chunks of `b` rows and average the
# class probabilities of all models for each chunk.
# NOTE(review): the text columns of `test` were already flattened by process()
# in an earlier cell - confirm the models were trained on text prepared the
# same way (get_features still counts the '","' separator).
if INFER and params.get('n_lgb', 1) > 0:
    df = test
    b = 1000
    yps = []
    for i in range(0, len(df), b):
        arr = get_features(df.iloc[i: i + b])
        ypms = [model.predict_proba(arr) for model in lgb_models]
        yps.append(np.stack(ypms).mean(0))
        print('.', end = '')

        # Reclaim memory after every second chunk.
        if len(yps) % 2 == 0:
            gc.collect()
    print()

    yp = np.concatenate(yps)

.


In [46]:
# LightGBM class probabilities (model-averaged), one row per row of `test`.
lgb_preds = yp

# Blend predictions

In [47]:
# Weight given to the LightGBM predictions; the Llama predictions get the remainder.
lgb_wt = 0.3
# Element-wise weighted average of the two probability matrices.
preds = lgb_wt * lgb_preds + (1 - lgb_wt) * llama_preds

In [54]:
import numpy as np

# Convert the blended probabilities into one-hot rows (1 at each row's argmax).
winning_class = preds.argmax(1)
one_hot_preds = (np.arange(preds.shape[1]) == winning_class[:, None]).astype(preds.dtype)

# %%
# Example: wrap one_hot_preds in a DataFrame and display it.
out_one_hot = pd.DataFrame(one_hot_preds, index=df.id, columns=train.columns[-3:])
display(out_one_hot.head())

# %%
# Save this DataFrame to a CSV file.
# out_one_hot.to_csv('submission.csv')

Unnamed: 0_level_0,winner_model_a,winner_model_b,winner_tie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
136060,0.0,1.0,0.0
211333,1.0,0.0,0.0
1233961,0.0,1.0,0.0


In [49]:
# Blended probabilities as a DataFrame: indexed by the ids in `df`, with the
# last three column names of `train` as headers.
out = pd.DataFrame(preds, index=df.id, columns=train.columns[-3:])
display(out.head())

Unnamed: 0_level_0,winner_model_a,winner_model_b,winner_tie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
136060,0.164693,0.551753,0.283749
211333,0.427557,0.296565,0.276195
1233961,0.273949,0.439712,0.286534


In [52]:
# make submission
print("# ---------- # Make Submission # ---------- #")
# On Colab the file is named after the experiment; otherwise it is "submission.csv".
filename = (Config.name + ".csv") if COLAB else "submission.csv"

# `out` keeps the sample ids in its index, so the index has to be written.
# With index=False the file would hold only the three probability columns and
# no `id` column, unlike sample_submission.csv.
out.to_csv(os.path.join(SUBMISSION, filename), index=True, index_label="id")

# ---------- # Make Submission # ---------- #


In [53]:
Config.name

'baseline-001'