# Load Kaggle data

In [None]:
# JAX is multithreaded by default, and os.fork() is not compatible with
# multithreaded code (it can deadlock), so child processes are started with
# "spawn" instead of "fork".

import multiprocessing as mp

# force=True keeps this cell re-runnable: without it a second call raises
# RuntimeError("context has already been set").
mp.set_start_method('spawn', force=True)

In [None]:
! pwd

In [None]:
import pandas as pd
import numpy as np
# Load the competition training CSV; the trailing expression previews its
# first rows in the notebook output.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.head()

# Preprocessing for Classifier

In [None]:
# Reload the training CSV (the same file the previous cell already read) and
# preview its first rows.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.head()

In [None]:
# Keep only the three text columns and the three winner flag columns.
# The explicit copy makes clf_train an independent frame, so the in-place
# column assignments in the following cells do not raise pandas'
# SettingWithCopyWarning.
clf_train = train[['prompt','response_a','response_b','winner_model_a','winner_model_b','winner_tie']].copy()

In [None]:
# `json` is otherwise only imported further down in this notebook; importing it
# here lets the cells run top to bottom without a NameError.
import json

# Every cell is JSON-decoded and only element [0] of the decoded value is kept.
for _col in ("prompt", "response_a", "response_b"):
    clf_train.loc[:, _col] = clf_train[_col].apply(lambda x: json.loads(x)[0])

In [None]:
# Discard every row that has a missing value, then renumber the index from 0.
clf_train = clf_train.dropna().reset_index(drop=True)

In [None]:
# clf_train['new_text'] = [ "### prompt: "+clf_train['prompt'][x]+" ### response_a: "+clf_train['response_a'][x]+" ### response_b: "+clf_train['response_b'][x] for x in range(len(clf_train)) ]

In [None]:
# Collect the three winner flags of every row into one [a, b, tie] list.
# Converting the selected columns in a single step replaces a Python loop that
# did three label-based Series lookups per row, and it no longer depends on the
# index being exactly 0..n-1.
clf_train['target'] = clf_train[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy().tolist()

In [None]:
# Narrow the frame to the text columns plus the combined target.
# .copy() detaches the result from the frame it was selected from, so adding
# columns to it later does not raise SettingWithCopyWarning.
clf_train = clf_train[['prompt','response_a','response_b','target']].copy()

In [None]:
clf_train.head()

In [None]:
def cl(x):
    """Map a one-hot winner list to a class id.

    Returns 0 for [1, 0, 0], 1 for [0, 1, 0] and 2 for any other value.
    """
    for class_id, one_hot in enumerate(([1, 0, 0], [0, 1, 0])):
        if x == one_hot:
            return class_id
    return 2

# Turn every [a, b, tie] target list into its integer class id via cl().
clf_train['labels'] = clf_train['target'].apply(cl)

In [None]:
# Store len() of each text field in its own column (p_len / a_len / b_len).
for _len_col, _text_col in (('p_len', 'prompt'), ('a_len', 'response_a'), ('b_len', 'response_b')):
    clf_train[_len_col] = clf_train[_text_col].apply(len)

In [None]:
# Combined length of prompt and both responses for every row.
clf_train['len'] = clf_train['p_len'].add(clf_train['a_len']).add(clf_train['b_len'])

In [None]:
# Draw int(len * n_sample) rows, weighting each row by its combined text
# length ("len") so longer examples are more likely to be picked, then
# renumber the index from 0.
_sample_size = int(len(clf_train) * n_sample)
sample_df = clf_train.sample(n=_sample_size, weights="len", random_state=seed)
sample_df = sample_df.reset_index(drop=True)

In [None]:
sample_df

In [None]:
# 80/20 train/validation split, stratified on the class labels.
t_dat, v_dat = train_test_split(
    sample_df,
    test_size=0.2,
    random_state=42,
    stratify=sample_df['labels'],
)

# Renumber both parts from 0.
t_dat, v_dat = t_dat.reset_index(drop=True), v_dat.reset_index(drop=True)

In [None]:
# Remove the list-valued 'target' column from both splits.
t_dat = t_dat.drop(columns='target')
v_dat = v_dat.drop(columns='target')

In [None]:
t_dat.head()

In [None]:
np.unique(t_dat['labels'])

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Tokenized (prompt, response_a, response_b) triples.

    Every item is the concatenation of three separately tokenized segments.
    Each segment is prefixed with its own "##...: " tag and truncated to at
    most ``max_len`` tokens, so one item holds up to ``3 * max_len`` tokens.

    Args:
        df: DataFrame with 'prompt', 'response_a' and 'response_b' columns
            and, optionally, a 'labels' column.
        tokenizer: Callable tokenizer. It is called with the keyword
            arguments shown in ``_encode`` and must return a mapping whose
            'input_ids' and 'attention_mask' are tensors of shape
            (1, seq_len).
        max_len: Maximum number of tokens kept per segment.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.prompt = df['prompt']
        self.response_a = df['response_a']
        self.response_b = df['response_b']
        self.max_len = max_len
        # None when the frame has no 'labels' column (e.g. the test set).
        self.targets = df.get('labels', None)

    def __len__(self):
        return len(self.prompt)

    def _encode(self, text):
        """Tokenize one tagged segment, truncated to ``max_len``, unpadded.

        A single truncating call yields min(max_len, full_length) tokens,
        which is what the previous measure-then-tokenize pair of calls
        produced, at half the tokenization cost.
        """
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return 1-D 'input_ids' / 'attention_mask' (and 'labels' if known).

        ``index`` is a position in the frame, as a DataLoader sampler
        supplies it.
        """
        # .iloc is positional, so the frame does not need a 0..n-1 index.
        segments = [
            self._encode("##prompt: " + str(self.prompt.iloc[index])),
            self._encode("##response_a: " + str(self.response_a.iloc[index])),
            self._encode("##response_b: " + str(self.response_b.iloc[index])),
        ]

        # Each segment is (1, seq_len); join along the token axis and flatten.
        item = {
            'input_ids': torch.cat([seg['input_ids'] for seg in segments], dim=1).flatten(),
            'attention_mask': torch.cat([seg['attention_mask'] for seg in segments], dim=1).flatten(),
        }

        if self.targets is not None:
            item['labels'] = torch.tensor([int(self.targets.iloc[index])], dtype=torch.long)
        return item

In [None]:
def custom_collate_fn(batch, tokenizer):
    """Collate variable-length items into one right-padded batch.

    Args:
        batch: Non-empty list of dicts, each holding 1-D 'input_ids' and
            'attention_mask' tensors and optionally a 'labels' tensor.
        tokenizer: Object whose ``pad_token_id`` fills the padded positions
            of 'input_ids'.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded to the longest
        sequence in the batch (batch dimension first). When the first item
        carries 'labels', the per-item label tensors are concatenated along
        dim 0 under 'labels'.
    """
    # pad_sequence already pads to the longest sequence in the batch, so no
    # per-item slicing to that length is needed beforehand.
    output = {
        'input_ids': pad_sequence(
            [item['input_ids'] for item in batch],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        ),
        'attention_mask': pad_sequence(
            [item['attention_mask'] for item in batch],
            batch_first=True,
            padding_value=0,
        ),
    }

    if 'labels' in batch[0]:
        output['labels'] = torch.cat([item['labels'] for item in batch], dim=0)

    return output

In [None]:
def create_dataloaders(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap ``df`` in a CustomDataset and return a DataLoader over it.

    Batches are assembled by ``custom_collate_fn`` using the same tokenizer.
    """
    def _collate(items):
        return custom_collate_fn(items, tokenizer)

    dataset = CustomDataset(df, tokenizer, max_len)
    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=_collate,
    )

In [None]:
# tokenizer.decode([2])

# Test Inference

In [None]:
# model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16)
# model = quantize_model(model)
# for idx, layer in enumerate(model.layers):
#     replace_attention_module(model.config,layer,idx)
# model = LoraModelForClassification(model)
# model.load_state_dict(torch.load(model_path))
# device = "cuda:0"
# model.to(device)
# model.eval()

In [None]:
# Load the competition test CSV; the trailing expression shows its row count
# in the notebook output.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
len(test)

In [None]:
import json

# JSON-decode every prompt/response cell and keep only element [0] of the
# decoded value.
for _col in ("prompt", "response_a", "response_b"):
    test[_col] = test[_col].apply(lambda raw: json.loads(raw)[0])

In [None]:
# test['new_text'] = [ "### prompt: "+test['prompt'][x]+" ### response_a: "+test['response_a'][x]+" ### response_b: "+test['response_b'][x] for x in range(len(test)) ]

In [None]:
test.head()

In [None]:
# Split the test rows by position into two halves, each renumbered from 0.
# With an odd row count the second half gets the extra row.
_half = len(test) // 2
test_0 = test.iloc[:_half].reset_index(drop=True)
test_1 = test.iloc[_half:].reset_index(drop=True)

In [None]:
from torch.cuda.amp import autocast

def infer(model, dataloader, device):
    """Run ``model`` over every batch and collect the class probabilities.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=...)`` and
            expected to return a 2-tuple whose second element is the logits.
            It is switched to eval mode but not moved to ``device`` here.
        dataloader: Iterable of dicts with "input_ids" and "attention_mask"
            tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A list with one tensor per batch: the softmax of the logits over
        dim 1.
    """
    model.eval()

    batch_probs = []
    for batch in dataloader:
        # Gradients off and autocast on for the whole forward pass.
        with torch.no_grad(), autocast():
            _unused, logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            batch_probs.append(torch.nn.functional.softmax(logits, dim=1))

    return batch_probs

In [None]:
from threading import Thread

gpu0 = "cuda:0"
gpu1 = "cuda:1"


def _load_classifier(device):
    """Build the classifier, load its trained weights and move it to ``device``.

    The steps run in the same order for every replica: load the base model in
    float16, quantize it, replace the attention module of each layer, wrap it
    in LoraModelForClassification, then load the checkpoint at ``model_path``.
    """
    base = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16)
    base = quantize_model(base)
    for idx, layer in enumerate(base.layers):
        replace_attention_module(base.config, layer, idx)
    classifier = LoraModelForClassification(base)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    classifier.load_state_dict(torch.load(model_path))
    return classifier.to(device)


def _load_tokenizer():
    """Load the tokenizer for ``model_name`` with a pad token and right padding."""
    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"  # Fix weird overflow issue with fp16 training
    return tok


# One independent model replica per GPU.
model0 = _load_classifier(gpu0)
model1 = _load_classifier(gpu1)

# A separate tokenizer instance per dataloader, as before -- presumably so the
# two worker threads never share one tokenizer; confirm before merging them.
tokenizer0 = _load_tokenizer()
tokenizer1 = _load_tokenizer()

# shuffle=False keeps the predictions in the row order of each test half.
test_dataloader0 = create_dataloaders(test_0, tokenizer0, test_max_len, test_batch_size, shuffle=False)
test_dataloader1 = create_dataloaders(test_1, tokenizer1, test_max_len, test_batch_size, shuffle=False)

def run_inference(model, dataloader, device, results, index):
    """Thread entry point: store the output of infer() under results[index]."""
    shard_output = infer(model, dataloader, device)
    results[index] = shard_output

# Filled by the worker threads: key 0 holds the outputs for the first test
# half, key 1 those for the second.
results = {}

process0 = Thread(target=run_inference, args=(model0, test_dataloader0, gpu0, results, 0))
process1 = Thread(target=run_inference, args=(model1, test_dataloader1, gpu1, results, 1))

# Start both workers before waiting on either so the two GPUs run concurrently.
for _worker in (process0, process1):
    _worker.start()

# Wait for both workers to finish.
for _worker in (process0, process1):
    _worker.join()

# An exception raised inside a Thread target is only reported by the threading
# module, not re-raised here. Fail loudly instead of continuing with one half
# of the predictions missing.
_missing = [idx for idx in (0, 1) if idx not in results]
if _missing:
    raise RuntimeError(f"Inference worker(s) {_missing} did not produce results")

In [None]:
# target_list = []

# for data in valid_dataloader:
#   with torch.no_grad():
#     input_ids = data['input_ids'].to(device = device, dtype = torch.long)
#     attention_mask = data['attention_mask'].to(device = device, dtype = torch.long)
#     _, logits = model(input_ids, attention_mask)

# target_list

In [None]:
device = 'cuda:0'  # device on which all per-batch outputs are gathered

# Move every per-batch output onto the gathering device.
# The keys are visited in sorted order: the dict is filled by two threads, so
# its insertion order is whichever worker finished first, not the order of the
# test halves.
for k in sorted(results):
    results[k] = [batch_out.to(device) for batch_out in results[k]]

# Merge everything into one tensor, first half before second half. Halves that
# produced no batches are skipped because torch.cat rejects an empty list.
target_list = torch.cat(
    [torch.cat(results[k], dim=0) for k in sorted(results) if results[k]],
    dim=0,
)

In [None]:
# Load the sample submission; its 'id' column is reused for the final output.
sub = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')
# sub

In [None]:
# Build the probability table directly from the 2-D prediction tensor (one row
# per test example, one column per class) instead of creating and
# concatenating a one-row DataFrame per example.
combined_df = pd.DataFrame(
    target_list.detach().cpu().numpy(),
    columns=['winner_model_a', 'winner_model_b', 'winner_tie'],
)

# Give sub the same 0..n-1 index as the predictions so the two frames line up
# by position; set_index raises if the row counts differ.
sub = sub.set_index(pd.Index(combined_df.index))

final_df = pd.concat([sub[['id']], combined_df], axis=1)
# final_df

In [None]:
def delete_files_and_folders(path):
    """Delete every file below ``path`` except files named submission.csv.

    Directories themselves are left in place. Each file is reported on stdout
    as either skipped or deleted. If ``path`` does not exist, an error line is
    printed and nothing else happens.
    """
    # Nothing to do when the directory is missing.
    if not os.path.exists(path):
        print(f"Error: {path} does not exist.")
        return

    # Walk bottom-up through the tree, handling the files of each directory.
    for current_dir, _subdirs, filenames in os.walk(path, topdown=False):
        for filename in filenames:
            target = os.path.join(current_dir, filename)
            if filename == "submission.csv":
                print(f"Skipping file: {target}")
            else:
                print(f"Deleting file: {target}")
                os.remove(target)

    print(f"All files and folders in {path} have been deleted.")

# Example path: the directory to clean up
path_to_delete = "/kaggle/working/"

# Call the file/folder deletion function on that directory
delete_files_and_folders(path_to_delete)

In [None]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame index column.
final_df.to_csv('submission.csv', index=False)

In [None]:
# Free GPU memory
def clear_gpu_memory():
    """Run the garbage collector, then release PyTorch's cached CUDA memory.

    The collector runs first so that tensors which are only kept alive by
    reference cycles are destroyed before the cache is emptied; in the
    opposite order their memory would stay in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

# Free GPU memory once the run is finished
clear_gpu_memory()