In [None]:
!pip install peft
!pip install transformers
!pip install datasets

In [2]:
import torch
from peft import PeftModel , LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
from datasets import load_dataset
import numpy as np
import pandas as pd
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json

#custome loss
from transformers import Trainer, TrainingArguments

# **Random seed 고정**

In [3]:
import random

seed = 40
deterministic = True

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if deterministic:
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
warnings.filterwarnings('ignore')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **dataset**

In [5]:
# triplet preference data
triplet_ds = load_dataset("haoranxu/ALMA-R-Preference", "cs-en")

In [6]:
triplet_ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 2009
    })
})

policy(y_w|x) - policy(y_l|x)

In [7]:
triplet_ds['train'][0]

{'translation': {'Delta': 0.0,
  'alma_cs': 'V pondělí oznámili vědci z Lékařské fakulty Stanfordovy univerzity vynález nového diagnostického nástroje, který dokáže buňky třídit podle typu: malý tisknutelný čip, který lze vyrábět pomocí standardních inkoustových tiskáren za zhruba jeden cent.',
  'alma_cs_kiwi': 0.9664803147315979,
  'alma_cs_kiwi_xcomet': 0.9676983654499054,
  'alma_cs_xcomet': 0.9689164161682129,
  'alma_en': "On Monday, scientists at Stanford University's School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a small printable chip that can be made using standard inkjet printers for about one cent apiece.",
  'alma_en_kiwi': 0.8873786926269531,
  'alma_en_kiwi_xcomet': 0.9009847342967987,
  'alma_en_xcomet': 0.9145907759666443,
  'cs': 'V pondělí vědci z Lékařské fakulty Stanfordovy univerzity oznámili vynález nového diagnostického nástroje, který dokáže třídit buňky podle typu: malý vytisknutelný čip, který lze vyrobit pomo

data **preprocess**

In [8]:
# Extract models and metrics
models = ['alma', 'gpt4']
#metrics = ['kiwi', 'kiwi_xcomet', 'xcomet']
metrics=['xcomet']

# Data preparation for DataFrame
df_data = {'metric': [], 'superior_model': [], 'inferior_model': [], 'en_input': [], 'sup_cs_label': [] , 'inf_cs_label' : []}

# Iterate through each metric to determine superior and inferior models
for input_data in triplet_ds['train']:
  input_data = input_data['translation']
  for metric in metrics:
    scores = {model: input_data[f"{model}_cs_{metric}"] for model in models}
    superior_model = max(scores, key=scores.get)
    inferior_model = min(scores, key=scores.get)
    df_data['metric'].append(metric)
    df_data['superior_model'].append(superior_model)
    df_data['inferior_model'].append(inferior_model)
    df_data['en_input'].append(input_data[f"en"])
    df_data['sup_cs_label'].append(input_data[f"{superior_model}_cs"])
    df_data['inf_cs_label'].append(input_data[f"{inferior_model}_cs"])

# Create DataFrame
df = pd.DataFrame(df_data)

# Display DataFrame
df

Unnamed: 0,metric,superior_model,inferior_model,en_input,sup_cs_label,inf_cs_label
0,xcomet,alma,gpt4,"On Monday, scientists from the Stanford Univer...",V pondělí oznámili vědci z Lékařské fakulty St...,V pondělí oznámili vědci ze Stanfordovy univer...
1,xcomet,gpt4,alma,Lead researchers say this may bring early dete...,"Hlavní výzkumníci uvádějí, že to může přinést ...","Hlavní výzkumníci tvrdí, že to může přinést ra..."
2,xcomet,gpt4,alma,The JAS 39C Gripen crashed onto a runway at ar...,JAS 39C Gripen havaroval na ranvej přibližně v...,JAS 39C Gripen se zřítil na letištní plochu v ...
3,xcomet,gpt4,alma,The pilot was identified as Squadron Leader Di...,Pilot byl identifikován jako velitel letky Dil...,Pilot byl identifikován jako Squadron Leader D...
4,xcomet,gpt4,alma,Local media reports an airport fire vehicle ro...,"Místní média hlásí, že vozidlo požární ochrany...","Místní média uvádějí, že hasičské vozidlo na l..."
...,...,...,...,...,...,...
2004,xcomet,alma,alma,"As the areas are sparsely populated, and light...","Oblasti jsou málo obydlené, takže světelné zne...","Oblasti jsou málo obydlené, takže světelné zne..."
2005,xcomet,alma,alma,Japanese work culture is more hierarchical and...,Japonská pracovní kultura je více hierarchická...,Japonská pracovní kultura je více hierarchická...
2006,xcomet,alma,gpt4,"Suits are standard business attire, and cowork...",Společenské oblečení je běžnou součástí pracov...,Obleky jsou standardní obchodní oděv a kolegov...
2007,xcomet,gpt4,alma,"Workplace harmony is crucial, emphasizing grou...","Pracovní harmonie je klíčová, když se klade dů...","Důležité je pracovní souznění, které zdůrazňuj..."


google drive mount

In [9]:
from google.colab import drive

# Google Drive를 마운트
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


drive에 데이터 저장

In [10]:
# Drive에 저장할 경로
train_save_path_triplet = '/content/drive/My Drive/df.csv'

# CSV 파일로 저장
df.to_csv(train_save_path_triplet)

In [11]:
df_train_triplet = pd.read_csv(train_save_path_triplet)

#df_train = pd.concat([df_train_hf, df_train_triplet], ignore_index=False)
df_train = pd.DataFrame(df_train_triplet)

In [12]:
for _, row in df_train.iterrows():
  print(row)
  break

Unnamed: 0                                                        0
metric                                                       xcomet
superior_model                                                 alma
inferior_model                                                 gpt4
en_input          On Monday, scientists from the Stanford Univer...
sup_cs_label      V pondělí oznámili vědci z Lékařské fakulty St...
inf_cs_label      V pondělí oznámili vědci ze Stanfordovy univer...
Name: 0, dtype: object


In [13]:
train_data = [
    {
        "User": f"Translate this from en to cs: en : {row['en_input']}",
        "sup_label": f"{row['sup_cs_label']}",
        "inf_label": f"{row['inf_cs_label']}"
    }
    for _, row in df_train.iterrows()
]

# **model lora**

rank is 16 and added paramters are only 12M parameters

In [14]:
# LoRA 설정
lora_config = LoraConfig(
    r=16,  # Low-rank 업데이트 행렬 차원
    lora_alpha=16,  # 스케일링 팩터
    lora_dropout=0.1,  # 드롭아웃 비율
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],  # QLoRA가 적용될 대상 모듈
)

# **model def**

In [15]:
# Load base model and LoRA weights
model_name = "haoranxu/ALMA-7B-Pretrain"

base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto",             # GPU와 CPU를 자동 분배
                                             torch_dtype="auto",            # 자동으로 적절한 데이터 타입(FP32, FP16 등) 선택
                                             offload_folder="./offload",    # 메모리가 부족할 경우 CPU로 데이터를 오프로드
                                             offload_state_dict=True)        # 가중치도 필요 시 CPU로 오프로드)

#기존 model freeze
for param in base_model.parameters():
    param.requires_grad = False

model = get_peft_model(base_model, lora_config)
tokenizer = LlamaTokenizer.from_pretrained("haoranxu/ALMA-7B-Pretrain", padding_side='left')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

print really 12M parameteres are added

In [16]:
...

Ellipsis

# **token화**

In [None]:
max_label_length = 80

def replace_padding_with_ignore(labels, padding_value=128001, ignore_value=-100):
    """
    라벨에서 지정된 패딩 값을 -100으로 변환합니다.
    """
    return [ignore_value if token == padding_value else token for token in labels]

# input_ids, attention_mask, labels 생성
tokenized_data = [
    {
        **tokenizer(
            f"{item['User']} + {item['User']}",
            padding='max_length',
            truncation=True,
            max_length=max_label_length
        ),
        'labels':
            tokenizer(
                item['sup_label'] + item['inf_label'],
                padding='max_length',
                truncation=True,
                max_length=max_label_length
            )['input_ids']
            }
    for item in train_data
]

# 결과 확인
print(tokenized_data[:1])  # 앞의 두 개만 출력

# **model training**

custome loss class

In [18]:
class CustomTrainer(Trainer):
  def compute_loss(self, model, inputs , num_items_in_batch=None):
    sigmoid = nn.Sigmoid()
    softmax = nn.Softmax()
    kl_divergence = nn.KLDivLoss(reduction='batchmean')
    """
    Custom loss function implementation.
    Args:
        model: The model being trained.
        inputs: The inputs to the model (e.g., input_ids, attention_mask, labels).
        return_outputs: If True, also return model outputs.
    Returns:
        The computed loss (and optionally model outputs).
    """
    label = inputs['labels'].float()
    label = label[0,:40]

    # Forward pass
    outputs = model(**inputs, max_new_tokens=80)
    outputs = outputs[1] # batch_size , seq_len , vocab_size
    sup_outputs = outputs[0,:40,:]
    inf_outputs = outputs[0,40:,:]
    sup_probs = softmax(sup_outputs)
    inf_probs = softmax(inf_outputs)
    cloning_sup_probs = sup_probs.clone()

    # Custom loss computation
    loss_prefer = -torch.log(sigmoid(0.1*torch.log(sup_probs) - 0.1*torch.log(inf_probs)))

    #constraint
    kl_loss = -torch.log(cloning_sup_probs)

    #negative log
    #nll_loss = -torch.log(sup_probs)

    loss = loss_prefer + 2 * kl_loss

    loss = loss / 128
    loss = loss.mean()

    return loss

trainer configuration

deepspeed tool은 integrate하지 않음

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=128,
    learning_rate=1e-5,
    fp16=True,
    warmup_ratio=0.01,  # Warm-up ratio 추가
)

In [20]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data
)

In [21]:
# 훈련 시작
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqnckdrb14[0m ([33mqnckdrb14-does-not-exist[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,39.148
20,39.9647
30,33.3725


TrainOutput(global_step=30, training_loss=37.49505513509114, metrics={'train_runtime': 706.7996, 'train_samples_per_second': 5.685, 'train_steps_per_second': 0.042, 'total_flos': 1.22018485764096e+16, 'train_loss': 37.49505513509114, 'epoch': 1.911398705823793})

# **Experiment**

evaluation model

In [22]:
evaluate_tokenizer = AutoTokenizer.from_pretrained("Unbabel/xcomet")
evaluate_model = AutoModelForSequenceClassification.from_pretrained("Unbabel/xcomet")

NameError: name 'AutoTokenizer' is not defined

In [None]:
# 원문(source)와 번역(hypothesis) 준비
source = "This is an example of the original text."
hypothesis = "This is an example of translated text."

In [None]:
# 원문과 번역문을 하나로 결합하여 토크나이즈
inputs = tokenizer(
    text=source,
    text_pair=hypothesis,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
)

In [None]:
# 모델로 번역 품질 점수 예측
outputs = model(**inputs)

# 로짓(logits)에서 점수 추출
logits = outputs.logits
# 점수 출력
quality_scores = logits.squeeze().tolist()
print(f"Predicted Quality Scores: {quality_scores}")

# **inference**

In [None]:
# Translation
with torch.no_grad():
    generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)