# In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import sacrebleu
from tqdm.auto import tqdm
import csv

class LyricsDataset(Dataset):
    """Map-style dataset that turns dataframe rows into token-id pairs.

    Each row must expose an ``'instrumental'`` and a ``'lyrics'`` field.
    The source text is ``"instrumental: <value>"`` and the target text is
    ``"lyrics: <value>"``; both are tokenized, truncated and padded to
    ``max_length``.

    Args:
        tokenizer: Callable tokenizer that accepts ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and returns an object with an ``input_ids`` tensor.
        dataframe: Table of examples supporting ``len()`` and ``.iloc``.
        max_length: Fixed length every encoded sequence is padded or
            truncated to.
    """

    def __init__(self, tokenizer, dataframe, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` into a 1-D tensor of ``max_length`` token ids."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also collapse the sequence dimension when ``max_length == 1``,
        # yielding a 0-d tensor that no longer collates into (batch, seq).
        return encoded.input_ids.squeeze(0)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` for the row at position ``idx``.

        Padding positions hold the tokenizer's padding id; they are not
        masked here, so loss computation must handle them downstream.
        """
        item = self.data.iloc[idx]
        source = f"instrumental: {item['instrumental']}"
        target = f"lyrics: {item['lyrics']}"
        return self._encode(source), self._encode(target)

# Load the data and split it: 90% of the rows for training, the rest for validation.
df = pd.read_parquet('/content/drive/MyDrive/cleaned_dataset_t5_2.parquet')
train_size = int(0.9 * len(df))
val_size = len(df) - train_size

# Tokenize with the pretrained BART-base tokenizer.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
dataset = LyricsDataset(tokenizer=tokenizer, dataframe=df)
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Batches of 8 for both subsets; only the training subset is shuffled.
train_loader, val_loader = (
    DataLoader(subset, batch_size=8, shuffle=reshuffle)
    for subset, reshuffle in ((train_dataset, True), (val_dataset, False))
)

# Model setup: run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained BART-base checkpoint, then move it onto the chosen device.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
model = model.to(device)

# AdamW over all model parameters with a fixed learning rate of 1e-4.
optimizer = AdamW(model.parameters(), lr=1e-4)

# Training: 50 passes over the training loader, one optimizer step per batch.
model.train()
pad_token_id = tokenizer.pad_token_id
for epoch in range(50):  # training epochs
    for source, target in tqdm(train_loader, desc=f"Training Epoch {epoch}"):
        source, target = source.to(device), target.to(device)

        # Inputs are padded to a fixed length, so mark the padding positions
        # for the encoder instead of letting it attend to them.
        attention_mask = source.ne(pad_token_id).long()
        # Label positions set to -100 are ignored by the model's loss; without
        # this the loss is dominated by predicting padding tokens.
        labels = target.masked_fill(target == pad_token_id, -100)

        optimizer.zero_grad()
        outputs = model(input_ids=source, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Validation and BLEU score computation
model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for source, target in tqdm(val_loader, desc="Validating"):
        source = source.to(device)
        # Inputs are padded to a fixed length; pass the mask so generation
        # does not attend to padding positions.
        attention_mask = source.ne(tokenizer.pad_token_id).long()
        outputs = model.generate(
            input_ids=source,
            attention_mask=attention_mask,
            max_length=512,
            num_beams=5,
        )
        pred_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        true_text = tokenizer.batch_decode(target, skip_special_tokens=True)

        # Keep both lists flat (one string per example) so that prediction i
        # always lines up with reference i.
        predictions.extend(pred_text)
        actuals.extend(true_text)

# sacreBLEU expects a list of reference streams, each aligned with the
# hypotheses; there is exactly one reference per example here.
references = [actuals]

# BLEU restricted to 1- and 2-grams. `corpus_bleu` has no `weights` argument;
# the n-gram order is set through `BLEU(max_ngram_order=...)` (sacreBLEU >= 2.0).
bleu_2_score = sacrebleu.BLEU(max_ngram_order=2).corpus_score(predictions, references).score

# BLEU restricted to 1- through 3-grams.
bleu_3_score = sacrebleu.BLEU(max_ngram_order=3).corpus_score(predictions, references).score

print(f"BLEU-2 Score: {bleu_2_score}")
print(f"BLEU-3 Score: {bleu_3_score}")

# Save the results: one CSV row per validation example.
with open('/content/drive/MyDrive/lyrics_predictions.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Predicted Lyrics', 'Actual Lyrics'])
    writer.writerows(zip(predictions, actuals))


# Save the fine-tuned model first, then the tokenizer, each to its own Drive directory.
for artifact, output_dir in (
    (model, '/content/drive/MyDrive/saved_model_bart'),
    (tokenizer, '/content/drive/MyDrive/saved_tokenizer_bart'),
):
    artifact.save_pretrained(output_dir)

