In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AdamW, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import torch
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
github_issues = pd.read_csv("../data/github_issues.csv")
github_issues.head()

Unnamed: 0,issue_url,issue_title,body
0,"""https://github.com/zhangyuanwei/node-images/i...",can't load the addon. issue to: https://github...,can't load the addon. issue to: https://github...
1,"""https://github.com/Microsoft/pxt/issues/2543""",hcl accessibility a11yblocking a11ymas mas4.2....,user experience: user who depends on screen re...
2,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1265: issue 1264: issue 1261: issue 1260...,┆attachments: <a href= https:& x2f;& x2f;githu...
3,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1266: issue 1263: issue 1262: issue 1259...,gitlo = github x trello\n---\nthis board is no...
4,"""https://github.com/MatisiekPL/Czekolada/issue...",issue 1288: issue 1285: issue 1284: issue 1281...,┆attachments: <a href= https:& x2f;& x2f;githu...


In [3]:
train_df, test_df = train_test_split(github_issues[['body', 'issue_title']], test_size=0.2, random_state=42)
# Chuyển đổi DataFrame thành Dataset của Hugging Face
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
# Tạo DatasetDict từ các Dataset đã chuyển đổi
full_dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [4]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")



In [5]:
def preprocess_function(examples):
    inputs = ["Summary of the following GitHub issue: " + doc for doc in examples["body"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length", return_tensors='pt')
    labels = tokenizer(examples["issue_title"], max_length=256, truncation=True, padding="max_length", return_tensors='pt')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = full_dataset_dict.map(preprocess_function, batched=True, remove_columns=["body", "issue_title"])

Map:   0%|          | 0/4265722 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066431 [00:00<?, ? examples/s]

In [6]:
tokenized_data.save_to_disk('../data/full_dataset_dict')

Saving the dataset (0/62 shards):   0%|          | 0/4265722 [00:00<?, ? examples/s]

Saving the dataset (0/16 shards):   0%|          | 0/1066431 [00:00<?, ? examples/s]

In [2]:
tokenized_data = load_from_disk('../data/full_dataset_dict')

Loading dataset from disk:   0%|          | 0/66 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model="facebook/bart-base")
train_dataloader = DataLoader(tokenized_data['train'], shuffle=True, batch_size=2, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_data['test'], batch_size=2, collate_fn=data_collator)

In [8]:
# Thiết lập thiết bị
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [9]:
wandb.login(key='6a323cfe5341553410214585d588f10485e8ac66')
wandb.init(project="summarization", name='seminar2_kaggle_bartpho_newest')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Admin\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mthichhocchui1999[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
import os
# Thiết lập optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
checkpoint_dir = '../checkpoint'
# Vòng lặp huấn luyện
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Chuyển batch sang thiết bị (CPU hoặc GPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Đặt gradients về 0
        optimizer.zero_grad()

        # Tính toán loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Log loss mỗi batch lên W&B
        wandb.log({"train_loss": loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")
    model.save_pretrained(checkpoint_dir+ f'/epoch_{epoch + 1}', from_pt=True) 
    tokenizer.save_pretrained(checkpoint_dir+ f'/epoch_{epoch + 1}', from_pt=True)
    artifact = wandb.Artifact(f'model-bartpho-checkpoint-epoch-{epoch + 1}', type='model')
    artifact.add_dir(checkpoint_dir + f'/epoch_{epoch + 1}')
    wandb.log_artifact(artifact)
    # Đánh giá mô hình trên tập test
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Log validation loss mỗi batch lên W&B
            wandb.log({"val_loss": loss.item()})

        avg_val_loss = total_val_loss / len(test_dataloader)
        print(f"Validation Loss: {avg_val_loss}")

# Kết thúc W&B run
wandb.finish()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


# Evaluate

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AdamW, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

model = AutoModelForSeq2SeqLM.from_pretrained("minhtuan7akp/bart_github_summarization")
tokenizer = AutoTokenizer.from_pretrained("minhtuan7akp/bart_github_summarization")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
import torch

# Thiết lập thiết bị
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), 

In [36]:
def summary(sentence):
    text =  "Summary of the following GitHub issue: " + sentence
    encoding = tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors='pt')
    input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
    outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256
    )
    line = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return line
sentence = '''

'''
sentence = github_issues.iloc[10000]['body']
summary(sentence)

'add value ranges to table formats on that page'

In [39]:
evaluate_github_issues = github_issues[-3000:].reset_index(drop=True)
evaluate_github_issues['summary_bartpho'] = evaluate_github_issues['body'].apply(summary)

In [42]:
from rouge import Rouge

rouge = Rouge()
def caculate_rouge(candidate, reference):
    rouge_scores = rouge.get_scores(candidate, reference)
    return rouge_scores[0]['rouge-1']['f'], rouge_scores[0]['rouge-2']['f'], rouge_scores[0]['rouge-l']['f']
# Khởi tạo danh sách để lưu các chỉ số ROUGE
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
# Duyệt qua từng cặp văn bản từ hai cột và tính các chỉ số ROUGE
for index, row in evaluate_github_issues.iterrows():
    candidate = row['summary_bartpho']
    reference = row['issue_title']
    rouge_1, rouge_2, rouge_l = caculate_rouge(candidate, reference)
    rouge_1_scores.append(rouge_1)
    rouge_2_scores.append(rouge_2)
    rouge_l_scores.append(rouge_l)

# Tính trung bình của từng chỉ số ROUGE
avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
print(f"Average ROUGE-1 F1 Score: {avg_rouge_1}")
print(f"Average ROUGE-2 F1 Score: {avg_rouge_2}")
print(f"Average ROUGE-L F1 Score: {avg_rouge_l}")

Average ROUGE-1 F1 Score: 0.4541229725338228
Average ROUGE-2 F1 Score: 0.3695844393689238
Average ROUGE-L F1 Score: 0.44734915895970256


In [None]:
# data loader

def load_data(data_path: str, test_size=0.2):
    github_issues = pd.read_csv(data_path)
    train_df, test_df = train_test_split(github_issues[['body', 'issue_title']], test_size=test_size, random_state=42)
    
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
    
    full_dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    
    return full_dataset_dict

def preprocess_function(examples, tokenizer):
    inputs = ["Summary of the following GitHub issue: " + doc for doc in examples["body"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length", return_tensors='pt')
    labels = tokenizer(examples["issue_title"], max_length=256, truncation=True, padding="max_length", return_tensors='pt')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
#training
def train_model(model, train_dataloader, test_dataloader, tokenizer, epochs=5, batch_size=2, lr=5e-5, checkpoint_dir='../checkpoint'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=lr)
    
    wandb.login(key=os.getenv("WANDB_API_KEY"))
    wandb.init(project="summarization", name='seminar2_kaggle_bartpho_newest')

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            wandb.log({"train_loss": loss.item()})
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")
        model.save_pretrained(f'{checkpoint_dir}/epoch_{epoch + 1}')
        tokenizer.save_pretrained(f'{checkpoint_dir}/epoch_{epoch + 1}')
        
        artifact = wandb.Artifact(f'model-bartpho-checkpoint-epoch-{epoch + 1}', type='model')
        artifact.add_dir(f'{checkpoint_dir}/epoch_{epoch + 1}')
        wandb.log_artifact(artifact)
        
        evaluate_model(model, test_dataloader, device)
    
    wandb.finish()

def evaluate_model(model, test_dataloader, device):
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            wandb.log({"val_loss": loss.item()})

    avg_val_loss = total_val_loss / len(test_dataloader)
    print(f"Validation Loss: {avg_val_loss}")
