In [20]:
#Step 0.0: Mount Google Drive in Colab

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Step 0.1: Install Dependencies

!pip install nltk transformers torch
import nltk
nltk.download('punkt')

In [22]:
#Step 0.2: Import Libraries

import os
import json
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from nltk.tokenize import sent_tokenize

# Train

In [26]:
#Step 1: Read the JSON Files and Preprocess

# Function to extract data from a JSON file and derive labels
def extract_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
        passage = data["Meta(Refine)"]["passage"]
        summary = data["Annotation"]["summary3"]

        # Splitting the passage and summary into sentences
        passage_sentences = sent_tokenize(passage)
        summary_sentences = set(sent_tokenize(summary))

        # Label each sentence
        labels = [1 if sentence in summary_sentences else 0 for sentence in passage_sentences]

        return passage_sentences, labels

# Function to prepare data (either training or validation)
def prepare_data(file_names, folder_path, num_files):
    passages = []
    labels_list = []

    # Randomly sample 'num_files' from 'file_names'
    sampled_files = random.sample(file_names, num_files)

    for file_name in sampled_files:
        file_path = os.path.join(folder_path, file_name)
        passage_sentences, labels = extract_data_from_json(file_path)
        passages.extend(passage_sentences)
        labels_list.extend(labels)

    return passages, labels_list

# Read each JSON file, extract data, and derive labels
folder_path = '/content/drive/My Drive/20per' #022.요약문 및 레포트 생성 데이터 from AI Hub
file_names = os.listdir(folder_path)

# Number of files to use for training
num_train_files = 5  # Adjust this number as needed

# Prepare validation data
passages, labels = prepare_data(file_names, folder_path, num_train_files)


In [27]:
#Step 2: Create a Dataset Class

class TextSummarizationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        # Tokenize text
        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          padding='max_length',
          return_attention_mask=True,
          return_tensors='pt',
          truncation=True
        )

        return {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(label, dtype=torch.long)
        }

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Create dataset
dataset = TextSummarizationDataset(passages, labels, tokenizer)

In [28]:
#Step 3: Prepare DataLoader and Training Loop

# Data loader
data_loader = DataLoader(dataset, batch_size=2)

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Training loop
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
num_epochs = 1
total_steps = len(data_loader) * num_epochs

print("Starting training...")

for epoch in range(num_epochs):
    for i, batch in enumerate(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Backward pass and optimization
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Calculate current step
        current_step = epoch * len(data_loader) + i + 1

        # Print progress every 'n' steps (adjust 'n' as needed)
        if current_step % 10 == 0:  # Example: Print every 10 steps
            percentage_done = (current_step / total_steps) * 100
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{current_step}/{total_steps}]: {percentage_done:.2f}% complete, Loss: {loss.item()}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch [1/1], Step [10/53]: 18.87% complete, Loss: 0.02224959246814251
Epoch [1/1], Step [20/53]: 37.74% complete, Loss: 0.28112056851387024
Epoch [1/1], Step [30/53]: 56.60% complete, Loss: 0.26235395669937134
Epoch [1/1], Step [40/53]: 75.47% complete, Loss: 0.17496606707572937
Epoch [1/1], Step [50/53]: 94.34% complete, Loss: 0.2239627093076706


# Validation

In [32]:
#Step 1: Validation Data Preparation and DataLoader

# File paths for validation data
val_folder_path = '/content/drive/My Drive/20per_val' #022.요약문 및 레포트 생성 데이터 from AI Hub
val_file_names = os.listdir(val_folder_path)

# Number of files to use for validating
num_val_files = 3  # Adjust this number as needed

# Prepare validation data
val_passages, val_labels = prepare_data(val_file_names, val_folder_path, num_val_files)

# Validation dataset and data loader
val_dataset = TextSummarizationDataset(val_passages, val_labels, tokenizer)
val_data_loader = DataLoader(val_dataset, batch_size=2)


In [33]:
#Step 2: Model Evaluation Function

# Function to evaluate the model
def evaluate_model(model, data_loader):
    model.eval()  # Set model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).tolist()

            predictions.extend(preds)
            actuals.extend(labels.tolist())

    # Calculate and return metrics here
    # Example: Accuracy
    accuracy = sum([1 if pred == actual else 0 for pred, actual in zip(predictions, actuals)]) / len(predictions)
    return accuracy

# Evaluate the model
model_accuracy = evaluate_model(model, val_data_loader)
print(f"Validation Accuracy: {model_accuracy}")

Validation Accuracy: 0.8307692307692308
