In [None]:
# Print the NVIDIA driver / GPU status of this runtime (the trainer below requests a GPU accelerator).
!nvidia-smi

In [None]:
# Install the third-party packages used by this notebook (`--quiet` hides pip's progress output).
# NOTE(review): versions are not pinned, so a fresh run may pull releases with different APIs —
# consider pinning them for reproducibility.
!pip install --quiet torchtext
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet SentencePiece
!pip install --quiet gdown

In [None]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
import textwrap
import tensorflow as tf
import os
from tqdm import tqdm

from transformers import (
    MT5Tokenizer,
    MT5ForConditionalGeneration,
    AdamW
)

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import seaborn as sns
from pylab import rcParams

# Render inline figures in the high-resolution "retina" format.
%config InlineBackend.figure_format = "retina" 
# Global seaborn theme and default matplotlib figure size (width, height) applied to every plot.
sns.set(style = 'whitegrid', palette='muted', font_scale = 1.2)
rcParams['figure.figsize'] = 16, 10

## Download Data from drive

In [None]:
# Download the training split (50%) from Google Drive.
# `#TRAINSET_DRIVELINK` is a placeholder: replace it with the Drive file ID before running
# (as written, the shell treats everything after `#` as a comment).
# NOTE(review): recent gdown releases deprecate the `--id` flag — confirm against the installed version.
!gdown --id #TRAINSET_DRIVELINK

In [None]:
# Download the test split from Google Drive.
# `#TESTSET_DRIVELINK` is a placeholder: replace it with the Drive file ID before running
# (as written, the shell treats everything after `#` as a comment).
# NOTE(review): recent gdown releases deprecate the `--id` flag — confirm against the installed version.
!gdown --id #TESTSET_DRIVELINK

In [None]:
from urllib import request

def try_download(url, path, n_trials = 10):
    """Download ``url`` to ``path``, retrying when an attempt fails.

    Args:
        url: Address of the file to fetch.
        path: Local destination handed to ``urllib.request.urlretrieve``.
        n_trials: Maximum number of attempts before giving up.

    Returns:
        None, as soon as one attempt succeeds.

    Raises:
        SystemExit: If every attempt fails (or ``n_trials`` is not positive).
    """
    for attempt in range(1, n_trials + 1):
        try:
            request.urlretrieve(url, path)
        except Exception as e:
            # Best-effort retry: report the failure and move on to the next attempt.
            print(str(e))
            print(f'[{attempt}/{n_trials}] Error downloading: {url}')
        else:
            print(f'Downloaded: {path}')
            return
    # Name the resource that actually failed so the abort is traceable.
    raise SystemExit(f'Cannot download {url} after {n_trials} attempts')

In [None]:
# Scratch directory that receives the checkpoint fetched from the previous round.
model_folder = '/kaggle/temp/'
os.makedirs(model_folder, exist_ok=True)

# URL of the checkpoint produced in the first round.
checkpoint_url = '.../output/best-checkpoint.ckpt'

# Local file name under which the downloaded checkpoint is stored.
model_file = os.path.join(model_folder, 'model.ckpt')
try_download(checkpoint_url, model_file)

## Reading Data

In [None]:
# Load the tab-separated training split, keep only the two text columns,
# and discard rows where either of them is missing.
train_df = (
    pd.read_csv("train_50.csv", sep="\t", encoding="utf-8")[["title", "abstract"]]
    .dropna()
)
train_df.head()

In [None]:
# Load the tab-separated test split, keep the text columns plus the topic,
# and discard rows where any of them is missing.
test_df = (
    pd.read_csv("test_10.csv", sep="\t", encoding="utf-8")[["title", "abstract", "topic"]]
    .dropna()
)
test_df.head()

In [None]:
# The validation split is the test split itself (no separate hold-out is carved out here).
# NOTE(review): checkpoint selection monitors `val_loss`, so it sees the test data —
# confirm this is intended.
valid_df = test_df

## MT5 Tokenizer

In [None]:
# Hugging Face model identifier shared by the tokenizer here and the model weights loaded later.
MODEL_SOURCE = 'google/mt5-small'
# Tokenizer instance passed to the data module and used again by `generate`.
mT5tokenizer = MT5Tokenizer.from_pretrained(MODEL_SOURCE)

## Dataset Class

In [None]:
class TrDataset(Dataset):
  """Map-style dataset yielding tokenized (title -> abstract) pairs.

  Each row of ``data`` supplies a ``title`` (model input) and an ``abstract``
  (generation target). Both are padded/truncated to a fixed token length so
  the default collate function can stack samples into batches.

  Args:
    data: Frame with ``title`` and ``abstract`` columns.
    tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask`` tensors.
    title_max_token_length: Fixed token length of the encoded title.
    abstract_max_token_length: Fixed token length of the encoded abstract.
  """

  # Label value ignored by the loss; padding positions of the target are set to it.
  IGNORE_INDEX = -100

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "MT5Tokenizer",
      title_max_token_length: int = 70,
      abstract_max_token_length: int = 370,
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.title_max_token_length = title_max_token_length
    self.abstract_max_token_length = abstract_max_token_length

  def _encode(self, text, max_length: int):
    """Tokenize ``text`` to exactly ``max_length`` tokens (padded or truncated)."""
    return self.tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt',
    )

  def __getitem__(self, index: int):
    """Return the raw texts and the encoded tensors of the row at position ``index``."""
    # Positional lookup, so frames whose index has gaps (e.g. after dropna) still work.
    row = self.data.iloc[index]
    title = row['title']
    abstract = row['abstract']

    encoded_title = self._encode(title, self.title_max_token_length)
    encoded_abstract = self._encode(abstract, self.abstract_max_token_length)

    # Ask the tokenizer for its padding id instead of assuming 0; fall back to 0
    # when the tokenizer does not define one.
    pad_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_id is None:
      pad_id = 0

    # Work on a copy so the tokenizer output itself is left untouched.
    labels = encoded_abstract['input_ids'].clone()
    labels[labels == pad_id] = self.IGNORE_INDEX

    return dict(
        title=title,
        abstract=abstract,
        title_input_ids=encoded_title['input_ids'].flatten(),
        title_attention_mask=encoded_title['attention_mask'].flatten(),
        labels=labels.flatten(),
        labels_attention_mask=encoded_abstract['attention_mask'].flatten(),
    )

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)

In [None]:
class TrDataModule(pl.LightningDataModule):
  """Lightning data module serving the training and validation splits.

  Wraps both frames in ``TrDataset`` and exposes them through ``DataLoader``
  objects that share one tokenizer, batch size and worker count.

  Args:
    train_df: Rows used for training.
    valid_df: Rows used for validation.
    tokenizer: Tokenizer forwarded to every ``TrDataset``.
    batch_size: Number of samples per batch for both loaders.
    title_max_token_length: Fixed token length of the encoded titles.
    abstract_max_token_length: Fixed token length of the encoded abstracts.
    num_workers: Worker processes used by each ``DataLoader`` (defaults to 2).
  """

  def __init__(
    self,
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    tokenizer: "MT5Tokenizer",
    batch_size: int = 8,
    title_max_token_length: int = 70,
    abstract_max_token_length: int = 370,
    num_workers: int = 2,
  ):
    super().__init__()
    self.train_df = train_df
    self.valid_df = valid_df

    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.title_max_token_length = title_max_token_length
    self.abstract_max_token_length = abstract_max_token_length
    self.num_workers = num_workers

  def _build_dataset(self, frame: pd.DataFrame):
    """Wrap ``frame`` in a ``TrDataset`` using the module's tokenizer and lengths."""
    return TrDataset(
      frame,
      self.tokenizer,
      self.title_max_token_length,
      self.abstract_max_token_length,
    )

  def _build_loader(self, dataset, shuffle: bool):
    """Create a ``DataLoader`` over ``dataset`` with the module's batch settings."""
    return DataLoader(
      dataset,
      batch_size=self.batch_size,
      shuffle=shuffle,
      num_workers=self.num_workers,
    )

  def setup(self, stage = None):
    """Build the training and validation datasets (``stage`` is not used)."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.valid_dataset = self._build_dataset(self.valid_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return self._build_loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    """Loader over the validation dataset, in its stored order."""
    return self._build_loader(self.valid_dataset, shuffle=False)

In [None]:
# Training hyper-parameters.
EPOCHS = 4
BATCH_SIZE = 4

# Bundle both splits and the tokenizer into the Lightning data module.
data_module = TrDataModule(
    train_df=train_df,
    valid_df=valid_df,
    tokenizer=mT5tokenizer,
    batch_size=BATCH_SIZE,
    title_max_token_length=70,
    abstract_max_token_length=370,
)

In [None]:
class GenerateModel(pl.LightningModule):
  """LightningModule wrapping ``MT5ForConditionalGeneration`` for title -> abstract generation.

  The wrapped Hugging Face model is available as ``self.model``; training and
  validation both log the loss the model computes from ``labels``.
  """

  def __init__(self):
    super().__init__()
    self.model = MT5ForConditionalGeneration.from_pretrained(MODEL_SOURCE, return_dict=True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels = None):
    """Run the wrapped model and return ``(loss, logits)``.

    ``loss`` is whatever the model reports for the given ``labels``.
    """
    output = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )
    return output.loss, output.logits

  def _shared_step(self, batch):
    """Forward one batch produced by ``TrDataset`` and return its loss."""
    loss, _ = self(
        input_ids=batch['title_input_ids'],
        attention_mask=batch['title_attention_mask'],
        decoder_attention_mask=batch['labels_attention_mask'],
        labels=batch['labels'],
    )
    return loss

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss for one batch."""
    loss = self._shared_step(batch)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss for one batch."""
    loss = self._shared_step(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def configure_optimizers(self):
    """Return the AdamW optimizer over all parameters (learning rate 1e-4)."""
    # `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
    # eps and weight_decay are set explicitly to the defaults of the transformers
    # version, so the optimisation hyper-parameters stay the same.
    return torch.optim.AdamW(
        self.parameters(),
        lr=1e-4,
        eps=1e-6,
        weight_decay=0.0,
    )

In [None]:
# First round (default): start from the pretrained weights.
model = GenerateModel()

# Later rounds: continue from the downloaded checkpoint by using the line below instead.
#model = GenerateModel.load_from_checkpoint(model_file)

In [None]:
# Keep only the single best checkpoint, judged by the lowest validation loss.
checkpoint_callbackk = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath='output',
    filename="best-checkpoint",
    verbose=True,
)

# TensorBoard logger writing under lightning_logs/text-generation.
logger = TensorBoardLogger("lightning_logs", name="text-generation")

In [None]:
# Trainer configured for GPU training with the checkpoint callback and logger above.
trainer = pl.Trainer(
    accelerator='gpu',
    max_epochs=EPOCHS,
    logger=logger,
    callbacks=checkpoint_callbackk,
    enable_checkpointing=True,
)

## Training

In [None]:
# Run the training/validation loop; batches are supplied by `data_module`.
trainer.fit(model,data_module)

## Generation

In [None]:
# Rebuild the module from the checkpoint stored at `model_file`.
# NOTE(review): `model_file` is the checkpoint downloaded near the top of the notebook, not the
# one written by ModelCheckpoint during `trainer.fit` (saved under `output/`) — confirm that
# generating with the previously downloaded weights is intended.
trained_model = GenerateModel.load_from_checkpoint(model_file)
# Freeze the parameters for inference.
trained_model.freeze()

In [None]:
# Sampling in `generate` runs in PyTorch, so PyTorch's RNG is the one to seed;
# a TensorFlow seed has no effect on it.
torch.manual_seed(0)

def generate(title, title_max_token_length = 70, abstract_max_token_length = 370):
    """Sample one abstract for ``title`` with the fine-tuned model.

    The length limits default to the values used for training: titles are
    encoded to 70 tokens and targets may be up to 370 tokens long.

    Args:
        title: Title text used as the model input.
        title_max_token_length: Fixed token length the title is padded/truncated to.
        abstract_max_token_length: Upper bound on the number of generated tokens.

    Returns:
        List of decoded strings, one per returned sequence (a single element,
        since ``num_return_sequences`` is 1).
    """
    encoded_title = mT5tokenizer(
        title,
        max_length=title_max_token_length,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Put the inputs on whichever device the loaded model lives on.
    device = trained_model.device
    generated_ids = trained_model.model.generate(
        input_ids=encoded_title["input_ids"].to(device),
        attention_mask=encoded_title["attention_mask"].to(device),
        max_length=abstract_max_token_length,
        do_sample=True,
        early_stopping=True,
        # Optional sampling controls to experiment with:
        # top_k=70, temperature=0.7, top_p=0.5
        num_return_sequences=1,
    )
    preds = [
        mT5tokenizer.decode(
            generated_id,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for generated_id in generated_ids
    ]
    return preds

## Record

In [None]:
# Module-level accumulators filled by `example_record`, one entry per processed test row.
# Re-running this cell empties them; calling `example_record` again without doing so
# appends further entries.
title = []
abstract = []
generated_ws = []
topic = []

In [None]:
def example_record(num):
    """Generate text for the first ``num`` test rows and record the results.

    For each row the title, reference abstract, generated output and topic are
    appended to the module-level lists ``title``, ``abstract``,
    ``generated_ws`` and ``topic``.

    Args:
        num: Number of rows of ``test_df`` to process, counted from the top.
    """
    for n in tqdm(range(num)):
        # Positional access: `test_df` keeps its original index labels after
        # `dropna()`, so a label lookup such as `test_df['title'][n]` can raise
        # KeyError or pick the wrong row once any row has been dropped.
        row_title = test_df['title'].iloc[n]

        title.append(row_title)
        abstract.append(test_df['abstract'].iloc[n])
        generated_ws.append(generate(row_title))
        topic.append(test_df['topic'].iloc[n])

In [None]:
# Process every row of the test split.
example_record(len(test_df))

In [None]:
# Empty results table with the output column layout; the columns are filled in the next cell.
rec = pd.DataFrame(columns=['title', 'abstract', 'detailed', 'topic'])

In [None]:
# Fill each output column from its accumulator list.
rec['title'] = title
rec['abstract'] = abstract
# Each entry is the list returned by `generate` for that row.
rec['detailed'] = generated_ws
rec['topic'] = topic

In [None]:
# Preview the first rows of the results table.
rec.head()

In [None]:
# Write the results as a tab-separated, UTF-8 encoded file.
# NOTE(review): the output path is hard-coded to the Kaggle working directory.
rec.to_csv("/kaggle/working/results.csv", sep="\t", encoding="utf-8")