In [None]:
import os
import re
import pickle
import string
import unicodedata
from random import randint

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
filename = r"../input/summary-vietnamese/data_summary.csv"
# Drop the 'Unnamed: 0' column by keyword: the positional `axis` argument
# used before (`drop('Unnamed: 0', 1)`) is rejected by pandas 2.x.
df = pd.read_csv(filename).drop(columns='Unnamed: 0')
print(f'Dataset size: {len(df)}')
df.sample(5)

In [None]:
# Remove duplicated texts and rows containing missing values.
# A single chained expression keeps the cell re-runnable, and
# reset_index(drop=True) renumbers the rows without first materialising an
# 'index' column that then has to be dropped again.
df = (
    df.drop_duplicates(subset=['Text'])
      .dropna(axis=0)
      .reset_index(drop=True)
)
print(f'Dataset size: {len(df)}')
df.head(5)

# Data preparation

In [None]:
def cleanWord(s):
    """Strip a fixed list of symbols and quote characters from ``s``.

    The fragments are removed one after another in the listed order, so a
    multi-character fragment such as '...' is only removed if it is present
    at the moment its turn comes. Straight apostrophes are removed last.
    """
    unwanted = (
        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '–', '_', '+',
        '=', '{', '[', '}', ']', '|', ':', ';', '?', '/', '<', '>', '~', '`',
        '‘', '’', ',', '()', '“', '"', '...', '”', '“”', '\\', '>>', '>>>',
    )
    cleaned = s
    for fragment in unwanted:
        cleaned = cleaned.replace(fragment, '')
    return cleaned.replace("'", '')

def remove_link(string):
    """Return ``string`` with every ``scheme://host/path`` style link deleted."""
    link_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    without_links = re.sub(link_pattern, '', string)
    return without_links

def remove_extra_whitespace(string):
    """Collapse each whitespace run to one space and trim both ends."""
    return re.sub(r'\s+', ' ', string).strip()

def lower_word(data):
    """Return ``data`` converted to lower case."""
    lowered = data.lower()
    return lowered

def rm_number_from_text(text):
    """Delete every run of ASCII digits, then normalise whitespace.

    Whitespace runs are collapsed to single spaces and the result has no
    leading or trailing whitespace.
    """
    digits_removed = re.sub('[0-9]+', '', text)
    tokens = digits_removed.split()
    return ' '.join(tokens)

def rm_punc_from_word(word):
    """Return ``word`` without any character listed in ``string.punctuation``."""
    # A translation table whose third argument lists characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return word.translate(deletion_table)

def rm_punc_from_text(text):
    """Remove ASCII punctuation from ``text``.

    ``text`` is walked one character at a time (iterating a string yields
    characters, not words); each character is passed through
    ``rm_punc_from_word`` and the survivors are concatenated unchanged.
    """
    return ''.join(map(rm_punc_from_word, text))


def clean_text(text):
    """Normalise ``text`` for the summarisation model.

    Steps, in order:
      1. lower-case, drop digits (``rm_number_from_text``) and ASCII
         punctuation (``rm_punc_from_text``);
      2. drop the en dash (–), which is not part of ``string.punctuation``;
      3. apply an ordered list of regex substitutions;
      4. replace URLs by the host of the first URL found;
      5. collapse whitespace and drop single characters that stand alone
         between whitespace.

    Returns the cleaned, lower-cased string.

    NOTE(review): because step 1 already strips all ASCII punctuation and
    digits, several later patterns (those that need '.', '-', ':', '\\' or
    digits, including the URL pattern) have nothing left to match here. They
    are kept so the function's contract is unchanged.
    """
    text = text.lower()
    text = rm_number_from_text(text)
    text = rm_punc_from_text(text)

    # The en dash (–) is a different character from the ASCII hyphen (-).
    text = text.replace('–', '')
    text = ' '.join(text.split())  # collapse extra white space

    # Raw strings throughout: the previous plain literals relied on invalid
    # escape sequences such as "\+" and "\s", which newer Python versions
    # warn about. The regexes themselves are unchanged.
    substitutions = (
        (r"(\t)", ' '),
        (r"(\r)", ' '),
        (r"(\n)", ' '),
        (r"(__+)", ' '),
        (r"(--+)", ' '),
        (r"(~~+)", ' '),
        (r"(\+\++)", ' '),
        (r"(\.\.+)", ' '),
        (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),
        (r"(mailto:)", ' '),
        (r"(\\x9\d)", ' '),
        (r"([iI][nN][cC]\d+)", 'INC_NUM'),
        (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),
        (r"(\.\s+)", ' '),
        (r"(\-\s+)", ' '),
        (r"(\:\s+)", ' '),
        (r"(\s+.\s+)", ' '),
    )
    for pattern, replacement in substitutions:
        # .lower() after each step also lower-cases inserted placeholders.
        text = re.sub(pattern, replacement, text).lower()

    # Replace every URL with the host part of the first URL in the text.
    # An explicit "no match" check replaces the former try/except-pass, and
    # the callable replacement inserts the host literally (no backslash
    # template processing).
    url_pattern = r'((https*:\/*)([^\/\s]+))(.[^\s]+)'
    first_url = re.search(url_pattern, text)
    if first_url is not None:
        host = first_url.group(3)
        text = re.sub(url_pattern, lambda _match: host, text)

    text = re.sub(r"(\s+)", ' ', text).lower()
    text = re.sub(r"(\s+.\s+)", ' ', text).lower()

    return text

def solve(string):
    """Run ``string`` through the whole cleaning pipeline and return the result.

    The stages run in this fixed order: lower-casing, link removal,
    whitespace normalisation, symbol removal, then ``clean_text``.
    """
    pipeline = (lower_word, remove_link, remove_extra_whitespace, cleanWord, clean_text)
    for stage in pipeline:
        string = stage(string)
    return string

In [None]:
# Clean both columns with the same pipeline; values are cast to str first.
for column in ('Text', 'Summary'):
    df[column] = df[column].apply(lambda value: solve(str(value)))
df.sample(5)

Find the **maximum length** of **Text** and **Summary** that will be fed to (and accepted by) the learning algorithm.

In [None]:
# Word counts per row (whitespace-separated tokens) for both columns.
text_count = [len(words) for words in map(str.split, df.Text)]
summary_count = [len(words) for words in map(str.split, df.Summary)]

# One histogram per column to eyeball sensible length cut-offs.
length_frame = pd.DataFrame({'Text': text_count, 'Summary': summary_count})
length_frame.hist(bins=100, figsize=(16, 4))
plt.show()

In [None]:
def get_word_percent(column, limit):
    """Return the fraction of entries in ``column`` with at most ``limit`` words.

    Words are whitespace-separated tokens. The fraction is rounded to two
    decimals. An empty ``column`` yields 0.0 instead of raising
    ZeroDivisionError.
    """
    total = len(column)
    if total == 0:
        return 0.0

    within_limit = sum(1 for sentence in column if len(sentence.split()) <= limit)
    return round(within_limit / total, 2)


# Share of summaries with at most 60 words
print(get_word_percent(df.Summary, 60))

# Share of texts with at most 300 words. The limit was 400 here, which
# contradicted both the original comment and the max_text_len = 300 chosen
# in the next cell.
print(get_word_percent(df.Text, 300))

In [None]:
# Word-count cut-offs used by trim_text_and_summary to filter rows.
max_text_len = 300
max_summary_len = 60

In [None]:
def trim_text_and_summary(df, max_text_len, max_summary_len):
    """Keep only rows whose text and summary are both short enough.

    A row is kept when its 'Text' has at most ``max_text_len`` words and its
    'Summary' has at most ``max_summary_len`` words (whitespace-separated).
    Returns a new DataFrame with the columns 'Text' and 'Summary' and a
    fresh default index.
    """
    kept_pairs = [
        (text, summary)
        for text, summary in zip(np.array(df['Text']), np.array(df['Summary']))
        if len(text.split()) <= max_text_len
        and len(summary.split()) <= max_summary_len
    ]
    return pd.DataFrame({
        'Text': [text for text, _ in kept_pairs],
        'Summary': [summary for _, summary in kept_pairs],
    })


# Replace df with the length-filtered frame, then report how many rows
# survived and show a random sample of five.
df = trim_text_and_summary(df, max_text_len, max_summary_len)
print(f'Dataset size: {len(df)}')
df.sample(5)

In [None]:
# Start the VnCoreNLP wrapper from the local jar with only the "wseg"
# annotator enabled (presumably word segmentation - confirm against the
# vncorenlp docs). max_heap_size is passed through as the JVM -Xmx flag.
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx2g')

In [None]:
def read_content(string):
    """Tokenise ``string`` with ``rdrsegmenter`` and re-join it into one line.

    ``rdrsegmenter.tokenize`` returns a sequence of token sequences; tokens
    are joined with single spaces inside each inner sequence, and the
    resulting pieces are joined with single spaces as well.
    """
    segmented = rdrsegmenter.tokenize(string)
    return ' '.join(' '.join(tokens) for tokens in segmented)

In [None]:
# Sanity check: segment the second row's text and display the result.
read_content(df.Text.iloc[1])

In [None]:
# Segment both columns with VnCoreNLP; values are cast to str first.
for column in ('Text', 'Summary'):
    df[column] = df[column].apply(lambda value: read_content(str(value)))
df.sample(5)

In [None]:
# Rename to the column names the tokenisation step reads:
# "Text" -> "original", "Summary" -> "summary".
df = df.rename(columns={"Summary": "summary", "Text": "original"})

In [None]:
# Display the prepared frame (columns "original" and "summary").
df

In [None]:
%%capture
# Install Hugging Face `datasets` (pinned to 1.0.2) and `transformers`
# (unpinned); %%capture hides the cell's output.
# NOTE(review): `transformers` is not pinned, so the installed version - and
# with it the Trainer API - can differ between runs.
!pip install datasets==1.0.2
!pip install transformers

import datasets
import transformers

In [None]:
from transformers import RobertaTokenizerFast,AutoTokenizer

# Alternative kept for reference: load the bare PhoBERT model.
# phobert = AutoModel.from_pretrained("vinai/phobert-base")

# For transformers v4.x+: load the PhoBERT tokenizer, explicitly requesting
# the slow (non-fast) implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

# Alternative kept for reference: English RoBERTa tokenizer.
# tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Alternative kept for reference: XSum data instead of the local CSV.
# train_data = datasets.load_dataset("xsum", split="train")
# val_data = datasets.load_dataset("xsum", split="validation[:10%]")

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# 80/20 train/validation split. A fixed random_state makes the split (and
# therefore every reported metric) reproducible across kernel restarts.
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Reset the shuffled index before conversion so it is not carried into the
# Dataset as an extra "__index_level_0__" column.
train_data = Dataset.from_pandas(train_data.reset_index(drop=True))
val_data = Dataset.from_pandas(val_data.reset_index(drop=True))

In [None]:
batch_size=16  # rows per tokenisation batch; also reused as the per-device train/eval batch size
encoder_max_length=256  # token limit (pad/truncate) for the source text
decoder_max_length=64  # token limit (pad/truncate) for the summary

def process_data_to_model_inputs(batch):
    """Tokenise one batch of "original"/"summary" texts into model inputs.

    Adds ``input_ids``/``attention_mask`` (source, padded or truncated to
    ``encoder_max_length``) and ``decoder_input_ids``/``labels``/
    ``decoder_attention_mask`` (summary, padded or truncated to
    ``decoder_max_length``) to ``batch`` and returns it. In ``labels`` every
    padding token id is replaced by -100.

    NOTE(review): ``decoder_input_ids`` are the unshifted summary ids, i.e.
    the same ids as ``labels`` before masking. Confirm the installed
    transformers version expects them to be supplied this way.
    """
    # Tokenizer will automatically set [BOS] <text> [EOS]
    source = tokenizer(
        batch["original"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    target = tokenizer(
        batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length
    )
    pad_id = tokenizer.pad_token_id

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    batch["decoder_input_ids"] = target.input_ids
    # mask loss for padding
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target.input_ids
    ]
    batch["decoder_attention_mask"] = target.attention_mask

    return batch

# Columns exposed to the model as torch tensors.
_MODEL_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _encode_split(split):
    """Tokenise one dataset split and set its output format to torch."""
    encoded = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["original", "summary"],
    )
    encoded.set_format(type="torch", columns=_MODEL_COLUMNS)
    return encoded


# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
# train_data = train_data.select(range(32))
train_data_batch = _encode_split(train_data)

# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
# val_data = val_data.select(range(16))
val_data_batch = _encode_split(val_data)

In [None]:
from transformers import EncoderDecoderModel

# Build an encoder-decoder model with PhoBERT as both encoder and decoder;
# tie_encoder_decoder=True is passed to request tied encoder/decoder weights.
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("vinai/phobert-base", "vinai/phobert-base", tie_encoder_decoder=True)
# Alternative kept for reference: English RoBERTa checkpoints.
# roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)

In [None]:
# Special tokens. pad_token_id was previously left unset; the Hugging Face
# EncoderDecoderModel documentation sets it next to decoder_start_token_id,
# and generation uses it to pad finished sequences.
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
roberta_shared.config.pad_token_id = tokenizer.pad_token_id

# Decoding parameters for beam search. max_length matches decoder_max_length.
roberta_shared.config.max_length = 64
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size

In [None]:
%%capture
!rm seq2seq_trainer.py
!wget https://github.com/huggingface/transformers/blob/main/examples/legacy/seq2seq/seq2seq_trainer.py

!pip install git-python==1.0.3
!pip install sacrebleu==1.4.12
!pip install rouge_score

from transformers import Seq2SeqTrainer
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [None]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """TrainingArguments extended with sequence-to-sequence specific options.

    Each field's ``help`` metadata describes its purpose.

    NOTE(review): this local class is passed to the Seq2SeqTrainer imported
    from the transformers package. Confirm that the installed transformers
    version does not expect extra attributes from its own
    Seq2SeqTrainingArguments.
    """

    # Label smoothing epsilon; 0.0 disables smoothing.
    label_smoothing: Optional[float] = field(
        default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
    )
    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSamler or not."})
    # Set to True by the training cell so metrics are computed on generated text.
    predict_with_generate: bool = field(
        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
    )
    adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
    # The four dropout-style options default to None.
    encoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
    )
    decoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
    )
    dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
    attention_dropout: Optional[float] = field(
        default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
    )
    lr_scheduler: Optional[str] = field(
        default="linear", metadata={"help": f"Which lr scheduler to use."}
    )

In [None]:
import datasets
# Load the ROUGE metric; it is used by compute_metrics during validation and
# again by the final evaluation cells.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions).

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to four decimals and taken from
        the ``mid`` value of the aggregated ROUGE-2 score.
    """
    pred_ids = pred.predictions

    # Work on a copy: writing the pad id into pred.label_ids directly would
    # silently modify the caller's array.
    labels_ids = pred.label_ids.copy()
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # Special tokens (including the padding restored above) are dropped
    # while decoding.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
# Training arguments - these values are not really tuned, feel free to change.
# NOTE(review): checkpoints are written under './small-datasets-checkpoints/',
# but the evaluation cell further down loads './training1/checkpoint-4000',
# and with save_steps=5000 no 'checkpoint-4000' is produced by this
# configuration. Confirm which directory and step are intended.
# NOTE(review): eval_steps is set but no evaluation strategy is passed;
# confirm that evaluation actually runs during training with the installed
# transformers version.
training_args = Seq2SeqTrainingArguments(
    output_dir= './small-datasets-checkpoints/',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=200,  # log every 200 steps
    save_steps=5000,  # write a checkpoint every 5000 steps
    eval_steps=7500,  # evaluation interval in steps (see note above)
    warmup_steps=3000,  # learning-rate warm-up steps
    num_train_epochs=5,  # number of passes over the training set
    overwrite_output_dir=True,
    save_total_limit=50,
    fp16=True,
)

# Instantiate the trainer with the tokenised splits and start training.
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data_batch,
    eval_dataset=val_data_batch,
)
trainer.train()

In [None]:
# Create a local directory named 'training'.
# NOTE(review): the following cells use './training1', not './training' -
# confirm the intended directory name.
!mkdir 'training'

In [None]:
# Recursively copy everything under ./training1 to the GCS bucket path.
# NOTE(review): the trainer's output_dir is './small-datasets-checkpoints/';
# confirm that './training1' is where the checkpoints actually live.
!gsutil -m cp -r './training1/*' 'gs://kaggle-vbdi-test/training_Data'

In [None]:
import datasets
from transformers import RobertaTokenizer, EncoderDecoderModel, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split

# Alternative kept for reference: pretrained Vietnamese T5 summariser.
# model = T5ForConditionalGeneration.from_pretrained("NlpHUST/t5-small-vi-summarization")
# tokenizer = T5Tokenizer.from_pretrained("NlpHUST/t5-small-vi-summarization")

# Alternative kept for reference: RoBERTa tokenizer with an earlier checkpoint.
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# model = EncoderDecoderModel.from_pretrained("./small-datasets-checkpoints/checkpoint-7600")

# Load the tokenizer with the same settings as the training cell
# (use_fast=False), so evaluation tokenises text exactly as training did.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
# NOTE(review): the training cell writes checkpoints to
# './small-datasets-checkpoints/'; confirm this path and step number.
model = EncoderDecoderModel.from_pretrained("./training1/checkpoint-4000")
model.to("cuda")

# Alternative kept for reference: XSum test split.
# test_data = datasets.load_dataset("xsum", split="test")

batch_size = 16  # rows per generation batch

# map data correctly
def generate_summary(batch):
    """Generate summaries for one batch and store them under ``batch["pred"]``.

    The "original" texts are tokenised (padded or truncated to 256 tokens),
    moved to the device the model lives on, passed to ``model.generate`` and
    decoded with special tokens removed. Returns the updated ``batch``.
    """
    # Follow the model's device instead of hard-coding "cuda", so the
    # function also works when the model was placed elsewhere.
    device = model.device

    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["original"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # Special tokens are removed from the decoded text.
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch

# Generate predictions for the validation split in batches; the "original"
# column is dropped from the result while "summary" is kept as reference.
results = val_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["original"])

pred_str = results["pred"]  # generated summaries
label_str = results["summary"]  # reference summaries

In [None]:
# Score the generated summaries against the references with ROUGE-1, ROUGE-2
# and ROUGE-L, using the `rouge` metric loaded in an earlier cell.
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1","rouge2","rougeL"])

In [None]:
# Print each ROUGE variant's name followed by its `mid` aggregate.
for metric_name, aggregate in rouge_output.items():
    print(metric_name, aggregate.mid, sep="\n")