In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project root; the relative paths used by later cells
# (knowledge/..., dataset/..., model/...) are resolved from here.
%cd /content/drive/MyDrive/customized-knowledge-qa
# Install the third-party packages imported by the cells below.
!pip install python-docx pandas datasets scikit-learn transformers accelerate python-Levenshtein fuzzywuzzy

# **Extract knowledge**

In [None]:
from docx import Document

def extract_knowledge(data_dir):
    """Parse a .docx file into a nested ``{report: {section: text}}`` mapping.

    The document is read paragraph by paragraph with this layout in mind:

    * an empty paragraph closes the current report;
    * the first non-empty paragraph after that is the title of a new report;
    * a paragraph whose first run is bold is a section subtitle;
    * any other paragraph is content and is appended to the current section.
      Content that is not preceded by a subtitle is filed under a section
      named after the report itself.

    Every section text starts with its own name followed by ``'.'``; content
    paragraphs are appended after a single space.

    Args:
        data_dir: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        dict mapping report title -> dict mapping section name -> section text.
    """
    doc = Document(data_dir)
    knowledge = {}

    current_report = None
    current_paragraph = None

    for paragraph in doc.paragraphs:
        # Text of the paragraph without formatting
        text = paragraph.text.strip()

        # An empty line closes the current report
        if not text:
            current_report = None
            current_paragraph = None
            continue

        # First non-empty paragraph after a break: a new report's title
        if not current_report:
            current_report = text
            knowledge[current_report] = {}
            continue

        # A bold first run marks a section subtitle. A paragraph can carry
        # text without owning any run, so guard the [0] access instead of
        # letting it raise IndexError.
        runs = paragraph.runs
        if runs and runs[0].bold:
            current_paragraph = text
            knowledge[current_report][current_paragraph] = current_paragraph + '.'
            continue

        # Otherwise this is content
        if not current_paragraph:
            current_paragraph = current_report
            # setdefault keeps text already collected under the report title;
            # re-initialising here would silently drop earlier paragraphs.
            knowledge[current_report].setdefault(current_paragraph, current_paragraph + '.')
        knowledge[current_report][current_paragraph] += ' ' + text
        # A subtitle only covers the single content paragraph that follows it
        current_paragraph = None

    return knowledge

In [None]:
# Word document holding the reference reports, relative to the working directory.
knowledge_dir = 'knowledge/Template_phan_tich_doanh_nghiep.docx'
# Nested mapping report title -> section name -> section text; later cells
# look contexts up in it by report and section name.
knowledge = extract_knowledge(knowledge_dir)

In [None]:
# Display the whole extracted mapping for a visual check.
knowledge

# **Prepare data**

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from fuzzywuzzy import fuzz
from datetime import datetime

def stringify(value):
    """Render a spreadsheet cell value as the text expected in the context.

    * ``str`` is returned unchanged.
    * a ``float`` strictly between 0 and 1 is treated as a ratio and rendered
      as a whole-number percentage (fractional percents are truncated).
    * a ``datetime`` is rendered as ``day/month/year`` without zero padding.
    * anything else goes through ``str()``.

    Args:
        value: The raw cell value.

    Returns:
        str: The textual form of ``value``.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, float) and 0 < value < 1:
        # value * 100 can land just below the intended integer
        # (0.29 * 100 == 28.999999999999996); round away the binary noise
        # before truncating so 0.29 becomes '29%' rather than '28%'.
        return f'{int(round(value * 100, 9))}%'
    if isinstance(value, datetime):
        # Built from the fields directly: the '%-d' / '%-m' strftime flags
        # are a platform extension and are not available everywhere.
        return f'{value.day}/{value.month}/{value.year}'
    return str(value)

def find_substring_index(string, substring, threshold=30):
    """Locate the position in ``string`` that best matches ``substring``.

    Both inputs are lower-cased, then a window of ``len(substring)``
    characters is slid over ``string`` and each window is scored with
    ``fuzz.ratio``.

    * A window scoring 100 whose first two characters equal those of
      ``substring`` is returned immediately.
    * Otherwise the highest-scoring window above ``threshold`` wins. On a
      tie, a later window replaces the current best only if its first two
      characters equal those of ``substring``.

    Args:
        string: Text to search in.
        substring: Text to look for.
        threshold: Minimum score (exclusive) a window needs to be considered.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        int | None: Start index of the best window, or ``None`` when no window
        scores above ``threshold`` (including when ``substring`` is longer
        than ``string``).
    """
    haystack = string.lower()
    needle = substring.lower()

    # Loop invariants
    window = len(needle)
    has_prefix = window >= 2
    prefix = needle[0:2]

    best_match = 0
    best_match_index = None

    for i in range(len(haystack) - window + 1):
        similarity = fuzz.ratio(needle, haystack[i:i + window])
        prefix_matches = has_prefix and prefix == haystack[i:i + 2]

        if similarity == 100 and prefix_matches:
            return i
        if similarity > threshold and (
            similarity > best_match
            or (similarity == best_match and prefix_matches)
        ):
            best_match = similarity
            best_match_index = i

    return best_match_index

def preprocess(dataset_dir):
    """Turn the question/answer spreadsheet into parallel SQuAD-style lists.

    For every row, the context is the section name, a space, and the section
    text looked up in the module-level ``knowledge`` mapping by report
    ('Tên bài') and section ('Tên đoạn'). The answer is the stringified
    'List câu trả lời' cell together with the index at which
    ``find_substring_index`` locates it in the context.

    Args:
        dataset_dir: Path of the Excel file, passed to ``pd.read_excel``.

    Returns:
        list: ``[ids, contexts, questions, answers]``. ``ids`` are the row
        indices zero-padded to 8 characters; each answer is a dict
        ``{'answer_start': [index], 'text': [answer_text]}``.
    """
    sheet = pd.read_excel(dataset_dir)

    ids, contexts, questions, answers = [], [], [], []

    for row_index, record in sheet.iterrows():
        section_name = record['Tên đoạn']
        passage = section_name + ' ' + knowledge[record['Tên bài']][record['Tên đoạn']]
        query = record['List câu hỏi']
        answer_text = stringify(record['List câu trả lời'])
        answer_start = find_substring_index(passage, answer_text)

        ids.append(str(row_index).zfill(8))
        contexts.append(passage)
        questions.append(query)
        answers.append({'answer_start': [answer_start], 'text': [answer_text]})

    return [ids, contexts, questions, answers]

def create_datasetDict(datasest_dir, test_size):
    """Build the train/validation ``DatasetDict`` from the QA spreadsheet.

    Args:
        datasest_dir: Path of the Excel file, forwarded to ``preprocess``.
            The misspelled name is kept so existing keyword callers keep
            working.
        test_size: Share of the examples held out for validation, forwarded
            to ``train_test_split``.

    Returns:
        DatasetDict with a 'train' and a 'validation' split, each carrying the
        columns 'id', 'context', 'question' and 'answer'.
    """
    # Use the argument itself. The body used to read a global named
    # `dataset_dir`, so the parameter was ignored and the function only
    # worked after that global had been defined by another cell.
    icqa = preprocess(datasest_dir)
    data = {'id': icqa[0], 'context': icqa[1], 'question': icqa[2], 'answer': icqa[3]}

    # Build the dataset in a SQuAD 2.0-like format
    df = pd.DataFrame(data = data)
    # Fixed random_state keeps the split reproducible between runs
    train_df, valid_df = train_test_split(df, test_size = test_size, random_state = 1, shuffle = True)

    train_dict = Dataset.from_dict(train_df)
    valid_dict = Dataset.from_dict(valid_df)
    dataset_dict = DatasetDict({'train': train_dict, 'validation': valid_dict})
    return dataset_dict

In [None]:
# Excel sheet with the questions and answers, relative to the working directory.
dataset_dir = 'dataset/List_cau_hoi-Phan_tich_doanh_nghiep-Short.xlsx'
# Share of the examples held out for validation (passed on to train_test_split).
test_size = 0.05
# DatasetDict with a 'train' and a 'validation' split.
qa_data = create_datasetDict(dataset_dir, test_size)

In [None]:
# Display data: print every training example (id, context, question, answer)
# as one dict per line.
for entry in qa_data['train']:
  print(entry)

In [None]:
# Display non-matching (test fuzzy)
# Disabled diagnostic, kept as a bare string literal so it never executes.
# It was meant to list answers whose text differs from the context slice at
# the computed start index.
# NOTE(review): it reads `data` and entry['answers'], while the datasets built
# above are stored in `qa_data` with an 'answer' column — it would need
# updating before being re-enabled.
'''for entry in data:
  id = entry['answers']['answer_start'][0][1]
  answer = entry['answers']['text'][0].lower()
  if not id:
    print("ERROR " + answer)
  extract = entry['context'][id:(id+len(answer))].lower()
  if extract != answer:
    print([extract, answer])'''

# **Extract Features**

In [None]:
from transformers import XLMRobertaTokenizerFast

# Maximum number of tokens per tokenized feature (question + context window).
max_length = 256
# Number of tokens shared by consecutive windows when a context is split.
stride = 64
# Hub checkpoint the tokenizer is loaded from.
model_checkpoint = 'bhavikardeshna/xlm-roberta-base-vietnamese'
# Fast tokenizer: the processing functions below rely on its offset mapping.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_checkpoint)

def train_processing(dataset):
    """Tokenize a batch of examples and compute answer token labels.

    Questions and contexts are tokenized as pairs. Only the context is
    truncated; overflowing tokens are returned as extra features that overlap
    by ``stride`` tokens, so one example can yield several features. Every
    feature is padded to ``max_length``.

    Each feature is labelled with the token indices of the answer's start and
    end. The label is ``(0, 0)`` when the answer span is not fully inside the
    feature's context window, or when the example has no usable answer
    position (``answer_start`` is ``None``).

    Args:
        dataset: Batch mapping with the columns 'question', 'context' and
            'answer' (each answer holding 'answer_start' and 'text' lists).

    Returns:
        The tokenizer output without 'offset_mapping' and
        'overflow_to_sample_mapping', extended with 'start_positions' and
        'end_positions'.
    """
    questions = [q.strip() for q in dataset['question']]
    inputs = tokenizer(questions, dataset['context'],
                       max_length = max_length,
                       truncation = 'only_second',
                       stride = stride,
                       return_overflowing_tokens = True,
                       return_offsets_mapping = True,
                       padding = 'max_length')
    # (start char, end char) of every token, per feature
    offset_mapping = inputs.pop('offset_mapping')
    # Index of the source example of every feature
    sample_map = inputs.pop('overflow_to_sample_mapping')
    answers = dataset['answer']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        start_char = answer['answer_start'][0]

        # The fuzzy matcher yields None when it cannot locate the answer in
        # the context. Adding to None would raise TypeError, so label the
        # feature (0, 0) exactly like a span outside the window.
        if start_char is None:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Index of the first context token
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        # Index of the last context token
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Create label
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # Answer is not fully contained in this window
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token starting at or before the answer's first char
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token ending at or after the answer's last char
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions

    return inputs

def valid_processing(dataset):
    """Tokenize a batch of validation examples for answer post-processing.

    Tokenization uses the same settings as the training features. No labels
    are produced. Instead every feature keeps the id of the example it came
    from, and its offset mapping is blanked (set to ``None``) for every token
    that does not belong to the context.

    Args:
        dataset: Batch mapping with the columns 'id', 'question' and
            'context'.

    Returns:
        The tokenizer output without 'overflow_to_sample_mapping', with a
        masked 'offset_mapping' and an added 'example_id' list.
    """
    stripped_questions = [question.strip() for question in dataset['question']]
    inputs = tokenizer(stripped_questions, dataset['context'],
                       padding = 'max_length',
                       max_length = max_length,
                       truncation = 'only_second',
                       stride = stride,
                       return_offsets_mapping = True,
                       return_overflowing_tokens = True)

    sample_map = inputs.pop('overflow_to_sample_mapping')
    feature_count = len(inputs['input_ids'])
    example_ids = []

    for feature_idx in range(feature_count):
        # Remember which example this feature was cut from
        example_ids.append(dataset['id'][sample_map[feature_idx]])

        # Keep offsets for context tokens only
        seq_ids = inputs.sequence_ids(feature_idx)
        masked_offsets = []
        for token_idx, span in enumerate(inputs['offset_mapping'][feature_idx]):
            if seq_ids[token_idx] == 1:
                masked_offsets.append(span)
            else:
                masked_offsets.append(None)
        inputs['offset_mapping'][feature_idx] = masked_offsets

    inputs['example_id'] = example_ids
    return inputs

In [None]:
# Tokenize both splits batch-wise. The original columns are removed because
# one example may produce several features, so the row counts no longer match.
train_dataset = qa_data['train'].map(train_processing,
                                     batched = True,
                                     remove_columns = qa_data['train'].column_names)
valid_dataset = qa_data['validation'].map(valid_processing,
                                          batched = True,
                                          remove_columns = qa_data['validation'].column_names)

In [None]:
# Show the columns and number of features of the tokenized training split.
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 131
})

In [None]:
# Show the columns and number of features of the tokenized validation split.
valid_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 7
})

# **Training**

In [None]:
import transformers
from transformers import XLMRobertaForQuestionAnswering
from transformers import XLMRobertaConfig, XLMRobertaModel


# Hub checkpoint to fine-tune
model_checkpoint = 'bhavikardeshna/xlm-roberta-base-vietnamese'
# Configuration read from the checkpoint itself, so that the architecture
# reported later is the one the model really uses. No separate base model is
# loaded just to obtain it.
config = XLMRobertaConfig.from_pretrained(model_checkpoint)

# Hyper-parameters
batch_size = 8
learning_rate = 2e-5
epochs = 5
max_norm = 1.0  # gradient clipping threshold
# Output directory handed to TrainingArguments
finetuned_model_dir = 'model/xlm-roberta-base-vn-dplat'

# from_pretrained is a classmethod: call it on the class rather than on a
# throw-away, randomly initialised instance.
QA_model = XLMRobertaForQuestionAnswering.from_pretrained(model_checkpoint, config = config)

# Fine-Tune with TrainingArguments and Trainer
args = transformers.TrainingArguments(finetuned_model_dir,
                                      # No evaluation during training
                                      evaluation_strategy = 'no',
                                      do_train = True,
                                      per_device_train_batch_size = batch_size,
                                      per_device_eval_batch_size = batch_size,
                                      learning_rate = learning_rate,
                                      weight_decay = 0.01,
                                      adam_beta1 = 0.9,
                                      adam_beta2 = 0.999,
                                      adam_epsilon = 1e-8,
                                      max_grad_norm = max_norm,
                                      # fp16 = True, # on cuda
                                      num_train_epochs = epochs,
                                      # Log the training loss once per epoch
                                      logging_strategy = 'epoch')

trainer = transformers.Trainer(model = QA_model,
                               args = args,
                               train_dataset = train_dataset,
                               eval_dataset = valid_dataset,
                               tokenizer = tokenizer)

# Train
trainer.train()

Some weights of XLMRobertaModel were not initialized from the model checkpoint at bhavikardeshna/xlm-roberta-base-vietnamese and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
17,2.1695
34,1.4526
51,0.9427
68,0.7458
85,0.6261


TrainOutput(global_step=85, training_loss=1.1873456842759076, metrics={'train_runtime': 2277.3866, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.037, 'total_flos': 85574687831040.0, 'train_loss': 1.1873456842759076, 'epoch': 5.0})

In [None]:
# PARAMETERS
# Summary of the data, model and hyper-parameters used for this run.
print(f'DATASET PATH: {dataset_dir}')
print(f'DATASETDICT: {qa_data}')
print()
print(f'MODEL USED: {model_checkpoint}')
print(f'MODEL ARCHITECTURE: {config}')
print(f'FINE TUNED PATH: {finetuned_model_dir}')
print()
print(f'TEST SIZE: {test_size}')
print(f'MAX LENGTH: {max_length}')
print(f'STRIDE: {stride}')
print(f'BATCH SIZE: {batch_size}')
print(f'LEARNING RATE: {learning_rate}')
print(f'EPOCHS: {epochs}')
# Interpolate the value; the braces used to sit outside the f-string, which
# printed a one-element set ("{1.0}") instead of the number.
print(f'MAX NORM: {max_norm}')

DATASET PATH: dataset/List_cau_hoi-Phan_tich_doanh_nghiep-Short.xlsx
DATASETDICT: DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answer'],
        num_rows: 99
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answer'],
        num_rows: 6
    })
})

MODEL USED: bhavikardeshna/xlm-roberta-base-vietnamese
MODEL ARCHITECTURE: XLMRobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250002
}

FINE TUNED PATH: model/x

# **Upload model**

In [None]:
# Client library used below to log in to the Hugging Face Hub.
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Interactive login widget; the token is typed in at run time rather than
# stored in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository so
# the pair can be loaded together.
QA_model.push_to_hub('xlm-roberta-base-vn-dplat')
tokenizer.push_to_hub('xlm-roberta-base-vn-dplat')

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dantern/xlm-roberta-base-vn-dplat/commit/cbb84cb5d465a75dbcf17f45be4abda3fb6cbb06', commit_message='Upload tokenizer', commit_description='', oid='cbb84cb5d465a75dbcf17f45be4abda3fb6cbb06', pr_url=None, pr_revision=None, pr_num=None)

# **Inference**

In [None]:
from transformers import pipeline

# Wrap the in-memory fine-tuned model and tokenizer in a QA pipeline.
question_answerer = pipeline('question-answering', model = QA_model, tokenizer = tokenizer)

# Example: ask about one section of the NT2 report.
# NOTE(review): the training contexts built by preprocess() are prefixed with
# the section name, while the raw section text is passed here — confirm this
# difference is intended.
question = 'Tốc độ tăng trưởng doanh thu Q4/2022 của NT2 là bao nhiêu?'
context = knowledge['NT2 – Cổ phiếu phòng thủ với tăng trưởng ổn định']['KQKD 2022: Trích lập dự phòng liên tục ảnh hưởng đến lợi nhuận']
question_answerer(question=question, context=context)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


{'score': 0.06998337060213089,
 'start': 110,
 'end': 142,
 'answer': ' tăng trưởng sản lượng 20% svck.'}

In [None]:
# Second example: a question about one section of the BVH report, answered
# with the pipeline created in the previous cell.
question = 'Việt Nam kết thúc giãn cách xã hội ảnh hưởng thế nào đến BVH?'
context = knowledge['BVH – Triển vọng 2023 vẫn rất tích cực']['Mảng BH phi nhân thọ năm 2022: biên lợi nhuận tốt hơn dự kiến']
question_answerer(question=question, context=context)

{'score': 0.06270157545804977,
 'start': 393,
 'end': 409,
 'answer': ' kết quả đáng nể'}