In [None]:
# Colab-only setup: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder on Drive the working directory; later cells use
# paths relative to it (e.g. 'knowledge/...').
%cd /content/drive/MyDrive/customized-knowledge-qa
# Install third-party dependencies (docx, faiss and transformers are imported
# by later cells).
!pip install python-docx faiss-cpu transformers accelerate

# **Prepare data**

Split doc

In [4]:
from docx import Document

doc_dir = 'knowledge/Template_phan_tich_doanh_nghiep.docx'
doc = Document(doc_dir)

# Parallel lists: titles[k] is the title of the report whose text is articles[k].
titles = []
articles = []

# State of the report currently being assembled (None while between reports).
current_title = None
current_article = None

for paragraph in doc.paragraphs:
    # Retrieve the text without formatting
    text = paragraph.text.strip()

    if not text:
        # An empty paragraph closes the report in progress (if any) and resets state.
        if current_title is not None:
            titles.append(current_title)
            articles.append(current_article)
        current_title = None
        current_article = None
    elif current_title is None:
        # The first non-empty paragraph after a break is the report's title.
        current_title = text
        current_article = '===' + current_title.upper() + '===\n'
    elif paragraph.runs and paragraph.runs[0].bold:
        # A paragraph whose first run is bold is treated as a section subtitle.
        # The `paragraph.runs` check avoids an IndexError on paragraphs that
        # have text but no runs.
        current_article += '===' + text.upper() + '===\n'
    else:
        # Plain body text: append it to the report in progress.
        current_article += text + '\n'

# Store the final report too: without this it is lost whenever the document
# does not end with an empty paragraph.
if current_title is not None:
    titles.append(current_title)
    articles.append(current_article)

Chunk articles

In [5]:
print('Before splitting, {:,} articles.\n'.format(len(titles)))

# Maximum number of whitespace-separated words per passage.
CHUNK_SIZE = 100

# Parallel lists: chunked_corpus['text'][k] is a passage taken from the article
# titled chunked_corpus['title'][k].
chunked_corpus = {'title': [], 'text': []}

for title, article in zip(titles, articles):
    # Splitting on whitespace yields no words for an empty article, so such
    # articles contribute no passages.
    words = article.split()

    # Walk the article in windows of CHUNK_SIZE words. A dedicated name for the
    # window start avoids shadowing any outer loop variable.
    for start in range(0, len(words), CHUNK_SIZE):
        chunk = " ".join(words[start : start + CHUNK_SIZE])

        chunked_corpus['title'].append(title)
        chunked_corpus['text'].append(chunk)

print('After splitting, {:,} passages.\n'.format(len(chunked_corpus['title'])))

Before splitting, 19 articles.

After splitting, 92 passages.



# **Create DPR Embeddings**

Tokenization

In [6]:
from transformers import DPRContextEncoderTokenizerFast

# Tokenizer used to encode the passages (titles + text) for the context encoder.
ctx_tokenizer_name = 'facebook/dpr-ctx_encoder-multiset-base'
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_tokenizer_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [7]:
passage_titles = chunked_corpus['title']
passage_texts = chunked_corpus['text']

num_passages = len(passage_titles)
print('Tokenizing {:,} passages for DPR'.format(num_passages))

# Encode every (title, passage) pair in one call. Sequences are truncated where
# needed and padded to the longest one so the result is a single tensor.
outputs = ctx_tokenizer(
    passage_titles,
    passage_texts,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

# Token ids for the entire corpus, one row per passage.
input_ids = outputs['input_ids']

Tokenizing 92 passages for DPR


In [8]:
# Show the shape of the token-id tensor produced by the tokenization cell.
print(input_ids.shape)

torch.Size([92, 263])


Encoding (Generate Embeddings)

In [9]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, so that
# `device` is always defined for the later `.to(device)` calls.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
from transformers import DPRContextEncoder

# Load the pretrained DPR context (passage) encoder and move it to `device`.
ctx_encoder = DPRContextEncoder.from_pretrained(
    'facebook/dpr-ctx_encoder-multiset-base'
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float). It is rounded to the
            nearest whole second before formatting.

    Returns:
        The string form of the matching ``datetime.timedelta``, e.g.
        ``'0:01:05'`` for 65 seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [12]:
import time
import math

# Run model forward only, no need for gradients.
# NOTE: this switches autograd off globally, i.e. for every cell executed
# afterwards as well, not just for this loop.
torch.set_grad_enabled(False)

# Track elapsed time for progress updates
t0 = time.time()

# Track the current batch number, also for progress updates
step = 0

# How many passages to process per batch
batch_size = 16

# Get the number of passages in the dataset
num_passages = input_ids.size()[0]

# Calculate the number of batches in dataset
num_batches = math.ceil(num_passages / batch_size)

# Stores embedded passages from batches
embeds_batches = []

print('Generating embeddings for {:,} passages'.format(num_passages))

for i in range(0, num_passages, batch_size):
    # Progress update every 100 batches
    if step % 100 == 0 and not step == 0:
        # Elapsed time formatted as h:mm:ss
        elapsed = format_time(time.time() - t0)
        # Report progress
        print('     Batch {:>5,} of {:>5,}. Elapsed: {:}'.format(step, num_batches, elapsed))

    # Select the next batch. The slice width must be `batch_size` (the loop
    # stride); a hard-coded width would skip or duplicate passages as soon as
    # `batch_size` is changed.
    batch_ids = input_ids[i : i + batch_size, :]

    # Move the batch to the same device as the encoder
    batch_ids = batch_ids.to(device)

    # Run encoder
    outputs = ctx_encoder(
        batch_ids,
        return_dict=True
    )

    # Embeddings are in field pooler_output
    embeddings = outputs['pooler_output']

    # Bring embeddings back to the CPU and convert to numpy
    embeddings = embeddings.detach().cpu().numpy()

    embeds_batches.append(embeddings)

    step += 1

Generating embeddings for 92 passages


In [13]:
import numpy as np

# Join the per-batch arrays along the first (passage) axis into one matrix.
embeddings = np.concatenate(embeds_batches)

print('Size of dataset embeddings:', embeddings.shape)

Size of dataset embeddings: (92, 768)


# **FAISS Index**

In [14]:
import faiss

# Dimension of the vectors stored in the index. Taken from the embedding matrix
# itself so the index always matches the encoder's output width.
dim = embeddings.shape[1]

# Number of bi-directional links created for every new element during HNSW index construction
m = 128

# Faiss implementation of HNSW for fast approximate nearest neighbor search,
# scoring candidates by inner product.
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

In [15]:
print('Building FAISS index')

# Time the whole build (train + add) for the report below.
build_start = time.time()

index.train(embeddings)
index.add(embeddings)

build_seconds = time.time() - build_start
print('Adding embeddings to index took', format_time(build_seconds))

Building FAISS index
Adding embeddings to index took 0:00:00


Example search

In [16]:
from transformers import DPRQuestionEncoder

# Load the pretrained DPR question encoder and move it to `device`.
q_encoder = DPRQuestionEncoder.from_pretrained(
    'facebook/dpr-question_encoder-multiset-base'
).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
from transformers import DPRQuestionEncoderTokenizerFast

# Tokenizer used to encode questions for the question encoder.
q_tokenizer_name = 'facebook/dpr-question_encoder-multiset-base'
q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(q_tokenizer_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
# Tokenize question.
# NOTE: this rebinds `input_ids`, which the tokenization cell used for the
# corpus token ids; re-run that cell before re-running the embedding loop.
input_ids = q_tokenizer.encode("Nguyên nhân doanh thu Q4/22 của PLX tăng so với cùng kỳ?", return_tensors='pt')

# Move question to the same device as the question encoder
input_ids = input_ids.to(device)

# Run question through the encoder and generate the question embedding.
# Gradients are disabled locally so this cell does not depend on autograd
# having been switched off globally by an earlier cell.
with torch.no_grad():
    outputs = q_encoder(input_ids)

# Embedding is stored in pooler_output property
q_embed = outputs['pooler_output']

# FAISS index is on the CPU, so transfer the question embedding to the CPU and
# convert it to numpy; `.detach()` makes the conversion safe in any grad mode.
q_embed = q_embed.detach().cpu().numpy()

print('Query embedding:', q_embed.shape)

Query embedding: (1, 768)


In [19]:
# Query the index for the passages closest to the question embedding.
# D holds the scores and I the passage indices, one row per query.
num_neighbors = 3
D, I = index.search(q_embed, k=num_neighbors)

print('Closest matching indices:', I)
print('Inner products:', D)

Closest matching indices: [[41 65 70]]
Inner products: [[80.5873   80.3647   80.275856]]


In [24]:
import textwrap

# Wrap text to 80 characters
wrapper = textwrap.TextWrapper(width=80)

# I has one row per query; only one question was searched, so use row 0.
for i in I[0]:
    # FAISS fills missing results with index -1 when it finds fewer than k
    # neighbours. Skip those: a negative list index would silently print the
    # last passage instead.
    if i < 0:
        continue

    print('Index:', i)

    # Retrieve passage and its title
    title = chunked_corpus['title'][i]
    passage = chunked_corpus['text'][i]

    print('Article title:', title, '\n')
    print('Passage:', wrapper.fill(passage), '\n\n')

Index: 41
Article title: PVT – Lợi nhuận cốt lõi vững đà tăng trưởng 

Passage: gộp Q4/22 tăng 2,2 điểm % svck lên 18%. Tuy nhiên, LN ròng Q4/22 chỉ tăng nhẹ
3,8% svck lên 206,8 tỷ đồng do: (1) chi phí lãi vay tăng 94% svck và (2) chi phí
QLDN tăng 62% svck. Cả năm 2022, DT tăng 21,3% svck lên 9.047,5 tỷ đồng và LN
ròng tăng 30,6% svck lên 861,2 tỷ đồng nhờ: (1) giá cước vận tải tàu chở
dầu/nhiên liệu cao hơn, (2) đóng góp của các tàu mua mới (9 tàu các loại) và (3)
thu nhập bất thường từ thanh lý tàu chở dầu cũ. ===LỢI NHUẬN 


Index: 65
Article title: PVD – Giai đoạn khó khăn nhất đã qua 

Passage: mức cao nhất trong nhiều năm qua. Điều này làm giảm tác động tiêu cực của chi
phí QLDN (+104% svck lên 178 tỷ đồng) và chi phí tài chính ròng (+3,5 lần svck
lên 52 tỷ đồng) tăng cao. Kết quả, LN ròng Q4/22 tăng nhẹ 7,3% svck lên 53,9 tỷ
đồng, là quý có LN dương đầu tiên sau ba quý thua lỗ liên tiếp. Trong năm 2022,
PVD ghi nhận DT tăng 36% svck lên 5.432 tỷ đồng và lỗ ròng 98,6 tỷ đồng. ==