<a href="https://colab.research.google.com/github/dauvannam321/Text_Retrieval/blob/main/Project_Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup

## Import and read corpus

In [None]:
from google.colab import output

In [None]:
# Fetch the zipped news corpus from Google Drive and unpack it in the working directory.
# link: https://drive.google.com/file/d/1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl/view?usp=sharing
!gdown --id 1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl
!unzip news_corpus.zip
# Drop the download/unzip log from this cell's output (Colab helper imported above).
output.clear()

## Define text normalization functions

In [None]:
# Fetch the zipped Vietnamese stop-word list from Google Drive and extract vietnamese-stopwords.txt.
# Upstream list: https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt
!gdown --id 1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
!unzip vn_stopwords.zip

Downloading...
From: https://drive.google.com/uc?id=1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
To: /content/vn_stopwords.zip
100% 6.89k/6.89k [00:00<00:00, 12.3MB/s]
Archive:  vn_stopwords.zip
  inflating: vietnamese-stopwords.txt  


In [None]:
import os
import re
import string
from collections import Counter

import numpy as np
from tqdm import tqdm

def remove_punctuations(text: str) -> str:
  """Return `text` with every character of `string.punctuation` deleted.

  Only the ASCII punctuation set is removed; whitespace and all other
  characters are left exactly as they were.
  """
  # Mapping a code point to None tells str.translate() to drop that character.
  deletion_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(deletion_table)

# Load the stop-word list, one entry per line of the file.
# Iterating the file yields the raw lines, so each entry still carries its line terminator.
with open('vietnamese-stopwords.txt', mode='r', encoding='utf8') as stopword_file:
  vn_stopwords = list(stopword_file)
def remove_stopwords(text: str, stopwords=None) -> str:
  """Remove stop words (single- or multi-word) from `text`, matching whole tokens only.

  Stop-word entries are split on whitespace before matching, so entries that
  still carry a trailing newline (as produced by `readlines()`) match
  correctly. A stop word is removed only when it lines up with complete
  whitespace-delimited tokens; it is never cut out of the middle of a longer
  word. At each position the longest matching stop-word phrase wins.

  Args:
    text: input text.
    stopwords: iterable of stop-word strings. Defaults to the module-level
      `vn_stopwords` list.

  Returns:
    The remaining tokens joined by single spaces (all whitespace runs,
    including newlines, are collapsed as a side effect of tokenizing).
  """
  if stopwords is None:
    stopwords = vn_stopwords

  # Represent every stop word as a tuple of tokens; blank entries become () and are dropped.
  phrases = {tuple(entry.split()) for entry in stopwords}
  phrases.discard(())
  longest = max(map(len, phrases), default=0)

  tokens = text.split()
  kept = []
  i = 0
  while i < len(tokens):
    # Try the longest candidate phrase first so "bao giờ" beats "bao".
    for size in range(min(longest, len(tokens) - i), 0, -1):
      if tuple(tokens[i:i + size]) in phrases:
        i += size
        break
    else:
      # No stop-word phrase starts here: keep the token.
      kept.append(tokens[i])
      i += 1

  return ' '.join(kept)

def normalize_text(text: str) -> str:
  """Normalize `text` for indexing and querying.

  Steps, in order: lower-case, delete punctuation via
  `remove_punctuations`, then delete stop words via `remove_stopwords`.
  """
  lowered = text.lower()
  without_punctuation = remove_punctuations(lowered)
  return remove_stopwords(without_punctuation)

## Create vectorize function using bag-of-words on a provided vocab

In [None]:
def vectorize(text: str, vocab: list) -> np.ndarray:
  """Build a bag-of-words count vector for `text` over `vocab`.

  The text is normalized with `normalize_text`, split on whitespace, and each
  vocabulary entry is scored by the number of tokens equal to it. Counting
  whole tokens (rather than substrings) keeps short words from being counted
  inside longer ones and keeps an empty vocabulary entry at 0.

  Args:
    text: raw or already-normalized text.
    vocab: list of single-token vocabulary entries; position i of the result
      corresponds to `vocab[i]`.

  Returns:
    1-D array of length `len(vocab)` holding the token counts.
  """
  normalized_text = normalize_text(text)
  # One pass over the text; each vocabulary lookup is then O(1).
  token_counts = Counter(normalized_text.split())
  # strip() lets entries that kept stray surrounding whitespace still match their token.
  return np.array([token_counts[word.strip()] for word in vocab])

# 2. Building document-term matrix



## 2.1. Create vocab

In [None]:
# Read a sample of the corpus, collecting unique (title, article) documents
# in `doc_lists` and unique tokens, in first-seen order, in `vocab`.
doc_lists = []
vocab = []
dataset_root_path = 'news_corpus'
filenames = os.listdir(dataset_root_path)

# Only the first 1/20 of the directory listing is indexed.
sample_size = len(filenames) // 20

# Sets mirror the two lists so membership checks stay O(1) instead of scanning the lists.
seen_docs = set()
seen_tokens = set()

for filename in tqdm(filenames[:sample_size]):
  filepath = os.path.join(dataset_root_path, filename)
  with open(filepath, 'r', encoding='utf8') as f:
    lines = f.readlines()
  if not lines:
    # Empty file: there is no title line to read.
    continue

  # First line is the title; the rest is the article body.
  title = lines[0].strip()
  article = normalize_text(' '.join(lines[1:]).strip())

  doc = (title, article)
  if doc not in seen_docs:
    seen_docs.add(doc)
    doc_lists.append(doc)

  # split() with no argument splits on any whitespace and never yields empty tokens.
  for token in article.split():
    if token not in seen_tokens:
      seen_tokens.add(token)
      vocab.append(token)

100%|██████████| 9226/9226 [02:54<00:00, 52.80it/s]


## 2.2. Create document-term matrix

In [None]:
# Map every (title, article) document to its bag-of-words vector over `vocab`.
term_document_matrix = dict()
for doc_key in tqdm(doc_lists):
  _, body = doc_key
  term_document_matrix[doc_key] = vectorize(body, vocab)

100%|██████████| 9224/9224 [16:52<00:00,  9.11it/s]


# 3. Ranking

## Create similarity measurement function (L1/L2 norm, cosine similarity)

In [None]:
def distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
  """Cosine similarity between vectors `a` and `b`.

  Despite the name, a larger value means the vectors are more alike:
  the result is dot(a, b) / (||a|| * ||b||).

  Args:
    a: first vector.
    b: second vector, same length as `a`.

  Returns:
    The cosine similarity, or 0.0 when either vector has zero norm (for
    example a query containing no vocabulary word) so callers never receive
    NaN from a division by zero.
  """
  numerator = np.dot(a, b)
  denominator = np.linalg.norm(a) * np.linalg.norm(b)

  if denominator == 0:
    # An all-zero vector has no direction; treat it as matching nothing.
    return 0.0

  return numerator / denominator

## Create ranking function that will calculate similarity between query and each document then sort the results

In [None]:
def ranking(query: str, term_document_matrix: dict, print_top_10: bool = True) -> list:
  """Score every document against `query` and return the results best-first.

  Args:
    query: free-text search string.
    term_document_matrix: mapping from a document key (whose first element
      is reported as the title) to that document's vector.
    print_top_10: when equal to True, print the ten highest-scoring entries.

  Returns:
    A list of `[score, title]` pairs sorted in descending order.

  Note:
    The query vector is built over the module-level `vocab`.
  """
  query_vec = vectorize(normalize_text(query), vocab=vocab)

  # Build and order the [score, title] pairs in one pass over the matrix.
  rankings = sorted(
    ([distance(query_vec, doc_vec), doc_key[0]]
     for doc_key, doc_vec in tqdm(term_document_matrix.items())),
    reverse=True,
  )

  # Blank lines separate the progress bar from whatever is printed next.
  print("\n")
  if print_top_10 == True:
    for entry in rankings[:10]:
      print(entry)

  return rankings

In [None]:
# Example search: rank the indexed documents against a sample query and print the ten best matches.
query = "điểm thi đại học"
rankings = ranking(query=query, term_document_matrix=term_document_matrix, print_top_10=True)

100%|██████████| 9224/9224 [00:04<00:00, 1960.08it/s]




[0.9736109723711444, "Tổ hợp KHXH: Lịch sử 'thăng hạng' với gần 1.800 bài thi đạt điểm 10"]
[0.970495805386389, 'Công bố điểm thi tốt nghiệp THPT 2022 chính thức']
[0.9703662987772426, 'Hà Nội, TPHCM dẫn đầu cả nước về số điểm 10 môn Toán thi tốt nghiệp 2022 | Báo Dân trí']
[0.9703002725671075, 'Điểm sàn Đại học Quốc gia Hà Nội 2022 bao nhiêu?']
[0.9701758322120065, 'Thí sinh mắc Covid-19 từ chối đặc cách | Báo Dân trí']
[0.9698769246600187, 'Đạt 28.7 điểm, thí sinh Hải Phòng, Hà Tĩnh là đồng thủ khoa khối D1 | Báo Dân trí']
[0.9693740750397898, 'Thi tốt nghiệp THPT 2022: Điểm học bạ cao hơn điểm thi']
[0.9692245104202254, 'Thi tốt nghiệp THPT 2022: Cách tính điểm có gì đặc biệt?']
[0.9691823487923342, 'Nam sinh Bắc Giang là thủ khoa kỳ thi đánh giá tư duy Đại học Bách khoa Hà Nội']
[0.9689996694430342, 'Xuất hiện điểm tuyệt đối tại kỳ thi tuyển sinh lớp 10 ở TP.HCM']
