<a href="https://colab.research.google.com/github/duonghiepit/Review-Text-Retrival/blob/main/Review_Text_Retrival.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.Download MS MARCO Dataset

In [None]:
!pip install datasets==2.13.1

In [None]:
from datasets import load_dataset

dataset = load_dataset('ms_marco', 'v1.1')

In [30]:
subset = dataset['test']

In [31]:
queries_infos = []
queries = []
corpus = []

for sample in subset:
  query_type = sample['query_type']
  if query_type != 'entity':
    continue
  query_id = sample['query_id']
  query_str = sample['query']
  passages_dict = sample['passages']
  is_selected_lst = passages_dict['is_selected']
  passage_text_lst = passages_dict['passage_text']
  query_info = {
      'query_id': query_id,
      'query': query_str,
      'relevant_docs': []
  }
  current_len_corpus = len(corpus)
  for idx in range(len(is_selected_lst)):
    if is_selected_lst[idx] == 1:
      doc_idx = current_len_corpus + idx
      query_info['relevant_docs'].append(doc_idx)

  if query_info['relevant_docs'] == []:
    continue

  queries.append(query_str)
  queries_infos.append(query_info)
  corpus += passage_text_lst

# 2.Text Normalization

In [32]:
def tokenize(text):
  return text.split()

In [33]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
remove_chars = string.punctuation
stemmer = PorterStemmer()

print(english_stopwords)
print(remove_chars)
print(stemmer)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
def text_normalize(text):
  text = text.lower()
  for char in remove_chars:
    text = text.replace(char, '')
  text = ' '.join([word for word in tokenize(text) if word not in english_stopwords])
  text = ' '.join([stemmer.stem(word) for word in tokenize(text)])

  return text

In [35]:
test_text = 'Viet Nam is so beautiful. Girl in Viet Nam is so hot'
text_normalize(test_text)

'viet nam beauti girl viet nam hot'

# 3.Create Dictionary

In [36]:
def create_dictionary(corpus):
  dictionary = []
  for doc in corpus:
    normalized_doc = text_normalize(doc)
    tokens = tokenize(normalized_doc)
    for token in tokens:
      if token not in dictionary:
        dictionary.append(token)

  return dictionary

In [37]:
%%time
dictionary = create_dictionary(corpus)

CPU times: user 28.2 s, sys: 75.4 ms, total: 28.3 s
Wall time: 28.8 s


In [38]:
corpus[:2]

['SUBPHYLUM CHELICERATA, CLASS ARACHNIDA. Spiders. This group contains many familiar organisms, including the spiders, mites, scorpions and ticks. Examine the large spider on the right. Again, notice that there are two body regions, a cephalothorax and an abdomen. On the cephalothorax are two to four pairs of simple eyes.',
 'The class Arachnida includes a diverse group of arthropods: spiders, scorpions, ticks, mites, harvestmen, and their cousins. Scientists describe over 75,000 species of arachnids, the majority of them spiders. Most arachnids are carnivorous, typically preying on insects, and terrestrial, living on land.']

# 4. Create Doc-Term Matrix

In [39]:
def vectorize(text, dictionary):
  word_count_dict = {word: 0 for word in dictionary}
  normalized_text = text_normalize(text)
  tokens = tokenize(normalized_text)
  for token in tokens:
    try:
      word_count_dict[token] += 1
    except:
      pass

  vector = list(word_count_dict.values())

  return vector

In [40]:
def create_doc_term_matrix(corpus, dictionary):
  doc_term_matrix = {}
  for idx, doc in enumerate(corpus):
    vector = vectorize(doc, dictionary)
    doc_term_matrix[(doc, idx)] = vector

  return doc_term_matrix

In [42]:
doc_term_matrix = {}
for idx, doc in enumerate(corpus[:1]):
    vector = vectorize(doc, dictionary)
    doc_term_matrix[(doc, idx)] = vector
doc_term_matrix

{('SUBPHYLUM CHELICERATA, CLASS ARACHNIDA. Spiders. This group contains many familiar organisms, including the spiders, mites, scorpions and ticks. Examine the large spider on the right. Again, notice that there are two body regions, a cephalothorax and an abdomen. On the cephalothorax are two to four pairs of simple eyes.',
  0): [1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [43]:
%%time
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)

CPU times: user 32.6 s, sys: 1.65 s, total: 34.3 s
Wall time: 39.7 s


# 5.Ranking

In [44]:
from scipy import spatial

def similarity(a, b):
  return 1 - spatial.distance.cosine(a, b)

In [45]:
def ranking(query, dictionary, doc_term_matrix):
  query_vec = vectorize(query, dictionary)
  scores = []
  for doc_info, doc_vec in doc_term_matrix.items():
    sim = similarity(query_vec, doc_vec)
    scores.append((sim, doc_info))
  scores.sort(reverse = True)

  return scores

In [52]:
query_lst = ['what is the official language in Fiji']
top_k = 10

for query in query_lst:
  scores = ranking(query, dictionary, doc_term_matrix)
  print(f'Query: {query}')
  print('=== Relevant docs ===')
  for idx in range(top_k):
    doc_score = scores[idx][0]
    doc_content = scores[idx][1][0]

    print(f'Top {idx + 1}; Score: {doc_score:.4f}')
    print(doc_content)
    print('\n')

Query: what is the official language in Fiji
=== Relevant docs ===
Top 1; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 2; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 3; Score: 0.5715
The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too diff