## Preprocessing WikiWeb2M

In [3]:
import numpy as np
import glob
import tensorflow.compat.v1 as tf
from collections import defaultdict

class DataParser():
  """Reads WikiWeb2M TFRecord shards and groups the parsed examples by split.

  Attributes:
    path: Directory that contains the shard files.
    filepath: Glob pattern for the shard file names, without the
      '.tfrecord*' suffix that `parse_data` appends.
    data: Maps a split name (the decoded 'split' context feature) to the list
      of parsed examples that carry that split.
  """

  def __init__(self,
               path: str,
               filepath: str = 'wikiweb2m-*'):
    self.filepath = filepath
    self.path = path
    self.data = defaultdict(list)

  def parse_data(self) -> None:
    """Parses every matching shard and appends the examples to `self.data`.

    Each stored example is whatever `tf.io.parse_single_sequence_example`
    yields for one record; the rest of the notebook indexes it as
    `example[0]` (context features) and `example[1]` (sequence features).

    Examples are appended, never replaced, so calling this method twice on the
    same instance stores every example twice.

    Raises:
      FileNotFoundError: If no file matches the shard pattern.
    """
    import os  # local import so this block does not depend on other cells

    suffix = '.tfrecord*'

    # Join with os.path.join so `path` works with or without a trailing
    # separator, and sort because glob returns matches in arbitrary order --
    # downstream document IDs are positional and must be reproducible.
    pattern = os.path.join(self.path, self.filepath + suffix)
    data_path = sorted(glob.glob(pattern))
    if not data_path:
      raise FileNotFoundError(f'No TFRecord shards match {pattern!r}')

    # Page-level features: exactly one scalar value per example.
    context_feature_description = {
        'split': tf.io.FixedLenFeature([], dtype=tf.string),
        'page_title': tf.io.FixedLenFeature([], dtype=tf.string),
        'page_url': tf.io.FixedLenFeature([], dtype=tf.string),
        'clean_page_description': tf.io.FixedLenFeature([], dtype=tf.string),
        'raw_page_description': tf.io.FixedLenFeature([], dtype=tf.string),
        'is_page_description_sample': tf.io.FixedLenFeature([], dtype=tf.int64),
        'page_contains_images': tf.io.FixedLenFeature([], dtype=tf.int64),
        'page_content_sections_without_table_list': tf.io.FixedLenFeature([] , dtype=tf.int64)
    }

    # Section-level features: variable length, parsed as sparse tensors.
    sequence_feature_description = {
        'is_section_summarization_sample': tf.io.VarLenFeature(dtype=tf.int64),
        'section_title': tf.io.VarLenFeature(dtype=tf.string),
        'section_index': tf.io.VarLenFeature(dtype=tf.int64),
        'section_depth': tf.io.VarLenFeature(dtype=tf.int64),
        'section_heading_level': tf.io.VarLenFeature(dtype=tf.int64),
        'section_subsection_index': tf.io.VarLenFeature(dtype=tf.int64),
        'section_parent_index': tf.io.VarLenFeature(dtype=tf.int64),
        'section_text': tf.io.VarLenFeature(dtype=tf.string),
        'section_clean_1st_sentence': tf.io.VarLenFeature(dtype=tf.string),
        'section_raw_1st_sentence': tf.io.VarLenFeature(dtype=tf.string),
        'section_rest_sentence': tf.io.VarLenFeature(dtype=tf.string),
        'is_image_caption_sample': tf.io.VarLenFeature(dtype=tf.int64),
        'section_image_url': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_mime_type': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_width': tf.io.VarLenFeature(dtype=tf.int64),
        'section_image_height': tf.io.VarLenFeature(dtype=tf.int64),
        'section_image_in_wit': tf.io.VarLenFeature(dtype=tf.int64),
        'section_contains_table_or_list': tf.io.VarLenFeature(dtype=tf.int64),
        'section_image_captions': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_alt_text': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_raw_attr_desc': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_clean_attr_desc': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_raw_ref_desc': tf.io.VarLenFeature(dtype=tf.string),
        'section_image_clean_ref_desc': tf.io.VarLenFeature(dtype=tf.string),
        'section_contains_images': tf.io.VarLenFeature(dtype=tf.int64)
    }

    def _parse_function(example_proto):
      # Decode one serialized SequenceExample using the two schemas above.
      return tf.io.parse_single_sequence_example(example_proto,
                                                 context_feature_description,
                                                 sequence_feature_description)

    raw_dataset = tf.data.TFRecordDataset(data_path, compression_type='GZIP')
    parsed_dataset = raw_dataset.map(_parse_function)

    # NOTE(review): iterating the dataset directly and calling .numpy()
    # presumably relies on eager execution being enabled -- confirm if
    # tf.disable_v2_behavior() is ever called in this environment.
    for d in parsed_dataset:
      split = d[0]['split'].numpy().decode()
      self.data[split].append(d)


In [4]:
# Parse every shard under data/wikiweb2m/ and report how many examples
# landed in the 'test' split.
parser = DataParser(path='data/wikiweb2m/')
parser.parse_data()
print(len(parser.data['test']))

2023-11-27 21:52:34.295684: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


100833


In [20]:
# Element 0 of a parsed example holds its context (page-level) features;
# display them for the first example of the 'test' split.
parser.data['test'][0][0]

{'clean_page_description': <tf.Tensor: shape=(), dtype=string, numpy=b'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.'>,
 'is_page_description_sample': <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 'page_contains_images': <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 'page_content_sections_without_table_list': <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 'page_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Christopher Razis'>,
 'page_url': <tf.Tensor: shape=(), dtype=string, numpy=b'http://en.wikipedia.org/wiki/Christopher_Razis'>,
 'raw_page_description': <tf.Tensor: shape=(), dtype=string, numpy=b'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.'>,
 'split': <tf.Tensor: shape=(), dtype=string, numpy=b'test'>}

In [24]:
# Print every feature of the first test example: first the context
# (page-level) dict at index 0, then the sequence (section-level) dict at
# index 1.
first_example = parser.data['test'][0]
for features in (first_example[0], first_example[1]):
    for key, value in features.items():
        print(f"Key: {key}")
        print(f"Value: {value}")
        print("\n")


Key: clean_page_description
Value: b'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.'


Key: is_page_description_sample
Value: 1


Key: page_contains_images
Value: 1


Key: page_content_sections_without_table_list
Value: 2


Key: page_title
Value: b'Christopher Razis'


Key: page_url
Value: b'http://en.wikipedia.org/wiki/Christopher_Razis'


Key: raw_page_description
Value: b'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.'


Key: split
Value: b'test'


Key: is_image_caption_sample
Value: SparseTensor(indices=tf.Tensor([[0 0]], shape=(1, 2), dtype=int64), values=tf.Tensor([1], shape=(1,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))


Key: is_section_summarization_sample
Value: SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]
 [2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([0 1 0], sh

In [52]:
from tqdm import tqdm
def process_data_tf(parsed_dataset):
    """Turn parsed WikiWeb2M examples into retrieval documents and queries.

    Args:
        parsed_dataset: Iterable of parsed examples. Each example is indexed
            as ``example[0]`` (dict of context tensors) and ``example[1]``
            (dict of sequence tensors exposing ``.values``). The context must
            contain ``'split'`` and ``'page_title'``; the sequence must
            contain ``'section_image_url'``, ``'section_text'`` and
            ``'section_image_captions'``.

    Returns:
        A ``(documents, queries)`` tuple.

        ``documents`` maps ``"<split>_<i>"`` to a dict holding every context
        feature converted to a native Python value (``str`` for byte strings,
        ``int``/``float`` for NumPy scalars, so the result can be serialized
        with ``json``), plus ``'text'`` (all section texts joined by a single
        space), ``'image_urls'`` and ``'image_captions'``.

        ``queries`` is a list with one ``{'query': <page title>,
        'ground_truth_doc_id': <doc id>}`` entry per example, in input order.

        ``i`` is the position of the example inside ``parsed_dataset``, so IDs
        are only unique within one call: two different slices of the same
        split both start at ``<split>_0``.
    """

    def _to_python(tensor):
        # Scalar string tensors come back as bytes; numeric scalars come back
        # as NumPy scalar types, which json cannot serialize.
        value = tensor.numpy()
        if isinstance(value, bytes):
            return value.decode()
        if isinstance(value, np.generic):
            return value.item()
        return value

    def _decode_values(sparse_tensor):
        # Flatten a sparse string feature into a list of Python strings.
        return [raw.decode() for raw in sparse_tensor.values.numpy()]

    documents = {}
    queries = []
    # tqdm works out the total itself for sized inputs, so unsized iterables
    # are accepted as well.
    for i, example in enumerate(tqdm(parsed_dataset)):
        context = example[0]
        sequence = example[1]

        document = {name: _to_python(tensor) for name, tensor in context.items()}
        document.update({
            'text': ' '.join(_decode_values(sequence['section_text'])),
            'image_urls': _decode_values(sequence['section_image_url']),
            'image_captions': _decode_values(sequence['section_image_captions']),
        })

        doc_id = f"{document['split']}_{i}"
        documents[doc_id] = document
        queries.append({
            'query': document['page_title'],
            'ground_truth_doc_id': doc_id,
        })
    return documents, queries

In [53]:
# Build documents and queries from the first 10 examples of the 'test' split.
# Document IDs are positional within this slice.
documents, queries = process_data_tf(parser.data['test'][:10])

100%|██████████| 10/10 [00:00<00:00, 9727.05it/s]
100%|██████████| 10/10 [00:00<00:00, 9727.05it/s]


In [51]:
# Show the first generated document followed by its matching query.
print(documents['test_0'], queries[0], sep='\n')

{'clean_page_description': 'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.', 'is_page_description_sample': 1, 'page_contains_images': 1, 'page_content_sections_without_table_list': 2, 'page_title': 'Christopher Razis', 'page_url': 'http://en.wikipedia.org/wiki/Christopher_Razis', 'raw_page_description': 'Christopher Razis is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m tall combo guard.', 'split': 'test', 'text': 'Christopher Razis (alternate spelling: Christophoros) (Greek: Χριστόφορος Ράζης; born 7 July 1989) is a Cypriot/Greek professional basketball player for Keravnos of the Cypriot League. He is a 1.94 m (6\xa0ft 4.25 in) tall combo guard. Razis played youth basketball with Keravnos at Cyprus where he started his pro career in 2005. he stayed at the club until 2010, when he was awarded the Rookie of the Year in the Cypriot League. The foll

## BM25 - Get top 30 documents for each query

In [1]:
import json

# Documents: dict keyed by document ID (the BM25 cell reads each value's 'text').
with open('data/wikiweb2m/test_documents.json') as f:
    documents = json.loads(f.read())

# Queries: list of dicts (the BM25 cell reads each entry's 'query').
with open('data/wikiweb2m/test_queries.json') as f:
    queries = json.loads(f.read())

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi

# Get the text of the documents
doc_texts = [doc['text'] for doc in documents.values()]

# Tokenize the documents (plain single-space split, no lowercasing)
tokenized_docs = [doc.split(" ") for doc in doc_texts]

# Create a BM25 object and initialize it with the tokenized documents
bm25 = BM25Okapi(tokenized_docs)

# For each query, get the top N documents
# NOTE(review): the section heading says "top 30" but N is 10 -- confirm which
# value is intended.
N = 10
for query in queries:
    query_text = query['query']
    tokenized_query = query_text.split(" ")
    # Score the corpus once and rank from those scores. Calling
    # bm25.get_top_n() as well would score the whole corpus a second time for
    # every query; the argsort below is the same ranking rule it applies.
    scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(scores)[::-1][:N]
    top_n = [doc_texts[i] for i in top_indices]
    print(f"Top {N} documents for query '{query_text}': {top_n}")