## load original documents & HP-related index

In [1]:
def read_file_as_documents(file_path):
    """Load a UTF-8 text file and return its non-blank lines as a list.

    The whole file is read at once and cut at every newline character.
    Pieces that are empty or contain only whitespace are dropped; the
    remaining pieces are returned unmodified (not stripped), in file order.
    """
    with open(file_path, encoding='utf-8') as handle:
        raw_text = handle.read()
    # A piece survives only if something is left after stripping whitespace.
    return [piece for piece in raw_text.split('\n') if piece.strip()]

# Corpus file: obtained from the Hugging Face WikiText-103 dataset.
file_path = 'huggingface_wikitext_wikitext-103-v1_train_en_en_document.txt'
# Each non-blank line of the file becomes one entry of `documents`.
documents = read_file_as_documents(file_path)
# Bare expression: the notebook shows the number of entries as the cell output.
len(documents)

1165029

In [2]:

import json

file_path = 'HP_related_index_for_wikitext-103.json'

# Parse the JSON index file. The encoding is given explicitly so decoding does
# not depend on the platform's locale default, matching the other open() calls
# in this notebook.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The parsed object is used as a dict; the "HP-related-index" entry is later
# used as a list of positions into `documents`. Print how many there are.
print(len(data["HP-related-index"]))


4358


## save the related/unrelated texts

In [3]:
def save_lines_to_file(document, filename):
    """Write each item of ``document`` to ``filename``, one item per line.

    The file is opened for writing as UTF-8, replacing any existing content.
    Every item is formatted with an f-string and followed by a newline, so
    non-string items are written using their string form.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(f"{entry}\n" for entry in document)

def get_subset_by_index(data_list, index_list):
    """Return the items of ``data_list`` at the positions in ``index_list``.

    Items are returned in the order the indices are given; repeated indices
    yield repeated items.

    Args:
        data_list: Sequence to select from (must support ``len`` and indexing).
        index_list: Iterable of non-negative integer positions. One-shot
            iterators and generators are accepted.

    Returns:
        A new list with one item per index.

    Raises:
        IndexError: If any index is negative or not smaller than
            ``len(data_list)``. Negative (from-the-end) indices are rejected
            on purpose rather than wrapped around.
    """
    # Materialize once: the indices are walked twice (validation, then
    # selection), and a one-shot iterator would be empty on the second pass.
    indices = list(index_list)
    size = len(data_list)

    if any(index < 0 or index >= size for index in indices):
        raise IndexError("Invalid index provided. Indices must be within the range of the list.")

    return [data_list[i] for i in indices]

def get_difference(list_a, list_b):
    """Return the distinct items of ``list_a`` that do not occur in ``list_b``.

    The result contains the same elements as ``set(list_a) - set(list_b)``,
    but in the order they first appear in ``list_a`` instead of in
    unspecified set-iteration order. This keeps the output deterministic.

    Args:
        list_a: Iterable of hashable items to keep from.
        list_b: Iterable of hashable items to exclude.

    Returns:
        A list of unique items, ordered by first occurrence in ``list_a``.
    """
    # Start from the excluded items and add each kept item as it is emitted,
    # so one set handles both exclusion and de-duplication.
    skip = set(list_b)
    kept = []
    for item in list_a:
        if item not in skip:
            skip.add(item)
            kept.append(item)
    return kept

In [4]:
# Select the documents whose positions are listed under "HP-related-index"
# and write them to their own file, one document per line.
fname = "hp-wikitext-103.txt"

subset_doc = get_subset_by_index(documents, data["HP-related-index"])
save_lines_to_file(subset_doc, fname)
# Bare expression: displays how many documents were written.
len(subset_doc)

4358

In [5]:
# Positions in `documents` that are NOT listed under "HP-related-index".
unrelated_index = get_difference(range(len(documents)), data["HP-related-index"])
print(len(unrelated_index))

# Write the documents at those positions to a second file, one per line.
fname = "hp-unrelated-wikitext-103.txt"
subset_doc = get_subset_by_index(documents, unrelated_index)
save_lines_to_file(subset_doc, fname)
# Bare expression: displays how many documents were written.
len(subset_doc)

1160671


1160671

In [6]:
## verify
# Reload the HP-related file and count its non-blank lines.
# NOTE(review): the output recorded for this cell (8716) is exactly twice the
# 4358 documents reported when the file was saved -- TODO confirm the file on
# disk was produced by the save cell shown in this notebook (stale file?).
fname = "hp-wikitext-103.txt"
loaded_subset_doc = read_file_as_documents(fname)
len(loaded_subset_doc)

8716

In [7]:
## verify
# Reload the unrelated-documents file and count its non-blank lines.
# NOTE(review): the output recorded for this cell (3486371) does not match the
# 1160671 documents reported when the file was saved -- TODO confirm the file
# on disk was produced by the save cell shown in this notebook (stale file?).
fname = "hp-unrelated-wikitext-103.txt"
loaded_subset_doc = read_file_as_documents(fname)
len(loaded_subset_doc)

3486371

## examine the related texts

In [17]:
from IPython.display import HTML

def display_string(long_string):
    """Render ``long_string`` in the notebook as wrapped, preformatted text.

    The text is HTML-escaped before being placed inside a ``<pre>`` element,
    so characters such as ``<``, ``>`` and ``&`` are shown literally instead
    of being interpreted as markup (unescaped, any ``<...>`` token in the
    text would be parsed as a tag and disappear from the output).

    Args:
        long_string: The text to display.
    """
    # Function-scope import keeps this cell self-contained.
    import html

    escaped = html.escape(long_string)
    html_string = "<pre style='white-space: pre-wrap; overflow-x: auto;'>" + escaped + "</pre>"
    # `display` is not imported in this file; it is expected to be provided
    # by the IPython/Jupyter session.
    display(HTML(html_string))

# Display the documents at the first five positions listed under "HP-related-index".
for indice in data["HP-related-index"][:5]:
    display_string(documents[indice])

In [20]:
# Display the documents at the last ten positions listed under "HP-related-index".
for indice in data["HP-related-index"][-10:]:
    display_string(documents[indice])

    