# Context creation using Wikipedia

We downloaded the 13 GB Wikipedia Plaintext (2023-07-01) dataset from Kaggle. The Wikipedia articles are stored in Parquet files. We use only the wiki_2023_index.parquet file, which contains the first sentences of the articles, as context for the model. We use the Sentence Transformers library to embed the Wikipedia articles and Faiss to create an index of the embeddings. We then use the index to retrieve the most similar Wikipedia article for each question.

## Sources

* https://www.kaggle.com/datasets/jjinho/wikipedia-20230701/data?select=h.parquet

* https://github.com/facebookresearch/faiss/wiki

In [None]:
# Install the third-party packages imported further down in this notebook:
# the Kaggle API client, Hugging Face `datasets`, Faiss and Sentence Transformers.
!pip install kaggle
!pip install datasets
!pip install faiss-gpu sentence-transformers

In [None]:
from google.colab import files

# Open the Colab file-upload dialog and pick kaggle.json (the Kaggle API credentials).
# The file is expected to land in the current working directory, because the
# next cell moves it from there into ~/.kaggle/.
# NOTE(review): `uploaded` is not read anywhere in the visible notebook.
uploaded = files.upload()

In [None]:
# Create the kaggle directory (no error if it already exists)
!mkdir -p ~/.kaggle
# Move the uploaded credentials file into it
# (presumably where the Kaggle client looks for credentials - confirm)
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import kaggle

# Specify the Kaggle dataset we want to download
dataset_name = 'jjinho/wikipedia-20230701'

# Authenticate against the Kaggle API, then download the dataset into the
# current directory and unzip it.
# NOTE(review): no file name is passed to dataset_download_files, so this
# appears to fetch every file of the dataset even though only
# wiki_2023_index.parquet is read later in this notebook - confirm whether a
# single-file download would be sufficient.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(dataset_name, path='.', unzip=True)

In [None]:
# Importing the libraries
import os
import pandas as pd
from datasets import load_dataset
import faiss
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Important parameters used throughout the notebook
# Name of the Sentence Transformers model used to embed contexts and queries
SIM_MODEL = 'all-MiniLM-L6-v2'
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the questions; the first CSV column is used as the DataFrame index
qna_df = pd.read_csv("https://raw.githubusercontent.com/csabi0312/DeepLProject/main/train.csv",index_col=0)

# Preview the first rows
qna_df.head()

In [None]:
# Load the Parquet index file into a Hugging Face dataset (as its 'train' split).
# Only the 'context' column of this dataset is embedded later in the notebook.
# Source: https://www.kaggle.com/datasets/jjinho/wikipedia-20230701/data?select=wiki_2023_index.parquet
wiki_dataset = load_dataset('parquet', data_files={'train': 'wiki_2023_index.parquet'}, split='train') # 1.76GB file

In [None]:
# Display the dataset object as the cell output
wiki_dataset

In [None]:
# Load pre-trained sentence transformer model
model = SentenceTransformer(SIM_MODEL)

# Create a Faiss inner-product index. The dimensionality is taken from the
# model itself: no embeddings have been computed yet at this point, so it
# cannot be read off an embedding array.
index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())

# Number of context sentences embedded and added to the index per iteration
batch_size = 500_000

# Iterate over the dataset in batches
for i in range(0, len(wiki_dataset), batch_size):
    # Slice the rows first and then pick the column, so only this batch of
    # sentences is materialised instead of the whole 'context' column.
    batch_contexts = wiki_dataset[i:i + batch_size]['context']

    # Encode the context sentences. Embeddings are L2-normalised, so the
    # inner product computed by the index equals cosine similarity. They are
    # kept in float32 end to end: a round trip through float16 would only
    # discard precision (the float32 result already exists by then) and would
    # make the indexed vectors differ from how queries are embedded.
    context_embeddings_np = model.encode(batch_contexts,
                                         device=DEVICE,
                                         show_progress_bar=True,
                                         convert_to_numpy=True,
                                         normalize_embeddings=True)

    # Faiss expects float32 input; this is a no-op when the dtype already matches
    context_embeddings_np = context_embeddings_np.astype('float32', copy=False)

    # Add the embeddings to the Faiss index
    index.add(context_embeddings_np)

    # Free up memory before the next batch is loaded
    del batch_contexts, context_embeddings_np

# Function to retrieve most similar documents
def retrieve_most_similar(query, k=20):
    """Return the index positions of the ``k`` entries most similar to ``query``.

    The query is embedded with the notebook-level ``model`` on ``DEVICE`` and
    searched against the notebook-level Faiss ``index``.

    Args:
        query: A single query string. The embedding is reshaped to one row,
            so a list of several queries is not supported.
        k: Number of nearest neighbours to return.

    Returns:
        The first row of the label array returned by ``index.search``: a 1-D
        integer array of ``k`` positions in the order Faiss returns them
        (presumably best match first - confirm against the Faiss docs).
    """
    # No progress bar here: it carries no information for a single sentence
    # and floods the output when this function is called once per question.
    query_embedding = model.encode(query,
                                   device=DEVICE,
                                   show_progress_bar=False,
                                   convert_to_numpy=True,
                                   normalize_embeddings=True)
    # Faiss searches a 2-D float32 batch, so present the vector as one row
    query_embedding = query_embedding.reshape(1, -1).astype('float32', copy=False)
    _, idx = index.search(query_embedding, k)
    return idx[0]

# Example usage: retrieve the matches for the first question.
# .iloc[0] selects the first row by position; a plain [0] would be a lookup
# of the index label 0, which depends on the values in the CSV's first column.
query_text = qna_df['prompt'].iloc[0]
print(f'example prompt {query_text}')
similar_documents_indices = retrieve_most_similar(query_text)

# Print the 'context' of every retrieved entry, in the order returned
for idx in similar_documents_indices:
    # Cast to a plain int before indexing the dataset with the Faiss label
    print(wiki_dataset[int(idx)]['context'])

In [None]:
# Show the 'text' field of every entry retrieved for the example prompt.
# NOTE(review): the rest of this notebook only reads the 'context' column of
# wiki_dataset - confirm that the index file also provides a 'text' column.
for doc_idx in similar_documents_indices:
    row = wiki_dataset[int(doc_idx)]
    print(row['text'])

In [None]:
# Create the context column from the wikipedia article
# For every prompt, keep the 'context' of the first entry returned by the search.
context_list = []

# Iterate over the prompt values directly. Looking prompts up as
# qna_df['prompt'][i] is a label-based access and only works while the
# DataFrame index is exactly 0..n-1.
for query_text in qna_df['prompt']:
    similar_documents_indices = retrieve_most_similar(query_text)

    # Only the first returned entry is used as context for this prompt
    context = wiki_dataset[int(similar_documents_indices[0])]['context']
    context_list.append(context)

# Add the context_list as a new column "context" to the qna_df DataFrame
qna_df['context'] = context_list

# Save the Q&A DataFrame to a CSV file.
# index=False: the DataFrame index (the first column of the input CSV) is not written.
qna_df.to_csv('openbook-qna-data.csv', index=False)

# Display the modified DataFrame
qna_df.head()

In [None]:
# Preview the first rows. head must be called: without the parentheses the
# cell displays the bound method object instead of the rows.
qna_df.head()

In [None]:
from google.colab import drive
# Mount Google Drive at the relative path 'drive'; the copy command in the
# next cell writes below this mount point.
drive.mount('drive')

In [None]:
# Copy the generated CSV into the "My Drive" folder of the mounted Drive
!cp openbook-qna-data.csv "drive/My Drive/"