# Install and Import Libraries

In [None]:
# Install the third-party packages imported by the next cell.
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# resolve to different releases than the ones shown in the recorded output
# below — consider pinning.
!pip install --upgrade transformers optimum
!pip install accelerate
!pip install auto-gptq
!pip install faiss-gpu
!pip install sentence-transformers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum
  Downloading optimum-1.20.0-py3-none-any.whl (418 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.4/418.4 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from optimum)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from optimum)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11->optimum)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)


In [None]:
import os
import gc
import re
import pickle
import subprocess
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")
from datasets import Dataset
from transformers import AutoTokenizer , AutoTokenizer, AutoModelForCausalLM , pipeline
import warnings
warnings.filterwarnings("ignore")

# Download Resources

In [None]:
# Download the parsed-Wikipedia parquet files into ./dpq-wiki-parsed.
download_dir = 'dpq-wiki-parsed'
os.makedirs(download_dir, exist_ok=True)

file_names = [
    'a.parquet',
    'b.parquet',
    'c.parquet',
    'chunk_index.parquet',
    'd.parquet',
    'e.parquet',
    'f.parquet',
    'g.parquet',
    'h.parquet',
    'i.parquet',
    'j.parquet',
    'k.parquet',
    'l.parquet',
    'm.parquet',
    'n.parquet',
    'number.parquet',
    'o.parquet',
    'p.parquet',
    'q.parquet',
    'r.parquet',
    's.parquet',
    't.parquet',
    'u.parquet',
    'v.parquet',
    'w.parquet',
    'wiki_index.parquet',
    'x.parquet',
    'y.parquet',
    'z.parquet'
]
repo_id = "dpquoc/wiki-parsed"

# One wget process per file so the downloads run in parallel.
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # `cwd=` makes wget write into the target folder without changing the
    # notebook's own working directory, so an interrupted cell cannot leave the
    # kernel stranded inside the download folder. wget's console output is
    # discarded; failures are detected through the exit codes below.
    processes.append(subprocess.Popen(command, cwd=download_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete and remember which ones failed
failed_files = [file_name for file_name, process in zip(file_names, processes) if process.wait() != 0]

# Fail loudly here instead of with a confusing "file not found" in a later cell
if failed_files:
    raise RuntimeError(f"wget failed for: {failed_files}")

print("All downloads initiated.")

All downloads initiated.


In [None]:
os.makedirs('bge-small-en', exist_ok=True)
os.chdir('bge-small-en')

file_names = [
    'config.json',
    'config_sentence_transformers.json',
    'modules.json',
    'pytorch_model.bin',
    'sentence_bert_config.json',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

repo_id = "BAAI/bge-small-en-v1.5"

# Create a list to store subprocess.Popen objects
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # Redirect output to /dev/null (Linux) or NUL (Windows)
    processes.append(subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete
for process in processes:
    process.wait()

print("All downloads initiated.")

folder_name = '1_Pooling'
folder_path = os.path.join(os.getcwd(), folder_name)
os.makedirs(folder_path, exist_ok=True)
os.chdir(folder_path)
!wget https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/1_Pooling/config.json
os.chdir('..')
os.chdir('..')

All downloads initiated.
--2024-06-02 07:19:37--  https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/1_Pooling/config.json
Resolving huggingface.co (huggingface.co)... 54.192.18.10, 54.192.18.15, 54.192.18.113, ...
Connecting to huggingface.co (huggingface.co)|54.192.18.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190 [text/plain]
Saving to: ‘config.json’


2024-06-02 07:19:37 (128 MB/s) - ‘config.json’ saved [190/190]



In [None]:
# Download the ARC-Challenge and ARC-Easy CSV splits into ./ARC/<subset>/.
arc_root = 'ARC'

file_names = [
    'test.csv',
    'train.csv',
    'valid.csv',
]

repo_id = "dpquoc/ARC"

# Both subsets share the same file names, so one loop handles them.
for subset in ('ARC-Challenge', 'ARC-Easy'):
    subset_dir = os.path.join(arc_root, subset)
    os.makedirs(subset_dir, exist_ok=True)

    # One wget process per file so the downloads run in parallel.
    processes = []

    for file_name in file_names:
        link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{subset}/{file_name}'
        command = ["wget", link]

        # `cwd=` makes wget write into the subset folder without changing the
        # notebook's own working directory. wget's console output is discarded;
        # failures are detected through the exit codes below.
        processes.append(subprocess.Popen(command, cwd=subset_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

    # Wait for all processes to complete and remember which ones failed
    failed_files = [file_name for file_name, process in zip(file_names, processes) if process.wait() != 0]

    # Fail loudly here instead of when the CSV is read in a later cell
    if failed_files:
        raise RuntimeError(f"wget failed for {subset}: {failed_files}")

    print("All downloads initiated.")

All downloads initiated.
All downloads initiated.


In [None]:
os.makedirs('dpq-wiki-faiss', exist_ok=True)
os.chdir('dpq-wiki-faiss')

!wget https://huggingface.co/datasets/dpquoc/wiki-faiss-index/resolve/main/chunk_faiss.index
os.chdir('..')

--2024-06-02 07:19:39--  https://huggingface.co/datasets/dpquoc/wiki-faiss-index/resolve/main/chunk_faiss.index
Resolving huggingface.co (huggingface.co)... 54.192.18.10, 54.192.18.15, 54.192.18.113, ...
Connecting to huggingface.co (huggingface.co)|54.192.18.10|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/e6/0a/e60aa7325444e8626da50f5892bcf2232118b23264e6dcd9a88c28f638dbe198/21d777fe71753629d1209ac9769635122b9750f1eeaa03254aad18c6bef61077?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27chunk_faiss.index%3B+filename%3D%22chunk_faiss.index%22%3B&Expires=1717571979&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNzU3MTk3OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2U2LzBhL2U2MGFhNzMyNTQ0NGU4NjI2ZGE1MGY1ODkyYmNmMjIzMjExOGIyMzI2NGU2ZGNkOWE4OGMyOGY2MzhkYmUxOTgvMjFkNzc3ZmU3MTc1MzYyOWQxMjA5YWM5NzY5NjM1MTIyYjk3NTBmMWVlYWEwMzI1N

In [None]:
# Report the directory the notebook is running from; the absolute
# "/content/..." paths used further down only resolve when this is /content.
current_dir = os.getcwd()
print("Current Directory:", current_dir)

Current Directory: /content


# RAG

In [None]:
def get_text_chunks(indices, base_path):
    """Look up the wiki text chunk for every entry of ``indices``.

    Relies on the module-level ``chunk_index`` DataFrame (its ``file``, ``id``
    and ``offset`` columns are read here), which must be loaded before this
    function is called.

    Args:
        indices: Row positions into ``chunk_index`` (they are passed to ``.iloc``).
        base_path: Directory containing the parquet files named in
            ``chunk_index['file']``.

    Returns:
        A list with one string per entry of ``indices``, in the same order:
        words ``offset[0]:offset[1]`` of the matching article's
        whitespace-split ``text``.

    Raises:
        KeyError: If an article id listed in ``chunk_index`` is missing from
            its parquet file.
    """
    # NOTE(review): negative positions (e.g. a -1 placeholder from a search that
    # returned too few hits) would be resolved by .iloc from the end of
    # chunk_index — confirm callers never pass them.
    # After reset_index, each row label equals its position in `indices`.
    filtered_chunk_index = chunk_index.iloc[indices].reset_index(drop=True)

    # Result slots, matching the order and length of indices
    text_chunks = [None] * len(indices)

    # Visit each referenced file once to minimize file reads
    for file_name in set(filtered_chunk_index['file']):
        file_path = f"{base_path}/{file_name}"
        # Only the two columns used below are loaded, which keeps memory down
        df = pd.read_parquet(file_path, columns=['id', 'text'])

        rows_to_process = filtered_chunk_index.loc[filtered_chunk_index['file'] == file_name]

        # Build the id -> text lookup once per file instead of scanning the whole
        # frame for every chunk. drop_duplicates keeps the first row of a
        # repeated id, the same row the previous `.values[0]` lookup selected.
        wanted = df.loc[df['id'].isin(rows_to_process['id']), ['id', 'text']].drop_duplicates('id')
        text_by_id = dict(zip(wanted['id'], wanted['text']))

        for position, article_id, offset in zip(rows_to_process.index,
                                                rows_to_process['id'],
                                                rows_to_process['offset']):
            words = text_by_id[article_id].split()
            text_chunks[position] = " ".join(words[offset[0]:offset[1]])

        # Explicitly drop the per-file data and hand freed memory back to the OS
        del df, rows_to_process, wanted, text_by_id
        gc.collect()
        libc.malloc_trim(0)

    return text_chunks


In [None]:
# Retrieval configuration.
# DEVICE is passed as the `device` argument of model_embedding.encode(...).
DEVICE = 0
# Assigned to model_embedding.max_seq_length.
MAX_LENGTH = 384
# Batch size used when encoding the queries.
BATCH_SIZE = 256
# Local folder holding the downloaded sentence-embedding model.
SIM_MODEL = '/content/bge-small-en'
# Folder holding the downloaded wiki parquet files.
WIKI_PATH = "/content/dpq-wiki-parsed"
# NOTE(review): wiki_files is not read by any other cell visible in this notebook — confirm it is still needed.
wiki_files = os.listdir(WIKI_PATH)

## Reading The Dataset

In [None]:
# Show full cell contents when displaying DataFrames (no column-width truncation)
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the ARC-Easy test split; every value becomes a string and missing cells
# become a single space.
# To evaluate ARC-Challenge instead, read "/content/ARC/ARC-Challenge/test.csv".
df = (
    pd.read_csv("/content/ARC/ARC-Easy/test.csv")
    .drop(columns="id")
    .fillna(' ')
    .astype(str)
)

# Switches: keep a single row of the dataset, or replace it with a hand-written question.
get_sample = False
custom_question = True

if get_sample:
    # Position of the single row to keep
    sample_index = 123
    df = df.iloc[[sample_index]]

if custom_question:
    # Alternative example question:
    # data = {
    #     'question': ['What is the chemical symbol for the element oxygen?'],
    #     'A': ['O'],
    #     'B': ['H'],
    #     'C': ['N'],
    #     'D': ['C'],
    #     'answer': ['A']
    # }
    data = {
        'question': ['When did the Ho Chi Minh University of Technology regain its traditional name after being renamed the University of Engineering?'],
        'A': ['1981'],
        'B': ['1990'],
        'C': ['1995'],
        'D': ['2001'],
        'answer': ['D']
    }
    df = pd.DataFrame(data)

df.head()

Unnamed: 0,question,A,B,C,D,answer
0,When did the Ho Chi Minh University of Technology regain its traditional name after being renamed the University of Engineering?,1981,1990,1995,2001,D


## Enhance query with LLM

In [None]:
# Checkpoint used to generate the knowledge facts (a GPTQ build, per its name).
model_name_or_path = "TheBloke/Mistral-7B-OpenOrca-GPTQ"

# Load the weights from the "main" revision, let device_map="auto" place them,
# and do not execute custom code shipped with the repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision="main",
    trust_remote_code=False,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/Mistral-7B-OpenOrca-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11.mlp.

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Task description plus one worked example. The literal token "user_prompt"
# below is a placeholder: the generation loop swaps it for the actual question
# with str.replace, so it must appear exactly once.
prompt = """Your task is to generate knowledge facts that help answer user questions, which are presented as multiple-choice questions.

Key considerations:

- Fact Generation: Create text containing relevant information that could potentially answer the user question.
- Relevance: Ensure that the generated facts are pertinent to the topic and context of the user question.
- Multiple Facts (Optional): If necessary, provide multiple facts to cover various aspects of the user prompt or related topics.


Example User Question:

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
A. Planetary density will decrease.
B. Planetary years will become longer.
C. Planetary days will become shorter.
D. Planetary gravity will become stronger.

Assistant Response:

Faster rotation of a planet following a meteorite impact can lead to shorter planetary days.
Increased rotation speed after a meteorite impact might result in changes to planetary gravity.
Planetary rotation acceleration due to a meteorite impact could influence various planetary characteristics, including its rotational period.

user_prompt

Assistant Response:
"""

# System instruction placed at the top of the chat template.
system_message = "You are a resourceful information assistant dedicated to generating knowledge facts for the RAG system. Utilize your understanding of Wikipedia concepts and the provided user prompt (usually a multiple-choice question) to craft informative text chunks. Remember, your goal is to provide relevant information from the RAG system to assist users in finding answers."

# Role-tagged template: system / user / assistant sections separated by blank lines.
# NOTE(review): the model's output ends with "<|im_end|>" (see the post-processing
# step), which suggests a ChatML-style chat format; this template uses bare role
# names without "<|im_start|>"/"<|im_end|>" markers — confirm against the model card.
prompt_template = "system\n" + system_message + "\n\nuser\n" + prompt + "\n\nassistant\n"

In [None]:
def save_progress(generated_texts, filename):
    """Pickle the texts generated so far and report how many were saved.

    Args:
        generated_texts: List of generated strings to persist.
        filename: Destination path of the pickle file. A value that is not a
            path (the generation loop historically passed its iteration counter
            here) falls back to "e_generated_texts.pkl", so existing callers
            keep working and an int is never treated as a file descriptor.
    """
    if isinstance(filename, (str, os.PathLike)):
        target_path = filename
    else:
        target_path = "e_generated_texts.pkl"

    # 'wb' truncates an existing file, so each save replaces the previous one
    with open(target_path, 'wb') as f:
        pickle.dump(generated_texts, f)

    # Report the size of what was actually written rather than a global counter
    print(f"Generated texts saved successfully. Total: {len(generated_texts)}")

In [None]:
count = 0  # Number of questions processed so far
generated_texts = []

progress_path = "e_generated_texts.pkl"  # Checkpoint file for the generated texts
save_every = 20                          # Checkpoint interval, in questions

for _, row in tqdm(df.iterrows(), total=len(df)):

    question = row['question']
    answer_a = row['A']
    answer_b = row['B']
    answer_c = row['C']
    answer_d = row['D']

    # Construct the user prompt dynamically using f-strings
    user_prompt = f"""User Question:
Question: {question}
A. {answer_a}
B. {answer_b}
C. {answer_c}
D. {answer_d}
"""
    # Swap the "user_prompt" placeholder of the template for the actual question
    full_prompt = prompt_template.replace("user_prompt", user_prompt)

    # Keep the attention mask produced by the tokenizer and pass it to generate();
    # without it transformers warns that results may be unreliable.
    encoded = tokenizer(full_prompt, return_tensors='pt').to(model.device)
    input_ids = encoded.input_ids
    output = model.generate(inputs=input_ids,
                            attention_mask=encoded.attention_mask,
                            temperature=0.3,
                            do_sample=True,
                            top_p=0.95,
                            top_k=40,
                            max_new_tokens=512)

    # Skip the initial tokens corresponding to the prompt
    generated_text = tokenizer.decode(output[0][len(input_ids[0]):])

    generated_texts.append(generated_text)
    count += 1

    # Checkpoint periodically; the file is opened for writing, so each save
    # replaces the previous checkpoint and no explicit delete is needed.
    if count % save_every == 0:
        save_progress(generated_texts, progress_path)

    # Release Python garbage and cached GPU memory before the next question
    gc.collect()
    torch.cuda.empty_cache()

# Persist the tail that did not fill a whole checkpoint interval; previously
# those results (or all of them, for fewer than 20 questions) were never saved.
if count % save_every != 0:
    save_progress(generated_texts, progress_path)




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


In [None]:
# Print the raw generated knowledge facts for the first question
# (still unprocessed, so a trailing end-of-turn tag may be visible)
print(generated_texts[0])

The Ho Chi Minh University of Technology regained its traditional name after being renamed the University of Engineering in 1995.<|im_end|>


In [None]:
# Post process function for the generated knowledge facts

def process_text_to_multi_queries(text_array):
    """
    Split each generated text into a list of cleaned query lines.

    For every text: a trailing "<|im_end|>" tag is removed, everything from the
    first "User Question" marker onwards is discarded (the model sometimes keeps
    writing a new question), and the remaining lines are stripped of surrounding
    whitespace and of a leading "- " bullet prefix. Blank lines are dropped.

    Args:
        text_array: A list of strings.

    Returns:
        A list of lists, one inner list per input text, holding that text's
        cleaned, non-empty lines in their original order.
    """
    end_tag = "<|im_end|>"
    processed_queries = []

    for text in text_array:
        text = text.strip()

        # Remove the tag as a whole suffix. str.rstrip(end_tag) would instead
        # strip any trailing run of the tag's *characters* and could eat real
        # text (e.g. "...the end<|im_end|>" lost its final word).
        if text.endswith(end_tag):
            text = text[:-len(end_tag)].rstrip()

        # str.split always returns at least one element, so [0] is safe
        first_block = text.split("User Question")[0]

        block_queries = []
        for line in first_block.splitlines():
            line = line.strip()
            if line.startswith("- "):
                line = line[2:].strip()
            if line:
                block_queries.append(line)

        processed_queries.append(block_queries)

    return processed_queries

In [None]:
# Split every generated text into its list of cleaned fact lines
# (they are joined into a single query per question in the next cell)
processed_queries = process_text_to_multi_queries(generated_texts)

In [None]:
# One retrieval query per question: its fact lines joined with newlines
joint_queries = ["\n".join(query_list) for query_list in processed_queries]

In [None]:
# Show the retrieval query built for the first question
print(joint_queries[0])

The Ho Chi Minh University of Technology regained its traditional name after being renamed the University of Engineering in 1995.


## Retrieval Context

In [None]:
# All four answer options of a row as one space-separated string
df['all_answer'] = df[['A', 'B', 'C', 'D']].agg(" ".join, axis=1)

# Baseline retrieval query: the question followed by every answer option
df['normal_query_RAG'] = df['question'].str.cat(df['all_answer'], sep=" ")

In [None]:
# Load the local sentence-embedding model on the GPU and cap its input length at MAX_LENGTH
model_embedding = SentenceTransformer(SIM_MODEL, device='cuda')
model_embedding.max_seq_length = MAX_LENGTH
# Optional (disabled): convert the model to half precision
# model = model.half() # Turn 32 float to 16 float

In [None]:
# Baseline alternative (disabled): embed the question + answer options instead of the generated facts
# prompt_embeddings = model_embedding.encode(df.normal_query_RAG.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)

# Embed the LLM-generated queries as normalised vectors
query_tensor = model_embedding.encode(
    joint_queries,
    batch_size=BATCH_SIZE,
    device=DEVICE,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
# Move the result off the GPU as a NumPy array for the index search in a later cell
prompt_embeddings = query_tensor.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the prebuilt FAISS index from disk
sentence_index = read_index("/content/dpq-wiki-faiss/chunk_faiss.index")

# For every query embedding, retrieve 6 results: their scores and their ids
# (the ids are later used as row positions into chunk_index).
# NOTE(review): the original comment said "Search on the GPU", but no visible
# cell moves the index to a GPU — confirm whether a CPU search is intended.
search_score, search_index = sentence_index.search(prompt_embeddings, 6)

In [None]:
# The search results are extracted, so drop the index and the embeddings,
# collect garbage, and ask libc to trim its freed heap memory
del sentence_index
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

1

In [None]:
# Chunk lookup table; get_text_chunks reads this module-level DataFrame
# (its 'file', 'id' and 'offset' columns), so it must be loaded before that call
chunk_index = pd.read_parquet("/content/dpq-wiki-parsed/chunk_index.parquet")

In [None]:
# Remember the (n_queries, n_results) layout of the search results
original_shape = search_index.shape

# get_text_chunks expects a flat sequence of positions, so unroll the 2D result
flattened_indices = search_index.reshape(-1)
text_chunks_flat = get_text_chunks(flattened_indices, WIKI_PATH)

# Restore the per-query layout: row i holds the chunks retrieved for query i
text_chunks_reshaped = np.array(text_chunks_flat).reshape(original_shape)

In [None]:
# Reclaim memory left over from loading the parquet files
gc.collect()
libc.malloc_trim(0)

1

In [None]:
# One context string per question: its retrieved chunks joined by ' ---'
joint_text_chunks = [' ---'.join(chunks_for_question) for chunks_for_question in text_chunks_reshaped]

In [None]:
# Print the first question, its four options and the labelled answer
first_question = df.iloc[0]
print("----------------------------")
print(f"Question: {first_question['question']}")
for option in ("A", "B", "C", "D"):
    print(f"{option}. {first_question[option]}")
print(f"True Answer: {first_question['answer']}")

----------------------------
Question: When did the Ho Chi Minh University of Technology regain its traditional name after being renamed the University of Engineering?
A. 1981
B. 1990
C. 1995
D. 2001
True Answer: D


In [None]:
from IPython.display import display, Markdown

# Render the context retrieved for the first question as Markdown
# (its chunks are separated by ' ---')
display(Markdown(joint_text_chunks[0]))

16/2001/QĐ-TTg by The Prime Minister on December 2, 2001, Vietnam National University, Ho Chi Minh City was reorganized with only 3 members; and the University of Engineering gained back its traditional name as Ho Chi Minh University of Technology. Infrastructure Ho Chi Minh University of Technology (HCMUT) has two campuses: one is located in the inner of Ho Chi Minh City and the other in the outskirts of the city. The former one has an area of 14 ha, of which address is 268 Ly Thuong Kiet St., Dist.10, Ho Chi Minh City. The campus has 117 classrooms (14.479 m2), ---time when the university conducted post graduate training program. After the national reunification, the university was renamed as Ho Chi Minh University of Technology as Saigon was renamed to Ho Chi Minh City (according to the Decision 426/TTg signed by The President of the Cabinet on October 27, 1976). Being one of the three largest universities of technology in Vietnam, Ho Chi Minh University of Technology has the mission of training engineers majoring in the fields of capital construction, industry, resources exploration and exploitation, and environment preservation to provide a force of manpower mainly for the southern areas of Vietnam. ---Saigon Technology University () is a university in Ho Chi Minh City, Vietnam. It was established from the Ho Chi Minh City Technology College, founded on 24 September 1997 by Decision number 198/QĐ-TTg signed by the prime minister of Vietnam. In March 2005, the prime minister signed Decision 52/2005/QĐ-TTg to rename and recognized the upgrading of this college; the name of the university was officially changed. The university provides undergraduate and graduate education and have cooperative education agreements with Troy University from the United States. Formation and development of Saigon Technology University (STU) Saigon University of Technology started as Ho ---force of manpower mainly for the southern areas of Vietnam. 
Since 1981, HCMUT has expanded its scope into postgraduate training. In 1990, HCMUT opened the master's degree training system. Actually, the postgraduate training in HCMUT has been focused since 1981. According to decision 16/CP by the Prime Minister on January 17, 1995, Vietnam National University, Ho Chi Minh City (VNU-HCM) was founded. Ho Chi Minh University of Technology became one among the 9 members of VNU-HCM and was renamed as the University of Engineering. According to decision 15/2001/QĐ-TTg and decision 16/2001/QĐ-TTg by The Prime Minister on December 2, 2001, Vietnam ---Throughout its history, the university has had several name changes: the University of Indochina (Université Indochinoise, 東法大學 or Đại học Đông Dương; established in 1906), Vietnam National University (Trường Đại học Quốc gia Việt Nam; November 1945), and the University of Hanoi (Trường Đại học Tổng hợp Hà Nội; June 1956). In 1993, Vietnam National University, Hanoi (Đại học Quốc gia Hà Nội) was created by merging the University of Hanoi, Hanoi National University of Education (HNUE) and College of Foreign Languages. The institution also owns two high schools for gifted students in foreign languages (Foreign Language Specialized School) and ---Ho Chi Minh City University of Law (HCMCUL , ) is a university in Vietnam that offers undergraduate and postgraduate education in law and politics. The university plays an important role as a legal research and advisory body for the Vietnamese government in legal and public administrative reform. History The University was established in 1996, pursuant to Decision N° 1234/GD&ĐT of the Ministry of Education and Training. It was originally merged with the Ho Chi Minh City branch of the Vietnam National University. On October 10, 2000, the Prime Minister issued Decision N° 118/2000/QWĐ-TT, which separated the Ho Chi Minh

In [None]:
# Attach each question's joined context string as a new column
df['context'] = joint_text_chunks

In [None]:
# Save the questions with their retrieved context, options and answer to a CSV in the working directory
df[["question", "context", "A", "B", "C", "D", 'answer']].to_csv("./df_context.csv", index=False)