## Install dependencies

In [17]:
%%capture
!pip install tqdm vertexai sentencepiece matplotlib
!gcloud auth application-default login

## Put data in a JSON file and measure passage length

In [None]:

# import os
# import json

# # Directory containing the data files
# data_dir = "./data/middle"

# # Initialize an empty list to store data
# all_data = []

# # Loop through all files in the data directory
# for filename in os.listdir(data_dir):
#     if filename.endswith(".txt"):
#         # Read the JSON content of the file
#         with open(os.path.join(data_dir, filename), "r") as file:
#             file_data = json.load(file)
        
#         # Append the data to the list
#         all_data.append(file_data)

# # Write the data to a JSON file
# with open("middle.json", "w") as json_file:
#     json.dump(all_data, json_file, indent=4)
import json

# Parameters
input_file = "train.jsonl"  # Input file with data

# Accumulate the character length of every entry's 'passage' field so the
# mean can be reported once the whole file has been read.
total_length = 0
num_entries = 0

with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. extra newlines at the end of the file);
        # json.loads would otherwise raise JSONDecodeError on them.
        if not line.strip():
            continue
        entry = json.loads(line)
        passage_length = len(entry['passage'])
        total_length += passage_length
        num_entries += 1

# Compute the average passage length (guarding against an empty file)
if num_entries > 0:
    average_length = total_length / num_entries
    print(f"Average Passage Length: {average_length}")
else:
    print("No entries found in the file.")

## Summarize paragraph -> Answer Questions

In [18]:
import json
from tqdm import tqdm
from vertexai.generative_models import GenerativeModel
from vertexai import generative_models

# Parameters
input_file = "train.jsonl"  # Input file with data
output_file = "results_train.jsonl"  # Output file
summary_query = "Summarize this paragraph to make it as concise as possible, capturing the main ideas clearly and briefly without losing information. Provide the edited paragraph as-is without any additional context, labels, headings, or formatting."

# Load the JSONL data from the file (one JSON object per line)
data = []
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines are not valid JSON; skip them instead of crashing.
        if line.strip():
            data.append(json.loads(line))

# Model setup: the same model is used for summarization and for answering
summary_model = GenerativeModel("gemini-1.0-pro-002")
answer_model = GenerativeModel("gemini-1.0-pro-002")
# Disable blocking for every harm category listed here so that passages are
# not rejected by the safety filters.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}
# None processes every entry; set a positive integer N to process only the
# first N entries. Do not use -1 for "no limit": data[:-1] silently drops
# the last entry.
limit = None

# Process each item. Every finished record is written and flushed right away
# so that an interrupted run keeps the results produced so far. Note that the
# output file is truncated as soon as the run starts.
output_data = []
with open(output_file, 'w', encoding='utf-8') as out_file:
    for item in tqdm(data[:limit], desc="Processing"):
        # Generate concise summary
        try:
            summary_response = summary_model.generate_content(
                f"{summary_query} {item['passage']}",
                generation_config={"max_output_tokens": 150, "temperature": 0.5},
                safety_settings=safety_config)
            item['passage_summarized'] = summary_response.text.strip()
        except Exception as e:
            # Items whose passage could not be summarized are reported and
            # left out of the results entirely.
            print(f"Error summarizing passage for question {item['question']}: {str(e)}")
            item['passage_summarized'] = "Error summarizing"
            continue

        # Answer question based on summary
        answer_query = f"{item['passage_summarized']}. Based on this summary, answer the following question with a simple 'true' or 'false'. '{item['question']}'. Provide the response as-is without any additional context, labels, headings, or formatting:"
        try:
            answer_response = answer_model.generate_content(
                answer_query,
                generation_config={"max_output_tokens": 5, "temperature": 0},
                safety_settings=safety_config)
            item['model_answer'] = answer_response.text.strip()
        except Exception as e:
            # Keep the item, but mark the answer with a placeholder string.
            print(f"Error answering question {item['question']}: {str(e)}")
            item['model_answer'] = "Error answering"

        output_data.append(item)

        # Persist this record immediately (one JSON object per line)
        json.dump(item, out_file)
        out_file.write('\n')
        out_file.flush()


Processing:  10%|▉         | 928/9426 [25:25<2:48:39,  1.19s/it]

Error summarizing passage for question is todd bethany's dad in coronation street: Response has no candidates (and no text).


Processing:  19%|█▉        | 1800/9426 [47:48<3:05:34,  1.46s/it]

Error answering question is 16 the age of consent in canada: Response has no candidates (and no text).


Processing:  28%|██▊       | 2680/9426 [1:10:29<2:46:17,  1.48s/it]

Error summarizing passage for question is straight talk the same as total wireless: Content has no parts.


Processing:  36%|███▋      | 3438/9426 [1:30:08<2:32:27,  1.53s/it]

Error summarizing passage for question is there a white castle restaurant in california: Content has no parts.


Processing:  38%|███▊      | 3552/9426 [1:33:06<2:07:23,  1.30s/it]

Error summarizing passage for question does anna have a baby in 50 shades freed: Response has no candidates (and no text).


Processing:  43%|████▎     | 4017/9426 [1:44:58<2:34:59,  1.72s/it]

Error summarizing passage for question does anna get pregnant in fifty shades freed: Response has no candidates (and no text).


Processing:  60%|█████▉    | 5616/9426 [2:26:16<1:27:33,  1.38s/it]

Error summarizing passage for question has anyone ever won the moment of truth: Response has no candidates (and no text).


Processing:  63%|██████▎   | 5939/9426 [2:34:45<1:24:51,  1.46s/it]

## Evaluate results


In [16]:
import json
# Load the JSONL data from the processed file (one JSON object per line)
input_file = 'results_train.jsonl'  # Adjust the path as needed
data = []
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines are not valid JSON; skip them instead of crashing.
        if line.strip():
            data.append(json.loads(line))

total_questions = 0
correct_answers = 0
unparseable_answers = 0  # model answers that are neither 'true' nor 'false'

# Iterate through each entry in the results
for item in data:
    # Reference answer and the raw text produced by the model.
    # NOTE(review): the comparison below assumes item['answer'] is a JSON
    # boolean -- confirm against the dataset.
    correct = item['answer']
    model_generated = item['model_answer']

    # Increment the total question count
    total_questions += 1

    # Normalize the model text: drop surrounding whitespace, quotes and
    # punctuation so that e.g. "True." or "'false'" are still recognized.
    normalized = model_generated.strip(" \t\r\n.,!'\"`*").lower()
    if normalized == 'true':
        model_answer_bool = True
    elif normalized == 'false':
        model_answer_bool = False
    else:
        # Anything else (e.g. the "Error answering" placeholder) must not be
        # coerced to False, or it would score as correct on every question
        # whose reference answer is False.
        model_answer_bool = None
        unparseable_answers += 1

    if model_answer_bool is not None and correct == model_answer_bool:
        correct_answers += 1

# Calculate the accuracy (unparseable answers count as incorrect)
accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0

# Output the results
print(f'Total questions: {total_questions}')
print(f'Correct answers: {correct_answers}')
print(f'Accuracy: {accuracy:.2f}%')
if unparseable_answers > 0:
    print(f'Unparseable model answers (counted as incorrect): {unparseable_answers}')

Total questions: 30
Correct answers: 26
Accuracy: 86.67%
