In [1]:
import requests
import json

def generate_question(chunk, model_name="llama3.2"):
    """
    Generate a single question-answer pair from a given text chunk using an Ollama model.
    
    Args:
        chunk (str): The text chunk to generate a QA pair from
        model_name (str): The name of the Ollama model to use (default: "llama3.2")
        
    Returns:
        str: Generated question-answer pair in tuple format
    """
    # Ollama API endpoint (default is localhost on port 11434)
    api_url = "http://localhost:11434/api/generate"
    
    # Create the prompt for question and answer generation
    prompt = f"""
    You are a helpful assistant that generates a question-answer pair from text.
    
    Generate exactly 1 relevant question-answer pair from the following text:
    
    {chunk}
    
    Format requirements:
    1. Output the pair as a Python tuple format: ("Question text", "Answer text")
    2. Make sure the answer is directly supported by the provided text
    3. Do not include ANY explanations, comments, or additional text
    4. Do not include any numbering or prefixes
    5. Ensure the tuple formatting is correct with proper quotes and parentheses
    """
    
    # Prepare the request payload
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }
    
    try:
        # Make the API request to Ollama
        response = requests.post(api_url, json=payload)
        response.raise_for_status()  # Raise exception for HTTP errors
        
        # Parse the response
        result = response.json()
        return result["response"].strip()
    
    except requests.exceptions.RequestException as e:
        print(f"Error communicating with Ollama: {e}")
        return None
    except (KeyError, json.JSONDecodeError) as e:
        print(f"Error processing Ollama response: {e}")
        return None

In [3]:
chunk = "The Los Angeles Dodgers won the World Series in 2020. The games were played in Arlington, Texas, at Globe Life Field due to the COVID-19 pandemic."

questions = generate_question(chunk, model_name="llama3.3:latest")
print(questions)

("Where were the 2020 World Series games played?", "Globe Life Field in Arlington, Texas")


In [68]:
# questions.strip().split("\n")

In [2]:
## read csv file
import pandas as pd

df = pd.read_csv("all_combined.csv", lineterminator='\n')
print(len(df["text"]))



11747


In [70]:
model_name = "llama3.3:latest"
# Create a list to store all questions
all_questions = []
# Create a file to log generation results
log_file = open(f"{model_name}_question_generation_log.txt", "w")

for i in range(len(df)):
    if i % 100 == 0:
        print("current row index: ", i)
    attempts = 0
    max_attempts = 5
    question = None
    
    while attempts < max_attempts:
        try:
            question = generate_question(df["text"][i], model_name=model_name)
            # print("The text is: ", df["text"][i])
            if question:  # If we got a valid response
                # print("-------- ---------")
                # print(question)
                # print("-------- ---------")
                break
            raise Exception("Empty response received")
            
        except Exception as e:
            attempts += 1
            print(f"Attempt {attempts} failed: {str(e)}")
            if attempts == max_attempts:
                print(f"Failed to generate question after {max_attempts} attempts")
    
    # Add the question to our list (empty string if generation failed)
    if question:
        all_questions.append(question)
        log_file.write(f"{i}\t{question}\n")
    else:
        all_questions.append("")
        log_file.write(f"{i}\t\n")

# Close the log file
log_file.close()

# After collecting all questions, add them as a new column
column_name = f"{model_name}_question"
df[column_name] = all_questions

# Save the final dataframe once at the end
df.to_csv("all_combined.csv", index=False)

current row index:  0


KeyboardInterrupt: 

In [10]:
df = pd.read_csv("all_combined.csv", lineterminator='\n')
print(df.head(10))
df['llama3.2_question_text'] = df['llama3.2_question'].str.extract(r'[\(\[][\"\']([^\"\']*)[\"\']')
df['llama3.2_answer'] = df['llama3.2_question'].str.extract(r'[\"\'],[^,]*[\"\']([^\"\']*)[\"\'][\)\]]')

# Create a new dataframe with just the extracted columns
extracted_df = df[['source', 'text', 'llama3.2_question_text', 'llama3.2_answer']]

# Save the extracted data to a new CSV file
extracted_df.to_csv("all_combined.csv", index=False)
print("Saved extracted question-answer pairs to all_combined.csv")




                                              source  \
0  https://www.heinzhistorycenter.org/event/irish...   
1  https://www.heinzhistorycenter.org/event/one-o...   
2  https://www.heinzhistorycenter.org/event/histo...   
3  https://www.heinzhistorycenter.org/event/speak...   
4  https://www.heinzhistorycenter.org/event/gut-y...   
5  https://www.heinzhistorycenter.org/event/natio...   
6  https://www.heinzhistorycenter.org/event/natio...   
7  https://www.heinzhistorycenter.org/event/ameri...   
8  https://www.heinzhistorycenter.org/event/vietn...   
9  https://www.heinzhistorycenter.org/event/vinta...   

                                                text  \
0  Skip Main NavigationSkip to site alert\nHeinz ...   
1  Skip Main NavigationSkip to site alert\nHeinz ...   
2  Skip Main NavigationSkip to site alert\nHeinz ...   
3  Skip Main NavigationSkip to site alert\nHeinz ...   
4  Skip Main NavigationSkip to site alert\nHeinz ...   
5  Skip Main NavigationSkip to site alert\nHein

Starting parallel processing with 40 workers


In [12]:
df = pd.read_csv("all_combined.csv", lineterminator='\n')
df.head()

Unnamed: 0,source,text,llama3.2_question_text,llama3.2_answer
0,https://www.heinzhistorycenter.org/event/irish...,Skip Main NavigationSkip to site alert\nHeinz ...,What is the date of the Irish Genealogy Workshop?,"Tuesday, Mar. 4, 2025"
1,https://www.heinzhistorycenter.org/event/one-o...,Skip Main NavigationSkip to site alert\nHeinz ...,What is a personal consultation with the Ulste...,A 30-minute consultation session where you wil...
2,https://www.heinzhistorycenter.org/event/histo...,Skip Main NavigationSkip to site alert\nHeinz ...,When does History Uncorked: Mirror Ball take p...,"Friday, Mar. 7, 2025"
3,https://www.heinzhistorycenter.org/event/speak...,Skip Main NavigationSkip to site alert\nHeinz ...,What is the title of Jason A. Cherry,Factor of Ambition
4,https://www.heinzhistorycenter.org/event/gut-y...,Skip Main NavigationSkip to site alert\nHeinz ...,What does Gut Yontif mean?,Have a good holiday!
