In [50]:
# Creating a synthetic Q&A dataset
import pandas as pd
# Load the mock Wikipedia sections and build one prompt-ready "context"
# string per row: title, heading, a blank line, then the section body.
df = pd.read_csv('olympics-data/mock_olympics_sections.csv')
df['context'] = df['title'] + "\n" + df['heading'] + "\n\n" + df['content']
df.head()

Unnamed: 0,title,heading,content,tokens,context
0,2020 Summer Olympics,Summary,"The 2020 Summer Olympics, officially the Games...",621,2020 Summer Olympics\nSummary\n\nThe 2020 Summ...
1,2020 Summer Olympics,Host city selection,The International Olympic Committee (IOC) vote...,126,2020 Summer Olympics\nHost city selection\n\nT...
2,2020 Summer Olympics,Impact of the COVID-19 pandemic,"In January 2020, concerns were raised about th...",375,2020 Summer Olympics\nImpact of the COVID-19 p...
3,2020 Summer Olympics,Qualifying event cancellation and postponement,Concerns about the pandemic began to affect qu...,298,2020 Summer Olympics\nQualifying event cancell...
4,2020 Summer Olympics,Effect on doping tests,Mandatory doping tests were being severely res...,163,2020 Summer Olympics\nEffect on doping tests\n...


In [68]:
# Use 'olympics-data/mock_olympics_sections.csv' (not the full 'olympics_sections.csv') to generate questions:
# the full dataset takes a long time to process and incurs OpenAI API charges.
import os
import openai
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment, then
# hand the key to the OpenAI client (stays None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# generating questions based on the data.
def get_questions(row):
    """Generate questions about one section using the OpenAI Completions API.

    Intended to be used with ``df.apply(get_questions, axis=1)``.

    Args:
        row: One row of the module-level ``df``. ``row.context`` is inserted
            into the prompt; ``row.name`` (the row's index label) is used for
            the progress log, so it must support ``+ 1`` (e.g. the default
            integer index).

    Returns:
        The raw completion text. The prompt already ends with "1.", so the
        text starts with the body of the first question and the caller is
        expected to prepend "1.". Returns "" if the request or the response
        parsing fails (the error is printed, not raised).
    """
    print("["+ str(row.name + 1) + "/" + str(len(df)) + "]") # log to see progress

    try:
        # https://platform.openai.com/docs/api-reference/completions/create
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write questions based on the text below\n\nText: {row.context}\n\nQuestions:\n1.",
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        # The generated text lives under the 'text' key of the first choice.
        questions = response['choices'][0]['text']
        print(questions)
        return questions
    except Exception as e:
        # Best-effort: one failed row must not abort the whole apply().
        # Catch Exception (not a bare except) so Ctrl-C still interrupts,
        # and print the error so failures are visible.
        print(e)
        return ""


# Generate questions row by row. The prompt ends with "1.", so the model's
# text omits that prefix; add it back to get a complete numbered list.
df['questions'] = "1." + df.apply(get_questions, axis=1)
print(df['questions'].iloc[0])

[1/30]
 When were the 2020 Summer Olympics originally scheduled to take place?
2. When was the event postponed to 2021?
3. How much did the Games cost in total?
4. Which city hosted the 2020 Summer Olympics for the second time?
5. What new events were introduced in existing sports?
6. What was the total medal count for the United States?
7. What was the total medal count for China?
8. What was the total medal count for Japan?
9. What country won the most gold medals?
10. What was the first-ever Olympic gold medal for Bermuda?
[2/30]
 When did the IOC vote to select the host city of the 2020 Summer Olympics?
2. Which city was eliminated in the first round of voting?
3. Who won the final vote for the host city of the 2020 Summer Olympics?
[3/30]
 What are the potential impacts of COVID-19 on the 2020 Summer Olympics?
2. What did Tokyo organizers and the International Olympic Committee say about the potential impacts of COVID-19?
3. What are the challenges posed by COVID-19 for the organi

In [12]:
# Show the raw text of the first section for comparison with its questions.
print(df['content'].iloc[0])

The 2020 Summer Olympics, officially the Games of the XXXII Olympiad and also known as Tokyo 2020, was an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan, with some preliminary events that began on 21 July 2021. Tokyo was selected as the host city during the 125th IOC Session in Buenos Aires, Argentina, on 7 September 2013.Originally scheduled to take place from 24 July to 9 August 2020, the event was postponed to 2021 on 24 March 2020 due to the global COVID-19 pandemic, the first such instance in the history of the Olympic Games (previous games had been cancelled but not rescheduled). However, the event retained the Tokyo 2020 branding for marketing purposes. It was largely held behind closed doors with no public spectators permitted due to the declaration of a state of emergency in the Greater Tokyo Area in response to the pandemic, the first and only Olympic Games to be held without official spectators. The Games were the most expensive ever, with

In [69]:
# Create answers based on the context
def get_answers(row):
    """Ask the OpenAI Completions API to answer a row's questions from its context.

    Args:
        row: Object exposing ``context`` (the source text) and ``questions``
            (the numbered question list); both are inserted into the prompt.

    Returns:
        The raw completion text. The prompt ends with "1.", so the text
        starts with the body of the first answer. Returns "" when anything
        in the request or response handling raises; the error is printed.
    """
    try:
        answer_prompt = (
            "Write answer based on the text below\n\n"
            f"Text: {row.context}\n\n"
            f"Questions:\n{row.questions}\n\n"
            "Answers:\n1."
        )
        completion = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=answer_prompt,
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        first_choice = completion['choices'][0]
        return first_choice['text']
    except Exception as err:
        # Best-effort: report the failure and leave this row's answer empty.
        print(err)
        return ""


# Generate answers row by row; the prompt ends with "1.", so restore that
# prefix on the returned text.
df['answers'] = "1." + df.apply(get_answers, axis=1)
# Drop rows containing NaN, then renumber the index from zero.
df = df.dropna().reset_index().drop(columns='index')
print(df['answers'].iloc[0])

1. The 2020 Summer Olympics were an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan.
2. Originally scheduled to take place from 24 July to 9 August 2020, the event was postponed to 2021 on 24 March 2020 due to the global COVID-19 pandemic.
3. The Games were the most expensive ever, with total spending of over $20 billion.
4. The Games were the fourth Olympic Games to be held in Japan, following the 1964 Summer Olympics (Tokyo), 1972 Winter Olympics (Sapporo), and 1998 Winter Olympics (Nagano).
5. Tokyo became the first city in Asia to hold the Summer Olympic Games twice.
6. The 2020 Games were the second of three consecutive Olympics to be held in East Asia, following the 2018 Winter Olympics in Pyeongchang, South Korea and preceding the 2022 Winter Olympics in Beijing, China.
7. Due to the one-year postponement, Tokyo 2020 was the first and only Olympic Games to have been held in an odd-numbered year and the first Summer Olympics since 1900 to be hel

In [70]:
# Save the Olympics Q&A dataset based on Wikipedia sections
# Change file name when using 'olympics_sections.csv'
# Persist the generated Q&A rows without writing the DataFrame index.
df.to_csv(path_or_buf='olympics-data/mock_olympics_qa.csv', index=False)

In [71]:
# DEPRECATED: The /search endpoint is deprecated in favour of using embeddings. Embeddings are cheaper, faster and can support a better search experience. See Question Answering Guide for a search implementation using the embeddings
# Keep only sections under 2000 tokens and export them as JSON Lines with
# the column names used for the (legacy) search file: text + metadata.
df = df[df.tokens<2000]
df[['context', 'tokens']].rename(columns={'context':'text','tokens':'metadata'}).to_json('olympics-data/olympics_search.jsonl', orient='records', lines=True)

try:
    # Use a context manager so the file handle is closed even if the upload fails.
    with open("olympics-data/olympics_search.jsonl") as search_jsonl:
        search_file = openai.File.create(
            file=search_jsonl,
            purpose='search'
        )
    olympics_search_fileid = search_file['id']
except openai.error.InvalidRequestError as e:
    # The API rejects purpose='search' now that the endpoint is deprecated.
    # Display the error instead of aborting so "Run All" still completes.
    print(f"{type(e).__name__}: {e}")
    olympics_search_fileid = None

InvalidRequestError: 'search' is not one of ['fine-tune'] - 'purpose'

In [None]:
# Since the Search API is deprecated, this example ends here.
# Use the embeddings-based approach instead:
# https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb