# Wikipedia QA OpenAI example

In [15]:
import openai
import os
import json

In [7]:
# Read the API key from the environment and hand it to both the module-level
# setting and the explicit client instance used for every request below
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPEN_AI_API_KEY
client = openai.OpenAI(api_key=OPEN_AI_API_KEY)

## OpenAI Model Responses without Customization


In [11]:
# The question is sent as-is, with no extra context supplied to the model.
# Written with explicit escapes so the trailing space after "Answer:" is visible.
question_prompt = "\nWho is the owner of twitter?\nAnswer: \n"

# Send a system + user message pair to the chat completions endpoint
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": question_prompt},
    ],
)

### Extracting the Model Response

In [23]:
# Round-trip through JSON to display the response as a plain dict.
# `model_dump_json()` is used instead of the deprecated pydantic `.json()`.
json.loads(completion.model_dump_json())

{'id': 'chatcmpl-8uIdWIIINa0IAxgBK3KqxKQ5NXbVV',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': 'The CEO and co-founder of Twitter is Jack Dorsey.',
    'role': 'assistant',
    'function_call': None,
    'tool_calls': None}}],
 'created': 1708428698,
 'model': 'gpt-3.5-turbo-0125',
 'object': 'chat.completion',
 'system_fingerprint': 'fp_69829325d0',
 'usage': {'completion_tokens': 12, 'prompt_tokens': 27, 'total_tokens': 39}}

#### Extracting Response Text


In [25]:
# Prints the whole message object of the first choice (content, role,
# function_call, tool_calls); the answer text alone is its `content` attribute
print(completion.choices[0].message)

ChatCompletionMessage(content='The CEO and co-founder of Twitter is Jack Dorsey.', role='assistant', function_call=None, tool_calls=None)


## Get external data

In [42]:
import requests
import pandas as pd
from dateutil.parser import parse, ParserError

* [Wikipedia API documentation](https://en.wikipedia.org/w/api.php)

In [31]:
# Get the Wikipedia page for "2022" since OpenAI's models stop in 2021
params = {
    "action": "query",
    "prop": "extracts",
    "exlimit": 1,
    "titles": "2022",
    "explaintext": 1,
    "formatversion": 2,
    "format": "json",
}
# Bound the wait and fail on HTTP errors here, instead of hanging or hitting
# a confusing JSON/KeyError when the response is indexed below
resp = requests.get(
    "https://en.wikipedia.org/w/api.php", params=params, timeout=30
)
resp.raise_for_status()
response_dict = resp.json()
# Preview the first ten lines of the plain-text extract
response_dict["query"]["pages"][0]["extract"].split("\n")[:10]

['2022 (MMXXII) was a common year starting on Saturday of the Gregorian calendar, the 2022nd year of the Common Era (CE) and Anno Domini (AD) designations, the 22nd  year of the 3rd millennium and the 21st century, and the  3rd   year of the 2020s decade.  ',
 'The year 2022 saw the removal of nearly all COVID-19 restrictions and the reopening of international borders in most countries, and the global rollout of COVID-19 vaccines continued. The global economic recovery from the pandemic continued, though many countries experienced an ongoing inflation surge; in response, many central banks raised their interest rates to landmark levels. The world population reached eight billion people in 2022, though the year also witnessed numerous natural disasters, including two devastating Atlantic hurricanes (Fiona and Ian), and the most powerful volcano eruption of the century so far. The later part of the year also saw the first public release of ChatGPT by OpenAI starting an arms race in artif

### Clean the data

In [32]:
import pandas as pd

# One row per line of the page extract, in a single "text" column
df = pd.DataFrame(
    {"text": response_dict["query"]["pages"][0]["extract"].split("\n")}
)
df.head()

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year 2022 saw the removal of nearly all CO...
2,2022 was also dominated by wars and armed conf...
3,
4,


In [55]:
def clean_wikipedia_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop empty lines and ``==``-prefixed section headings from the page text.

    Args:
        df: Frame with a ``text`` column holding one line of the page per row.

    Returns:
        A new frame (the input is left untouched) containing only rows whose
        ``text`` is non-empty and does not start with ``"=="``. The original
        index labels are preserved. Rows with missing text are dropped.
    """
    text = df["text"]
    is_non_empty = text.str.len() > 0
    # na=False keeps the mask strictly boolean so `~` cannot fail on missing text
    is_heading = text.str.startswith("==", na=False)

    # Explicit copy so later writes to the result never touch the caller's frame
    return df[is_non_empty & ~is_heading].copy()


def parse_dates(df_cleaned: pd.DataFrame) -> pd.DataFrame:
    """Make every text sample start with its date, then keep only dated rows.

    In some cases dates are used as headings instead of being part of the
    text sample. A row that parses as a date becomes the running prefix, and
    each following row without " – " (en dash) is rewritten as
    ``"<prefix> – <text>"``.

    Args:
        df_cleaned: Frame with a ``text`` column of strings.

    Returns:
        A new frame with a fresh 0..n-1 index containing only the rows whose
        text contains " – ". The input frame is not modified.
    """
    prefix = ""
    dated_texts = []
    # Collect the rewritten texts in a list instead of assigning to the row
    # yielded by iterrows(): that row may be a copy, so the write can be lost.
    for text in df_cleaned["text"]:
        # If the text already has " – ", it already has the needed date prefix
        if " – " not in text:
            try:
                parse(text)
            except (ParserError, OverflowError):
                # Not a date (or an out-of-range one): add the current prefix
                text = prefix + " – " + text
            else:
                # The text is a bare date heading: it becomes the new prefix
                prefix = text
        dated_texts.append(text)

    # Keep the column's dtype so the .str accessor works even on an empty frame
    dated_column = pd.Series(
        dated_texts, index=df_cleaned.index, dtype=df_cleaned["text"].dtype
    )
    result = df_cleaned.assign(text=dated_column)

    # Bare date headings never gained " – " and are dropped here
    return result[result["text"].str.contains(" – ")].reset_index(drop=True)


# Filter out blanks/headings first, then attach date prefixes
df_cleaned = parse_dates(clean_wikipedia_data(df))

In [57]:
# Inspect the last rows of the cleaned, date-prefixed samples
df_cleaned.tail()

Unnamed: 0,text
177,December 21–December 26 – A major winter storm...
178,December 24 – 2022 Fijian general election: Th...
179,December 29 – Brazilian football legend Pelé d...
180,December 31 – Former Pope Benedict XVI dies at...
181,December 7 – The world population was estimate...


In [58]:
# to_csv does not create missing parent directories, so make sure it exists
os.makedirs("data", exist_ok=True)
df_cleaned.to_csv("data/wikipedia_data.csv")

## Convert into embeddings

To create our chatbot, we'll need to convert our natural language data into numeric representations that our machine learning model can process. We need these representations to capture the relationships within the data so that the model can recognize patterns and identify the most relevant content.



In [60]:
import pandas as pd
import openai
import numpy as np


# Same key/client setup as at the top of the notebook, repeated for this section
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPEN_AI_API_KEY
client = openai.OpenAI()

# Reload the cleaned samples from disk; the first CSV column is the saved index
df = pd.read_csv("data/wikipedia_data.csv", index_col=0)
df.sample(5)

Unnamed: 0,text
47,March 11 – Gabriel Boric is sworn in as Presid...
40,March 4 – Insurgency in Khyber Pakhtunkhwa: An...
28,February 28 – The Intergovernmental Panel on C...
161,November 11 – Russian invasion of Ukraine: Ukr...
26,February 27 – Russian invasion of Ukraine: Eur...


In [63]:
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines are replaced with spaces before the text is sent. Note that the
    default ``model`` differs from ``EMBEDDING_MODEL_NAME``; pass ``model``
    explicitly to use the notebook-wide setting.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` of the first (and only) item in the API response.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding


# One embeddings request per row; `model` is forwarded to get_embedding
df["embeddings"] = df["text"].apply(get_embedding, model=EMBEDDING_MODEL_NAME)
df.to_csv("data/wikipedia_embeddings.csv")

In [65]:
df = pd.read_csv("data/wikipedia_embeddings.csv", index_col=0)
# The CSV stores each vector as its list literal (e.g. "[-0.01, 0.02, ...]").
# Parse it as data with json.loads rather than executing file contents via eval.
df["embeddings"] = df.embeddings.apply(json.loads).apply(np.array)
df.sample(5)

Unnamed: 0,text,embeddings
47,March 11 – Gabriel Boric is sworn in as Presid...,"[-0.012513521127402782, -0.009597179479897022,..."
139,September 27–September 30 – Hurricane Ian impa...,"[-0.026857515797019005, -0.01812230423092842, ..."
90,June 19 – The second round of the 2022 legisla...,"[-0.0288669615983963, -0.002565045142546296, -..."
17,February 3 – Islamic State leader Abu Ibrahim ...,"[-0.038417767733335495, 0.007409140467643738, ..."
51,March 21 – China Eastern Airlines Flight 5735 ...,"[-0.012071493081748486, -0.005423060152679682,..."
