In [1]:
import requests

# Get Wikipedia page for 2023 Turkey-Syria Earthquake
params = {
    "action": "query", 
    "prop": "extracts", 
    "exlimit": 1, 
    "titles": "2023_Turkey-Syria_earthquake", 
    "explaintext": 1, 
    "formatversion": 2, 
    "format": "json"
}
resp = requests.get("https://en.wikipedia.org/w/api.php", params=params)
response_dict = resp.json()

In [4]:
text_data = response_dict["query"]["pages"][0]["extract"].split("\n")
text_data

['']

In [9]:
import requests
from bs4 import BeautifulSoup

def extract_wikipedia_page_content(title):
    # Format the title to be URL-friendly
    formatted_title = title.replace(' ', '_')
    url = f"https://en.wikipedia.org/wiki/{formatted_title}"

    response = requests.get(url)

    if response.status_code != 200:
        return f"Error: Unable to access page titled '{title}'."

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the content text
    content_div = soup.find('div', {'class': 'mw-parser-output'})
    if not content_div:
        return f"Error: Unable to find content for page titled '{title}'."

    # Extract all paragraphs within the content div
    paragraphs = content_div.find_all('p')
    page_content = [para.get_text() for para in paragraphs]

    return page_content

# Example usage
title = "GPT-4"
page_content = extract_wikipedia_page_content(title)
print(page_content[0])





In [11]:
import pandas as pd

# Load page content into dataframe
df = pd.DataFrame()
df['text'] = page_content

# CLean dataframe to remove empty cells
df = df[(
    (df["text"].str.len() > 0) & (~df["text"].str.startswith("=="))
)].reset_index(drop=True)
df.head()

Unnamed: 0,text
0,\n
1,Generative Pre-trained Transformer 4 (GPT-4) i...
2,Observers reported that the iteration of ChatG...
3,OpenAI introduced the first GPT model (GPT-1) ...
4,Rumors claim that GPT-4 has 1.76 trillion para...


## Create Embeddings Index

In [None]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
df.to_csv('output/embedded.csv', index=False)