In [26]:
import openai
import pinecone
from datasets import load_dataset
import os

In [27]:
# Service credentials are read from the process environment so no secret is
# stored in the notebook; a variable that is not set yields None.
pinecone_api_key = os.environ.get("pinecone_api_key")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Pinecone environment name handed to pinecone.init() later in the notebook.
pinecone_environment = "asia-northeast1-gcp"


In [28]:
# Embedding model used for every embedding request in this notebook.
MODEL = "text-embedding-ada-002"

# Embed two sample strings in a single request to inspect the response shape.
res = openai.Embedding.create(
    engine=MODEL,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)

In [29]:
# Show the raw response object returned by the embedding request.
res

<OpenAIObject list at 0x24a6c3ab270> JSON: {
  "data": [
    {
      "embedding": [
        -0.0031135426834225655,
        0.011766765266656876,
        -0.00509151816368103,
        -0.027159256860613823,
        -0.01633599027991295,
        0.03237545117735863,
        -0.016160769388079643,
        -0.0010808103252202272,
        -0.02583836019039154,
        -0.006641550455242395,
        0.02012345939874649,
        0.016672953963279724,
        -0.009178885258734226,
        0.02331787347793579,
        -0.010149340145289898,
        0.013458321802318096,
        0.02527226135134697,
        -0.016915567219257355,
        0.012056553736329079,
        -0.01636294648051262,
        -0.004303023684769869,
        -0.006402306258678436,
        -0.00437378603965044,
        0.020810864865779877,
        -0.010567175224423409,
        -0.003726816037669778,
        0.013626803644001484,
        -0.02635054476559162,
        -0.0004172029148321599,
        -0.0021852082572877407,
  

In [30]:
# Report the length of each of the two embedding vectors in the response,
# one line per vector.
print("\n".join(
    f"vector {pos}: {len(res['data'][pos]['embedding'])}" for pos in (0, 1)
))

vector 0: 1536
vector 1: 1536


In [31]:
# Keep only the embedding vector from every record in the response.
embeds = [item['embedding'] for item in res['data']]

# Number of vectors collected.
len(embeds)

2

In [32]:
# Length of the first embedding vector; the same expression is used as the
# index dimension when the Pinecone index is created.
len(embeds[0])

1536

In [33]:

index_name = 'semantic-search-openai'

# Connect the client using the credentials and environment configured earlier.
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment,
)

# Create the index only when it is not already listed; its dimension is taken
# from the length of a sample embedding vector.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeds[0]),
    )

# Handle used for the upsert and query calls below.
index = pinecone.Index(index_name)

In [34]:
# Load the `train[:1000]` slice of the TREC question dataset.
trec = load_dataset('trec', split='train[:1000]')

Found cached dataset trec (C:/Users/fish/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


In [35]:
# Inspect a single record: a question text plus its coarse and fine labels.
trec[1]

{'text': 'What films featured the character Popeye Doyle ?',
 'coarse_label': 1,
 'fine_label': 5}

In [36]:
# Preview only the first few records; printing every row of the split floods
# the cell output without adding information.
for row_idx in range(min(10, len(trec))):
    print(trec[row_idx])

{'text': 'How did serfdom develop in and then leave Russia ?', 'coarse_label': 2, 'fine_label': 26}
{'text': 'What films featured the character Popeye Doyle ?', 'coarse_label': 1, 'fine_label': 5}
{'text': "How can I find a list of celebrities ' real names ?", 'coarse_label': 2, 'fine_label': 26}
{'text': 'What fowl grabs the spotlight after the Chinese Year of the Monkey ?', 'coarse_label': 1, 'fine_label': 2}
{'text': 'What is the full form of .com ?', 'coarse_label': 0, 'fine_label': 1}
{'text': 'What contemptible scoundrel stole the cork from my lunch ?', 'coarse_label': 3, 'fine_label': 29}
{'text': "What team did baseball 's St. Louis Browns become ?", 'coarse_label': 3, 'fine_label': 28}
{'text': 'What is the oldest profession ?', 'coarse_label': 3, 'fine_label': 30}
{'text': 'What are liver enzymes ?', 'coarse_label': 2, 'fine_label': 24}
{'text': 'Name the scar-faced bounty hunter of The Old West .', 'coarse_label': 3, 'fine_label': 29}
{'text': 'When was Ozzy Osbourne born ?'

In [37]:
batch_size = 32  # number of texts embedded and upserted per request

# Look the text column up once instead of repeating trec['text'] several
# times on every iteration.
texts = trec['text']

for i in range(0, len(texts), batch_size):
    # clamp the final batch to the end of the data
    i_end = min(i + batch_size, len(texts))
    # texts for this batch; their row positions double as unique vector IDs
    lines_batch = texts[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings for the batch
    res = openai.Embedding.create(input=lines_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # keep the source text as metadata so query results are readable
    meta = [{'text': line} for line in lines_batch]
    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

In [38]:
query = "What caused the 1929 Great Depression?"

# Embed the question with the same model used for the indexed texts.
xq = openai.Embedding.create(
    engine=MODEL,
    input=query,
)['data'][0]['embedding']

# Fetch the five best matches, including the metadata stored with each vector.
res = index.query([xq], include_metadata=True, top_k=5)
res

{'matches': [{'id': '932',
              'metadata': {'text': 'Why did the world enter a global '
                                   'depression in 1929 ?'},
              'score': 0.917954087,
              'values': []},
             {'id': '787',
              'metadata': {'text': "When was `` the Great Depression '' ?"},
              'score': 0.87167418,
              'values': []},
             {'id': '400',
              'metadata': {'text': 'What crop failure caused the Irish Famine '
                                   '?'},
              'score': 0.812258482,
              'values': []},
             {'id': '775',
              'metadata': {'text': 'What historical event happened in Dogtown '
                                   'in 1899 ?'},
              'score': 0.798895657,
              'values': []},
             {'id': '481',
              'metadata': {'text': 'What caused the Lynmouth floods ?'},
              'score': 0.79227531,
              'values': []}],
 'namespac

In [39]:
# One line per match: score rounded to two decimals, then the stored text.
for match in res['matches']:
    text = match['metadata']['text']
    print(f"{match['score']:.2f}: {text}")

0.92: Why did the world enter a global depression in 1929 ?
0.87: When was `` the Great Depression '' ?
0.81: What crop failure caused the Irish Famine ?
0.80: What historical event happened in Dogtown in 1899 ?
0.79: What caused the Lynmouth floods ?


In [40]:
query = "What was the cause of the major recession in the early 20th century?"

# Embed the question with the same model used for the indexed texts.
xq = openai.Embedding.create(
    engine=MODEL,
    input=query,
)['data'][0]['embedding']

# Fetch the five best matches, including the metadata stored with each vector.
res = index.query([xq], include_metadata=True, top_k=5)

# Print each match as "<score>: <text>".
for match in res['matches']:
    text = match['metadata']['text']
    print(f"{match['score']:.2f}: {text}")

0.88: Why did the world enter a global depression in 1929 ?
0.83: When was `` the Great Depression '' ?
0.81: What crop failure caused the Irish Famine ?
0.80: When did World War I start ?
0.80: What were popular songs and types of songs in the 1920s ?


In [41]:
query = "Why was there a long-term economic downturn in the early 20th century?"

# Embed the question with the same model used for the indexed texts.
xq = openai.Embedding.create(
    engine=MODEL,
    input=query,
)['data'][0]['embedding']

# Fetch the five best matches; their metadata supplies the contexts used for
# the augmented prompt built in the following cell.
res = index.query([xq], include_metadata=True, top_k=5)

res

{'matches': [{'id': '932',
              'metadata': {'text': 'Why did the world enter a global '
                                   'depression in 1929 ?'},
              'score': 0.898477376,
              'values': []},
             {'id': '787',
              'metadata': {'text': "When was `` the Great Depression '' ?"},
              'score': 0.842935622,
              'values': []},
             {'id': '262',
              'metadata': {'text': 'When did World War I start ?'},
              'score': 0.802457333,
              'values': []},
             {'id': '400',
              'metadata': {'text': 'What crop failure caused the Irish Famine '
                                   '?'},
              'score': 0.800769389,
              'values': []},
             {'id': '864',
              'metadata': {'text': 'When did the Dow first reach ?'},
              'score': 0.798610508,
              'values': []}],
 'namespace': ''}

In [42]:
# Text of every retrieved match, in the order the matches were returned.
contexts = [match['metadata']['text'] for match in res['matches']]

# Prompt layout: contexts separated by "---", then a "-----" rule, then the
# user's question.
augmented_query = "{}\n\n-----\n\n{}".format("\n\n---\n\n".join(contexts), query)

In [43]:
# System prompt telling the model to answer only from the information the
# user supplies above the question. Plain string: it has no placeholders.
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

# Ask the chat model, sending the retrieved contexts plus the question as the
# user message.
res = openai.ChatCompletion.create(
    messages=[
        dict(role="system", content=primer),
        dict(role="user", content=augmented_query),
    ],
    model="gpt-3.5-turbo",
)

In [44]:
from IPython.display import Markdown

# Render the first choice's message content as Markdown.
answer_text = res['choices'][0]['message']['content']
display(Markdown(answer_text))

The world entered a global depression in 1929 due to a stock market crash in the United States. 

The Great Depression lasted from 1929 to 1939. 

World War I started on July 28, 1914. 

The Irish Famine, also known as the Great Famine, was caused by the failure of the potato crop from 1845-1852. 

The Dow Jones Industrial Average was first calculated on May 26, 1896. 

There were various factors that contributed to the long-term economic downturn in the early 20th century, including overproduction and unequal distribution of wealth, as well as the effects of World War I and the Great Depression.

In [46]:
# Same primer as before, but the user message is the bare question with no
# retrieved contexts attached.
res = openai.ChatCompletion.create(
    messages=[
        dict(role="system", content=primer),
        dict(role="user", content=query),
    ],
    model="gpt-3.5-turbo",
)

# Render the first choice's message content as Markdown.
display(Markdown(res['choices'][0]['message']['content']))

The early 20th century was marked by several economic downturns, including the Panic of 1907 and the Great Depression of the 1930s. The causes of these downturns were varied and complex, but some factors that contributed to them include stagnant wages for workers, excess investment and speculation in the stock market, overproduction of goods, and a decline in international trade due to protectionist policies. Additionally, factors such as the collapse of agricultural prices, bank failures, and the reduction of consumer spending all played roles in the economic downturns of the early 20th century.

In [47]:
# Bare question again, this time with a shorter system prompt that does not
# include the "I don't know" instruction.
res = openai.ChatCompletion.create(
    messages=[
        dict(
            role="system",
            content="You are Q&A bot. A highly intelligent system that answers user questions",
        ),
        dict(role="user", content=query),
    ],
    model="gpt-3.5-turbo",
)

# Render the first choice's message content as Markdown.
display(Markdown(res['choices'][0]['message']['content']))

There were several factors that contributed to the long-term economic downturn in the early 20th century. One of the major causes was the stock market crash of 1929, which led to the Great Depression. This was a period of economic recession that lasted for many years and saw high levels of unemployment and poverty.

Other factors that contributed to the downturn included the decline of the agricultural sector, the rise of protectionism and trade barriers, and the end of World War I, which disrupted the global economy. Additionally, excessive speculation and overproduction in various industries also contributed to the economic problems of the time.

In [48]:
# Optional cleanup, left disabled: uncomment the line below to delete the
# Pinecone index created earlier in this notebook.
#pinecone.delete_index(index_name)