### Imports

In [2]:
import ast
import configparser
import re

import numpy as np
import openai
import pandas as pd
import pinecone

### Read data

In [3]:
# Load the pre-computed embeddings. The 'index' column is read as a string
# because it is used further down as the vector id sent to Pinecone.
df = pd.read_csv('../data/processed/embeddings.csv', dtype={'index': 'string'})
# Each embedding is stored in the CSV as the text of a Python list. Parse it
# with ast.literal_eval, which only accepts literals, rather than eval, which
# would execute any expression found in the file.
df['embedding_ada'] = df['embedding_ada'].apply(ast.literal_eval)
# NOTE(review): the preview rendered by this cell shows identical 'combined',
# 'n_tokens' and 'embedding_ada' values in the first three rows -- verify the
# upstream step that produced embeddings.csv.
df.head(3)

Unnamed: 0,url,index,resolved,status,datetime_utc,error,filename,mimetype,encoding,extract_error,...,comments,author,categories,tags,date,sitename,clean_content,combined,n_tokens,embedding_ada
0,https://unamglobal.unam.mx/comunidades-indigen...,46,,200,2023-04-02T15:25:07.419616,,16dd649be430106e823d0f6ad1d7c638.html,text/html,utf-8,,...,,Beto Torres,BLOG|Opinión,blog|opinión|blog|opinión,2021-02-09,UNAM Global,La pandemia derivada de la COVID-19 ha signifi...,Titulo: 0 Comunidades indígenas urbanas en...,352,"[0.0006488185026682913, -0.0057064443826675415..."
1,https://www.milenio.com/negocios/larry-rubin-t...,0,,200,2023-04-02T15:25:07.112501,,bf6c8a5bb799314dcdb37c7085c38a81.html,text/html,utf-8,,...,,Eduardo de la Rosa,Negocios,American Society of Mexico|tatiana clouthier|a...,2022-10-06,Grupo Milenio,El presidente de la American Society of Mexico...,Titulo: 0 Comunidades indígenas urbanas en...,352,"[0.0006488185026682913, -0.0057064443826675415..."
2,https://unamglobal.unam.mx/el-giro-y-el-miedo-...,47,,200,2023-04-02T15:25:09.140811,,99ef9ca8a1832cd7de1d7cf6cb6d54c8.html,text/html,utf-8,,...,,Beto Torres,BLOG|Opinión,blog|opinión|blog|opinión,2021-05-19,UNAM Global,Hace unos dias se celebro en Turin la centesim...,Titulo: 0 Comunidades indígenas urbanas en...,352,"[0.0006488185026682913, -0.0057064443826675415..."


In [4]:
# Remove text on string after '|' symbol: everything from a whitespace
# character followed by '|' up to the end of the line is dropped.
# The vectorised .str accessor replaces the per-row lambda and leaves missing
# titles as missing values instead of raising a TypeError inside re.sub.
df['clean_title'] = df['title'].str.replace(r'\s\|.*', '', regex=True)

In [5]:
# Shorthand for the column of parsed embedding vectors
embeds = df.embedding_ada

### Set up Pinecone API key

In [6]:
# Create a parser object and disable interpolation, so '%' characters in the
# stored values are read literally
parser = configparser.ConfigParser(interpolation=None)

# Read data from 'config.ini' file. read() silently skips files it cannot open
# and returns the list of files it actually parsed, so check that result to
# fail with a clear message instead of a NoSectionError on the lookup below.
if not parser.read("../config.ini"):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")

# Get 'key' from pinecone section
api_key = parser.get('pinecone', 'key')

### Store vector embeddings with [`Pinecone`](https://www.pinecone.io/)

In [7]:
# Name of the Pinecone index used by the cells below
index_name = 'semantic-search-openai'

# Open the connection to Pinecone with the key loaded from config.ini.
# The environment string is shown next to the api key in the Pinecone console.
pinecone.init(environment="asia-southeast1-gcp", api_key=api_key)

In [8]:
# Create the index only when it does not exist yet. Its dimension is the length
# of the first embedding, and vectors are compared with the cosine metric.
index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    pinecone.create_index(index_name, metric="cosine", dimension=len(embeds[0]))

In [9]:
# Connect to the index named by `index_name`; the resulting handle is used
# below for the upsert and query calls.
index = pinecone.Index(index_name)

In [11]:
# Each vector's metadata stores the (clean_title, clean_content) pair under 'content'
title_content_pairs = zip(df['clean_title'], df['clean_content'])
meta = [dict(content=pair) for pair in title_content_pairs]
# Every record is an (id, embedding, metadata) triple
to_upsert = zip(df['index'], embeds, meta)
# Send all records to Pinecone in a single request
index.upsert(vectors=[*to_upsert])

{'upserted_count': 100}

### Set up OpenAI API key

In [26]:
# Create a parser object and disable interpolation, so '%' characters in the
# stored values are read literally
parser = configparser.ConfigParser(interpolation=None)

# Read data from 'config.ini' file. read() silently skips files it cannot open
# and returns the list of files it actually parsed, so check that result to
# fail with a clear message instead of a NoSectionError on the lookup below.
if not parser.read("../config.ini"):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")

# Get 'key' from openai section
# NOTE: this rebinds `api_key`, which held the Pinecone key until this cell.
# Re-running the pinecone.init cell afterwards would pass it the OpenAI key.
api_key = parser.get('openai', 'key')

# Set up OpenAI API key
openai.api_key = api_key

In [33]:
# Two free-text search terms; their embeddings are averaged further down
query1, query2 = "Arte", "Cultura"

In [34]:
# Embed each query with the ada-002 model (one request per query, in order)
# and keep only the embedding vector from each response.
emb_q1, emb_q2 = (
    openai.Embedding.create(input=query, engine="text-embedding-ada-002")['data'][0]['embedding']
    for query in (query1, query2)
)


In [35]:
# Average the two query embeddings element-wise into one vector and convert
# the result to a plain Python list.
stacked_queries = np.asarray([emb_q1, emb_q2])
user_model = stacked_queries.mean(axis=0).tolist()

In [36]:
# Ask the index for the 3 closest stored vectors to the averaged query
# embedding, with their metadata included in the response.
res = index.query(
    [user_model],
    include_metadata=True,
    top_k=3,
)

In [37]:
# Display the raw query response
res

{'matches': [{'id': '92',
              'metadata': {'content': ['IKEA “celebra” a Latinoamérica con '
                                       'lanzamiento de colección ÖMSESIDIG',
                                       '- IKEA lanzo OMSESIDIG, su primera '
                                       'coleccion de productos creada en '
                                       'colaboracion con artistas '
                                       'latinoamericanos. - En espanol '
                                       'significa “mutuo” y explora las '
                                       'celebraciones, tradiciones de reunion '
                                       'y cultura en America Latina. - '
                                       'Business Insider Mexico platico con '
                                       'Friso Wiersma, disenador de IKEA of '
                                       'Sweden que dirigio OMSESIDIG, sobre el '
                                       'proceso para elaborar s

In [38]:
# One summary line per match: vector id, score rounded to two decimals, and
# the first element of the stored 'content' metadata (the cleaned title).
for match in res['matches']:
    title = match['metadata']['content'][0]
    print(f"index: {match['id']}, score: {match['score']:.2f}, content: {title}")

index: 92, score: 0.78, content: IKEA “celebra” a Latinoamérica con lanzamiento de colección ÖMSESIDIG
index: 47, score: 0.78, content: El Giro y el miedo a la bici
index: 2, score: 0.78, content: Astronauta mexicana llevó cenizas de su abuelo al espacio
