### Imports

In [1]:
import pandas as pd
import tiktoken
import openai
import configparser
import customized_headlines.data.load as load

### Load data

In [2]:
# Pull the data locations exposed by the project's `load` module.
# `news` is handed to pd.read_csv further down in this notebook.
news, data_processed = load.news, load.data_processed

### Set up OpenAI API key

In [3]:
# Load the OpenAI credentials from the project-level config file.
# Interpolation is disabled so a '%' character inside the key is read literally.
parser = configparser.ConfigParser(interpolation=None)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a wrong path fails here with
# a clear message instead of a confusing NoSectionError on the lookup below.
config_path = "../config.ini"
if not parser.read(config_path):
    raise FileNotFoundError("Could not read configuration file: " + config_path)

# Get the API key from the [openai] section
api_key = parser.get('openai', 'key')

# Set up OpenAI API key
openai.api_key = api_key

### Getting embeddings with model [`text-embedding-ada-002`](https://platform.openai.com/docs/guides/embeddings/embedding-models) from OpenAI

In [4]:
# Settings for the OpenAI embedding request
embedding_model = "text-embedding-ada-002"

# Tokenizer encoding that goes with text-embedding-ada-002
embedding_encoding = "cl100k_base"

# Stay a little below the model's maximum of 8191 tokens
max_tokens = 8_000

### Read data

In [5]:
# Load the news articles into a dataframe
df = pd.read_csv(news)

# Report the (rows, columns) of what was loaded
print(df.shape)

# Preview the first record
df.iloc[:1]

(100, 22)


Unnamed: 0,url,index,resolved,status,datetime_utc,error,filename,mimetype,encoding,extract_error,...,description,raw_content,comments,author,categories,tags,date,sitename,clean_content,combined
0,https://unamglobal.unam.mx/comunidades-indigen...,46,,200,2023-04-02T15:25:07.419616,,16dd649be430106e823d0f6ad1d7c638.html,text/html,utf-8,,...,La pandemia derivada de la COVID-19 ha signifi...,La pandemia derivada de la COVID-19 ha signifi...,,Beto Torres,BLOG|Opinión,blog|opinión|blog|opinión,2021-02-09,UNAM Global,La pandemia derivada de la COVID-19 ha signifi...,Titulo: 0 Comunidades indígenas urbanas en...


### Process data

In [6]:
# Tokenizer for the configured encoding, used to measure each text's length
encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of every combined text
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))

# Drop articles that are too long to embed. Take an explicit copy so that
# columns added to df_filtered later are written to an independent frame
# rather than to a slice of df (avoids pandas' SettingWithCopyWarning).
df_filtered = df[df.n_tokens <= max_tokens].copy()

# NOTE(review): the saved output of the following cell shows the same `combined`
# preview and n_tokens == 352 on every displayed row, which suggests `combined`
# was built incorrectly upstream - confirm before requesting embeddings.
len(df_filtered)

100

In [7]:
# Show the filtered articles ordered from fewest to most tokens
df_filtered.sort_values("n_tokens")

Unnamed: 0,url,index,resolved,status,datetime_utc,error,filename,mimetype,encoding,extract_error,...,raw_content,comments,author,categories,tags,date,sitename,clean_content,combined,n_tokens
0,https://unamglobal.unam.mx/comunidades-indigen...,46,,200,2023-04-02T15:25:07.419616,,16dd649be430106e823d0f6ad1d7c638.html,text/html,utf-8,,...,La pandemia derivada de la COVID-19 ha signifi...,,Beto Torres,BLOG|Opinión,blog|opinión|blog|opinión,2021-02-09,UNAM Global,La pandemia derivada de la COVID-19 ha signifi...,Titulo: 0 Comunidades indígenas urbanas en...,352
72,https://www.milenio.com/negocios/amcham-nombra...,29,,200,2023-04-02T15:25:33.972909,,ef01922aa762415be21f6739b9140890.html,text/html,utf-8,,...,La American Chamber of Commerce of Mexico anun...,,Redacción,Negocios,economia|comercio|nombramiento|amcham mexico|e...,2023-01-16,Grupo Milenio,La American Chamber of Commerce of Mexico anun...,Titulo: 0 Comunidades indígenas urbanas en...,352
71,https://www.milenio.com/negocios/cargadores-re...,28,,200,2023-04-02T15:25:33.406106,,e84571e0c6cbcf40b90bae9d8fc70339.html,text/html,utf-8,,...,La venta de cargadores para vehículos eléctric...,,Fernanda Murillo,Negocios,autos electricos|abb|electromovilidad|autos el...,2023-01-13,Grupo Milenio,La venta de cargadores para vehiculos electric...,Titulo: 0 Comunidades indígenas urbanas en...,352
70,https://unamglobal.unam.mx/brasil-prohibe-vend...,62,,200,2023-04-02T15:25:33.914777,,a2fda1c42c79571f51c731a5f2c3df87.html,text/html,utf-8,,...,Se considera una práctica discriminatoria que ...,,Beto Torres,Tecnología|Apple|Brasil prohíbe vender iPhone ...,apple|brasil prohíbe vender iphone sin cargado...,2022-09-06,UNAM Global,Se considera una practica discriminatoria que ...,Titulo: 0 Comunidades indígenas urbanas en...,352
69,https://www.milenio.com/politica/monreal-guard...,27,,200,2023-04-02T15:25:32.731142,,3af9d502b4315edc4af6c470581a1c9a.html,text/html,utf-8,,...,"El coordinador de Morena en el Senado, Ricardo...",,Silvia Arellano,Política,elementos que vigilan el metro|municipios con ...,2023-01-13,Grupo Milenio,"El coordinador de Morena en el Senado, Ricardo...",Titulo: 0 Comunidades indígenas urbanas en...,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,https://businessinsider.mx/errores-65-pelicula...,87,,200,2023-04-02T15:25:17.216540,,c92bf676c5116b155d0254b0a8686ee7.html,text/html,utf-8,,...,- Adam Driver protagoniza una nueva película 6...,,Aline Sarmiento,Finanzas Personales|Vida Profesional|Deportes|...,Adam Driver|Dinosaurios|Hollywood|Películas,2023-04-01,Business Insider México | Noticias pensadas pa...,- Adam Driver protagoniza una nueva pelicula 6...,Titulo: 0 Comunidades indígenas urbanas en...,352
27,https://www.milenio.com/politica/amlo-preve-et...,17,,200,2023-04-02T15:25:16.415802,,9d25f719259fba20b4d74722b51f9e09.html,text/html,utf-8,,...,Al hablar sobre los resultados de su gobierno ...,,Omar Brito,Política,economia|inversion|programas de bienestar|admi...,2022-12-28,Grupo Milenio,Al hablar sobre los resultados de su gobierno ...,Titulo: 0 Comunidades indígenas urbanas en...,352
26,https://unamglobal.unam.mx/el-mundial-de-futbo...,76,,200,2023-04-02T15:25:16.199832,,54c78f904493490b7142ef3f81e0233a.html,text/html,utf-8,,...,El futbol tiene un sentido pedagógico para cre...,,Nelly,Campus|Deportes|Economía|NewsFeed|Imagen,campus|deportes|economía|newsfeed|campus|depor...,2022-12-04,UNAM Global,El futbol tiene un sentido pedagogico para cre...,Titulo: 0 Comunidades indígenas urbanas en...,352
36,https://www.milenio.com/negocios/canada-explor...,11,,200,2023-04-02T15:25:20.191073,,aaff418539ef464346b053be78b4db3b.html,text/html,utf-8,,...,"Mary Ng, ministra de Comercio de Canadá, infor...",,Redacción,Negocios,canada|T-MEC|Mary Ng|Consultas energéticas|can...,2022-11-24,Grupo Milenio,"Mary Ng, ministra de Comercio de Canada, infor...",Titulo: 0 Comunidades indígenas urbanas en...,352


### Get embeddings

In [8]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for a single text.

    Args:
        text: String to embed; newlines are replaced by spaces before the request.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The value stored under ``['data'][0]['embedding']`` in the API response.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_result = response['data'][0]
    return first_result['embedding']

# Request one embedding per combined text, using the model configured in
# `embedding_model` instead of repeating the model name as a literal here.
# `assign` builds a new frame, so the column is never written into a slice of
# another dataframe and re-running the cell simply recomputes the column.
df_filtered = df_filtered.assign(
    embedding_ada=df_filtered.combined.apply(
        lambda x: get_embedding(x, model=embedding_model)
    )
)

In [None]:
# Persist the articles together with their embeddings, without the index column.
# NOTE(review): `data_processed` is loaded at the top of the notebook but this
# path is hard-coded - presumably they refer to the same location; confirm.
output_path = '../data/processed/embeddings.csv'
df_filtered.to_csv(output_path, index=False)