### Imports

In [1]:
import tiktoken
import openai
from openai.embeddings_utils import get_embedding
import configparser
import customized_headlines.data.load as load
import customized_headlines.data.export as export

### Load data

In [2]:
news = load.news
data_processed = load.data_processed

### Set up API key

In [66]:
# Create a parser object and disable interpolation
parser = configparser.ConfigParser(interpolation=None)

# Read data from 'config.ini' file
parser.read("../config.ini")

# Access sections from the configuration file
parser.sections()

# Get 'bearer_token' from twitter section
api_key = parser.get('openai', 'key')

# Set up OpenAI API key
openai.api_key = api_key

### Getting embeddings with model [`text-embedding-ada-002`](https://platform.openai.com/docs/guides/embeddings/embedding-models) from OpenAI

In [67]:
# Embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

### Read data

In [68]:
# Read csv file as dataframe
df = load.read_csv(news)
# Show dataframe
df.head(1)

Unnamed: 0,archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,category,text,text_cleanned
0,https://web.archive.org/web/20230307062430/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230307062430,es,24-horas.mx,http://24-horas.mx,2023-03-06 00:00:00,AMLO; apoya a Samuel García ante conflicto con...,https://www.24-horas.mx/2023/03/06/amlo-apoya-...,negocios,AMLO; apoya a Samuel García ante conflicto con...,AMLO; apoya a Samuel García ante conflicto con...


### Process data

In [69]:
sections = df[["title", "text_cleanned"]]
df["combined"] = f"Title: {df.title.str.strip()}Content: {df.text_cleanned.str.strip()}"
df.head(1)

Unnamed: 0,archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,category,text,text_cleanned,combined
0,https://web.archive.org/web/20230307062430/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230307062430,es,24-horas.mx,http://24-horas.mx,2023-03-06 00:00:00,AMLO; apoya a Samuel García ante conflicto con...,https://www.24-horas.mx/2023/03/06/amlo-apoya-...,negocios,AMLO; apoya a Samuel García ante conflicto con...,AMLO; apoya a Samuel García ante conflicto con...,Title: 0 AMLO; apoya a Samuel García ante...


In [71]:
encoding = tiktoken.get_encoding(embedding_encoding)
# Remove reviews that are too long to embed
df["n_tokens"] = df.text_cleanned.apply(lambda x: len(encoding.encode(x)))
df_filtered = df[df.n_tokens <= max_tokens]
len(df_filtered)

198

In [72]:
df.sort_values(by=['n_tokens'])

Unnamed: 0,archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,category,text,text_cleanned,combined,n_tokens
92,https://web.archive.org/web/20230306084208/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230306084208,es,elgrafico.mx,http://elgrafico.mx,2023-03-04 00:00:00,"8M 2023 Dra. Verónica Vázquez, ayuda a mujeres...",https://www.elgrafico.mx/historias/doctora-ver...,ciencia,"8M 2023 Dra. Verónica Vázquez, ayuda a mujeres...","8M 2023 Dra. Verónica Vázquez, ayuda a mujeres...",Title: 0 AMLO; apoya a Samuel García ante...,184
99,https://web.archive.org/web/20230308073355/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230308073355,es,elfinanciero.com.mx,http://elfinanciero.com.mx,2023-03-07 00:00:00,Superpeso – El Financiero,https://www.elfinanciero.com.mx/cartones/rictu...,ciencia,Superpeso – El Financiero 2 captures 08 Mar 20...,Superpeso – El Financiero Economía Mercados N...,Title: 0 AMLO; apoya a Samuel García ante...,318
82,https://web.archive.org/web/20230308073355/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230308073355,es,elfinanciero.com.mx,http://elfinanciero.com.mx,2023-03-07 00:00:00,Superpeso – El Financiero,https://www.elfinanciero.com.mx/cartones/rictu...,ciencia,Superpeso – El Financiero 2 captures 08 Mar 20...,Superpeso – El Financiero Economía Mercados N...,Title: 0 AMLO; apoya a Samuel García ante...,318
26,https://web.archive.org/web/20230309111932/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230309111932,es,elfinanciero.com.mx,http://elfinanciero.com.mx,2023-03-08 00:00:00,El infierno – El Financiero,https://www.elfinanciero.com.mx/cartones/rictu...,entretenimeinto,El infierno – El Financiero 2 captures 09 Mar ...,El infierno – El Financiero Economía Mercados...,Title: 0 AMLO; apoya a Samuel García ante...,325
22,https://web.archive.org/web/20230308072513/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230308072513,es,elfinanciero.com.mx,http://elfinanciero.com.mx,2023-03-07 00:00:00,Ahorros – El Financiero,https://www.elfinanciero.com.mx/cartones/anton...,entretenimeinto,Ahorros – El Financiero 2 captures 08 Mar 2023...,Ahorros – El Financiero Economía Mercados Nac...,Title: 0 AMLO; apoya a Samuel García ante...,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,https://web.archive.org/web/20230304100138/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230304100138,es,milenio.com,http://milenio.com,2023-03-03 00:00:00,Cine y mujeres: México a través de la mirada f...,https://www.milenio.com/cultura/laberinto/cine...,entretenimeinto,Cine y mujeres: México a través de la mirada f...,Cine y mujeres: México a través de la mirada f...,Title: 0 AMLO; apoya a Samuel García ante...,4588
24,https://web.archive.org/web/20230304100138/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230304100138,es,milenio.com,http://milenio.com,2023-03-03 00:00:00,Cine y mujeres: México a través de la mirada f...,https://www.milenio.com/cultura/laberinto/cine...,entretenimeinto,Cine y mujeres: México a través de la mirada f...,Cine y mujeres: México a través de la mirada f...,Title: 0 AMLO; apoya a Samuel García ante...,4588
194,https://web.archive.org/web/20230304100138/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230304100138,es,milenio.com,http://milenio.com,2023-03-03 00:00:00,Cine y mujeres: México a través de la mirada f...,https://www.milenio.com/cultura/laberinto/cine...,arte,Cine y mujeres: México a través de la mirada f...,Cine y mujeres: México a través de la mirada f...,Title: 0 AMLO; apoya a Samuel García ante...,4588
105,https://web.archive.org/web/20230308073032/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230308073032,es,impacto.mx,http://impacto.mx,2023-03-07 00:00:00,"Carlos Natarén, el farsante de la Universidad ...",https://impacto.mx/la-revista/carlos-nataren-e...,cultura,"Carlos Natarén, el farsante de la Universidad ...","Carlos Natarén, el farsante de la Universidad ...",Title: 0 AMLO; apoya a Samuel García ante...,9247


### Get embeddings

In [44]:
# Apply model to get embeddings
df["embeddings_ada"] = df.text_cleanned.apply(lambda x: get_embedding(x, model=embedding_model))

InvalidRequestError: This model's maximum context length is 8191 tokens, however you requested 10816 tokens (10816 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.

In [None]:
export.df_csv(df, news)