In [None]:
import pandas as pd
import numpy as np
# Load the tagged dataset from the Excel workbook.
# sheet_name=1 is a 0-based position, i.e. the workbook's second sheet.
raw_df = pd.read_excel(
    '06 Data analysis/00 data/dataset1_8400 tagged.xlsx',
    engine='openpyxl',
    sheet_name=1,
)

In [None]:
# Create a unique identifier for each line, of the form
# "Text_<1-based row number>_<first 5 characters of Text, spaces removed>...".
import re
# Guarantee a 0..n-1 index so that row.name + 1 below is the 1-based row number.
raw_df.reset_index(inplace=True, drop=True)
raw_df["text_id"] = raw_df.apply(
    lambda row: f"Text_{row.name+1}_{ re.sub(' ','',row['Text'][:5] )}...",
    axis=1,
)
# Move text_id to the first column and keep the remaining columns in their original order.
raw_df = raw_df[['text_id'] + [col for col in raw_df if col != 'text_id']]
raw_df.to_csv("06 Data analysis/00 data/python_datasets/dataset1_text_ID_created.csv", index=False)
# Create a text df with only the columns needed for clustering.
# .copy() makes text_df an independent frame: later cells add columns to it, and
# assigning into a bare selection of raw_df triggers pandas' SettingWithCopyWarning.
text_df = raw_df[["text_id", "Text"]].copy()

## Retrieve embedding using OpenAI's text-embedding-3-large
The embedding model we use (here, OpenAI's embedding model) can be called directly from within BERTopic. However, we choose to compute the embeddings for our dataset separately first, store them on disk, and only then pass them into the BERTopic pipeline; this makes the workflow safer and more replicable.

In [None]:
# Count the number of tokens in each text, since embedding models usually have a maximum token limit per input (though that is probably not a problem here)

import tiktoken

def num_tokens_from_string(string: str, encoding_name="cl100k_base") -> int:
    """Count the tokens that `string` is split into by a tiktoken encoding.

    Args:
        string: The text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

# Per-row token count of the "Text" column, stored alongside the text.
text_df["token_num"] = text_df["Text"].map(num_tokens_from_string)



In [None]:
# determine batch size

def get_max_batch_size(token_num_list, TPM):
    """Return the largest batch size whose consecutive batches all fit in `TPM` tokens.

    Candidate sizes are tried in increasing order starting at 1. For each
    candidate the list is cut into consecutive chunks of that many items and
    the token counts of each chunk are summed. The search stops at the first
    candidate for which some chunk's total exceeds `TPM`, and the candidate
    just below it (the last one that passed) is returned.

    Args:
        token_num_list: Per-text token counts, in dataset order.
        TPM: Maximum total number of tokens allowed in a single batch
            (the name suggests a tokens-per-minute quota; confirm with the
            account's rate limits).

    Returns:
        The last batch size that passed the check. This is
        `len(token_num_list)` when every size fits (0 for an empty list) and
        0 when even a single item exceeds `TPM`, i.e. no batch size is valid.
        The result is never negative.
    """
    n = len(token_num_list)
    for size in range(1, n + 1):
        exceeds_limit = any(
            sum(token_num_list[i:i + size]) > TPM
            for i in range(0, n, size)
        )
        if exceeds_limit:
            # `size` is the first failing candidate, so `size - 1` is the
            # largest one known to fit (0 if size 1 already fails).
            return size - 1
    return n

# Largest batch size that keeps every batch within a 1,000,000-token budget.
batch_size = get_max_batch_size(
    token_num_list=text_df["token_num"].to_list(),
    TPM=1_000_000,
)
print(batch_size)


In [None]:
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
import numpy as np

def log_before_retry(retry_state):
    """Print which retry is about to happen and the exception that caused it.

    Intended as a tenacity `before_sleep` callback.

    Args:
        retry_state: Object exposing `outcome` (may be falsy; otherwise has an
            `exception()` method) and `attempt_number`.
    """
    outcome = retry_state.outcome
    reason = 'No exception'
    if outcome and outcome.exception():
        reason = outcome.exception()
    print(f"Preparing for Retry {retry_state.attempt_number} due to {reason}")


class OpenAI_Embeddinger:
    """Fetch OpenAI embeddings for a column of texts, one batch per API request.

    Results accumulate in `self.embedding_dict`, which maps each row's
    identifier (taken from a caller-chosen column) to its embedding as a
    numpy array.
    """

    def __init__(self,answer_text_ds:pd.DataFrame):
        """Create the API client and keep a re-indexed copy of the dataset.

        Args:
            answer_text_ds: DataFrame holding an identifier column and a text
                column; their names are passed later to `append_embedding`.
        """
        # Initialize client (credentials are resolved by the OpenAI SDK itself;
        # presumably from the OPENAI_API_KEY environment variable - confirm).
        self.client =OpenAI()
        # reset_index(drop=True) without inplace returns a new frame, so the
        # caller's DataFrame is not modified by constructing this object.
        self.answer_text_ds=answer_text_ds.reset_index(drop=True)
        self.embedding_dict=dict()

    @retry(wait=wait_random_exponential(min=1, max=60),
           stop=stop_after_attempt(6),
           before_sleep=log_before_retry)
    def get_embedding_batch(self, texts,embedding_model):
        """Embed one batch of texts with a single embeddings request.

        The call is wrapped by tenacity: up to 6 attempts, with a random
        exponential wait (min=1, max=60) between them, and
        `log_before_retry` invoked before each wait.

        Args:
            texts: List of strings to embed.
            embedding_model: Name of the embedding model to request.

        Returns:
            A list of numpy arrays, one per input text, in the same order as
            `texts`.
        """
        responses =  self.client.embeddings.create(input=texts, model=embedding_model)
        embeddings = [None] * len(texts)  # Initialize a list to store embeddings in order
        for response in responses.data:
            # Place each embedding by the index reported in the response so
            # the output order matches the input order.
            embeddings[response.index] = np.array(response.embedding)
        return embeddings

    def append_embedding(self, answer_name_column_name:str,text_column_name: str, model:str,batch_size):
        """Embed every text in the dataset and store the results by identifier.

        The texts are processed in consecutive batches of `batch_size` rows.
        After each batch, `self.embedding_dict` is updated with
        `{identifier: embedding}` pairs; an identifier that occurs more than
        once keeps the embedding of its last occurrence.

        Args:
            answer_name_column_name: Column whose values are used as keys of
                `self.embedding_dict`.
            text_column_name: Column containing the texts to embed.
            model: Name of the embedding model to request.
            batch_size: Number of texts sent per request; must be >= 1.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would silently yield no batches at all, and zero
            # makes range() fail with an unrelated-looking message.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        print(batch_size)
        texts = self.answer_text_ds[text_column_name].to_list()
        answer_names = self.answer_text_ds[answer_name_column_name].to_list()
        # Start offsets of the consecutive batches; also gives the batch count.
        batch_starts = range(0, len(texts), batch_size)
        total_batch_num = len(batch_starts)
        print("calling api started!")
        # Loop through each batch to process and store embeddings
        for batch_number, start_index in enumerate(batch_starts, start=1):
            stop_index = start_index + batch_size
            batch = texts[start_index:stop_index]
            # Same positional slice as the texts, so names and embeddings
            # line up one-to-one regardless of the DataFrame index.
            batch_names = answer_names[start_index:stop_index]
            embeddings = self.get_embedding_batch(texts=batch, embedding_model=model)
            self.embedding_dict.update(zip(batch_names, embeddings))
            print(f"Batch {batch_number}/{total_batch_num} done!")

        print("All batches processed.")

In [None]:
# Embed every row of text_df, keyed by text_id.
# batch_size is fixed at 200 here; the batch_size variable computed earlier
# in the notebook is not used for this call.
OpenAI_embedding = OpenAI_Embeddinger(answer_text_ds=text_df)
OpenAI_embedding.append_embedding(
    answer_name_column_name="text_id",
    text_column_name="Text",
    model="text-embedding-3-large",
    batch_size=200,
)

In [None]:
import pickle
# Persist the {text_id: embedding array} dictionary as a binary pickle file.
with open(
    "06 Data analysis/04 Topic Modeling/outputs/embeddings/ds1_text_embedding",
    mode='wb',
) as file:
    pickle.dump(
        OpenAI_embedding.embedding_dict,
        file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Also save the embeddings in a table together with the text.

# Fail early, naming the affected rows, if any text_id has no stored embedding;
# otherwise the mapping below would leave NaN cells that break the conversion.
missing_ids = [
    text_id for text_id in text_df["text_id"]
    if text_id not in OpenAI_embedding.embedding_dict
]
if missing_ids:
    raise KeyError(f"No embedding stored for {len(missing_ids)} text_id(s), e.g. {missing_ids[:5]}")

text_df["embedding"] = text_df["text_id"].map(OpenAI_embedding.embedding_dict)

# Convert each numpy array to a list of built-in Python floats so that it is
# written out in full as plain numbers in the CSV. ndarray.tolist() is used
# rather than list(arr): list(arr) keeps NumPy scalar objects, and under
# NumPy >= 2 their repr is "np.float64(...)", which would end up verbatim in
# the CSV cells.
text_df["embedding"] = text_df["embedding"].apply(lambda vector: vector.tolist())
text_df.to_csv("06 Data analysis/04 Topic Modeling/outputs/embeddings/ds1_text_embedding_with_text.csv", index=False)