#### Build model for sentiment classifier

In [1]:
import pandas as pd
import numpy as np
from transformers import GPT2TokenizerFast
import concurrent.futures
from helper import *
from tqdm import tqdm
import logging
# Notebook-wide logger; the level comes from the Config object pulled in by
# `from helper import *`.
console_logger = logging.getLogger(__name__)
console_logger.setLevel(Config.LOG_LEVEL)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would stack another StreamHandler and every message would be
# emitted once per run. Attach a handler only when none is present yet.
if not console_logger.handlers:
    console_logger.addHandler(logging.StreamHandler())

  from .autonotebook import tqdm as notebook_tqdm


## Read the CSV into a DataFrame

In [2]:
# Path of the review CSV to embed (the test split, judging by the file name).
datafile_path = "sentiments_train_test_reviews/reviews_test_4000.csv"

# Load the reviews and preview the first rows.
df = pd.read_csv(filepath_or_buffer=datafile_path)
df.head(5)

Unnamed: 0,review_id,review,sentiment
0,T_0,I have to confess that I am severely disappoin...,negative
1,T_9,I have never understood the appeal of this sho...,negative
2,T_12,This is supposed to be based on Wilkie Collins...,negative
3,T_13,Of all the British imperialist movies like Fou...,positive
4,T_15,I loved this film. Not being a swooning Ed Woo...,positive


#### Check whether all reviews fall within the token limit

In [3]:
# Token counts come from the num_tokens_from_string helper using the
# "cl100k_base" encoding, e.g.
#   num_tokens_from_string("tiktoken is great!", "cl100k_base")
# (A GPT2TokenizerFast-based count was the earlier approach and is disabled.)
encoding_name = "cl100k_base"


def _count_review_tokens(text):
    """Return the token count of one review under `encoding_name`."""
    return num_tokens_from_string(text, encoding_name)


# One token count per review, stored next to the text.
df['n_tokens'] = df.review.apply(_count_review_tokens)
print(df.head())

# Number of reviews longer than 8000 tokens.
over_limit_mask = df['n_tokens'] > 8000
print(over_limit_mask.sum())

  review_id                                             review sentiment  \
0       T_0  I have to confess that I am severely disappoin...  negative   
1       T_9  I have never understood the appeal of this sho...  negative   
2      T_12  This is supposed to be based on Wilkie Collins...  negative   
3      T_13  Of all the British imperialist movies like Fou...  positive   
4      T_15  I loved this film. Not being a swooning Ed Woo...  positive   

   n_tokens  
0       320  
1       177  
2        84  
3       264  
4       617  
0


In [4]:
# Repeat the over-limit check, then total the tokens across all reviews.
exceeds_limit = df['n_tokens'] > 8000
print(exceeds_limit.sum())

total_count = df['n_tokens'].sum()
total_count

0


1173420

#### set the model used for generating embeddings

In [5]:
# Pick the embedding backend. embed_gen_model (from helper) returns three
# values for the chosen model; judging by the names they are the engine id,
# its token limit and the embedding size -- confirm against helper.
model_name = "GPT3"
engine, MAX_TOKENS, dimensions = embed_gen_model(model_name)

first_time = True  # True if running the notebook for the first time
# NOTE(review): while this flag is True the checkpoint file is rewritten from
# the current df, discarding any embeddings already stored in it. Set it to
# False before resuming an interrupted run.
if first_time:
    # Printed explicitly: a bare expression inside an `if` block is evaluated
    # and discarded, so the notebook would never display it.
    print((df['n_tokens'] > MAX_TOKENS).sum())
    # Seed the checkpoint that the embedding cell resumes from. index=False
    # matches the later checkpoint writes and keeps a stray "Unnamed: 0"
    # column from appearing when the file is read back.
    df.to_csv('partial_embeddings.csv', index=False)


In [6]:
# Echo the three values unpacked from embed_gen_model.
print(engine, MAX_TOKENS, dimensions)

text-embedding-ada-002 8191 1536


#### Run the cell below only when embeddings need to be generated — not on every run

In [7]:
save_csv_path = 'test_reviews_with_embeddings.csv'
checkpoint_path = 'partial_embeddings.csv'  # resumable checkpoint, seeded by an earlier cell
checkpoint_every = 1000  # persist progress whenever (row index + 1) is a multiple of this

if model_name != "GPT3":
    # Other engines: submit every review to the default thread pool.
    # Original author's note: about 30 secs for 1000 embeddings.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # pass the engine parameter to the get_embeddings function
        results = [executor.submit(get_embeddings, text, engine) for text in df.review]
        df['embeddings'] = [r.result() for r in results]
else:
    # GPT3 path: reload the checkpoint and embed only the rows still missing.
    df = pd.read_csv(checkpoint_path)
    if 'embeddings' not in df.columns:
        df['embeddings'] = pd.Series(dtype=object)
    # A checkpoint whose embeddings column is completely empty is read back as
    # float64, and a float column cannot hold a list. Force object dtype so the
    # per-row assignment below always works.
    df['embeddings'] = df['embeddings'].astype(object)

    pending_rows = df.index[df['embeddings'].isna()]
    no_of_rows = len(pending_rows)
    console_logger.info(f"Total embeddings to be generated = {no_of_rows}")

    # max_workers=1 means requests are issued one at a time.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        with tqdm(total=no_of_rows) as pbar:
            future_to_index = {
                executor.submit(embeddings, df.at[i, 'review'], engine): i
                for i in pending_rows
            }
            for future in concurrent.futures.as_completed(future_to_index):
                i = future_to_index[future]
                try:
                    embedding = future.result()
                    console_logger.debug(f"embeddings of {i} = {embedding}")
                    console_logger.debug(f"shape of df = {df.shape}")
                    df.at[i, "embeddings"] = embedding
                    pbar.update()
                    if (i+1) % checkpoint_every == 0:
                        df.to_csv(checkpoint_path, index=False)
                        console_logger.info(f"{i+1} embeddings generated and saved to partial_embeddings.csv.")
                except Exception as exc:
                    # Actually stop, as the message says. Every request was
                    # queued up front, so drop the ones that have not started;
                    # otherwise leaving the executor block would still wait for
                    # all of them. A request already running cannot be
                    # cancelled and its result is discarded.
                    for queued in future_to_index:
                        queued.cancel()
                    df.to_csv(checkpoint_path, index=False)
                    console_logger.error(f"An error occurred: {exc}. Saving partial embeddings to partial_embeddings.csv and exiting.")
                    break

# The final file is always written; make an incomplete result visible instead
# of silently handing rows without embeddings to later steps.
missing_embeddings = int(df['embeddings'].isna().sum())
if missing_embeddings:
    console_logger.warning(f"{missing_embeddings} rows have no embedding; {save_csv_path} is incomplete.")
df.to_csv(save_csv_path)

Total embeddings to be generated = 4000
 25%|██▌       | 1000/4000 [31:37<1:02:34,  1.25s/it]1000 embeddings generated and saved to partial_embeddings.csv.
 50%|█████     | 2000/4000 [1:05:17<1:22:33,  2.48s/it]2000 embeddings generated and saved to partial_embeddings.csv.
 75%|███████▌  | 3000/4000 [1:40:01<47:38,  2.86s/it]  3000 embeddings generated and saved to partial_embeddings.csv.
100%|██████████| 4000/4000 [2:14:05<00:00,  2.31s/it]  4000 embeddings generated and saved to partial_embeddings.csv.
100%|██████████| 4000/4000 [2:14:13<00:00,  2.01s/it]
