### Working Environment

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
!ls

/content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
amazon_alexa.tsv  amazon_f_handson.ipynb


### Import Dataset

In [8]:
import pandas as pd

data = pd.read_csv('/Users/guptatilak/Documents/SentimentAnalysis-Createch/amazon_alexa.tsv', sep='\t')
data.sample(10)

Unnamed: 0,position,title,snippet,link,displayed_link,highlighs
45,45,Chennai (MAA) Airport,@larsentoubro. for achieving 10 million safe m...,https://twitter.com/aaichnairport/status/14554...,https://twitter.com › aaichnairport › status,['larsentoubro']
27,27,BharatShakti.in,@larsentoubro. for procurement of Close-in Wea...,https://twitter.com/BharatShaktiBSI/status/176...,https://twitter.com › BharatShaktiBSI › status,['larsentoubro']
67,67,Isn't L&T already building nuke subs? that lin...,Aryan_warlord @larsentoubro @MazagonDockLtd @D...,https://twitter.com/VkJoisG/status/17652406659...,https://twitter.com › VkJoisG › status,
23,23,Moneycontrol,larsentoubro @rachitaprasad @Yaruqh_K L&T Ltd ...,https://twitter.com/moneycontrolcom/status/175...,https://twitter.com › moneycontrolcom › status,
72,72,McPhy,@larsentoubro . L&T recently commissioned its ...,https://twitter.com/McPhyEnergy/status/1764928...,https://twitter.com › McPhyEnergy › status,['larsentoubro']
2,2,Financial Express,@larsentoubro. ) announced that its Hydrocarbo...,https://twitter.com/FinancialXpress/status/176...,https://twitter.com › FinancialXpress › status,['larsentoubro']
69,69,India Smart Utility Week - ISUW,... Delhi Join #ISUW24 as a Partner if your en...,https://twitter.com/ISUW_India/status/17635120...,https://twitter.com › ISUW_India › status,['larsentoubro']
57,57,Shivakumar C,... hardware was produced at L&T's facility in...,https://twitter.com/i/web/status/1679524872500...,https://twitter.com › web › status,['larsentoubro']
0,0,Larsen & Toubro (@larsentoubro) / ...,The official Larsen & Toubro Corporate handle....,https://twitter.com/larsentoubro,https://twitter.com › larsentoubro,
18,18,ET NOW,Larsen commisions hydrogen electrolyser at Haz...,https://twitter.com/ETNOWlive/status/176341118...,https://twitter.com › ETNOWlive › status,['larsentoubro']


In [9]:
mydata = data[['verified_reviews','feedback']]
mydata.columns = ['review','label']

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [10]:
mydata.value_counts('label')

label
1    2893
0     257
dtype: int64

In [11]:
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Get the number of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly
if rows_to_drop > 0:
   data_majority = mydata[mydata["label"] == 1]
   data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
   data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

1    257
0    257
Name: label, dtype: int64


## Data Preprocessing

In [12]:
import re

def clean_text(text):
  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Remove HTML tags
  text = re.sub(r"<[^>]*>", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  return text

In [13]:
import pandas as pd

# Extract the review column as a list
reviews = data_balanced['review'].tolist()

# Clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# Add the cleaned reviews as a new column to the DataFrame
data_balanced['clean_reviews'] = cleaned_reviews

In [14]:
data_balanced

Unnamed: 0,review,label,clean_reviews
1,Loved it!,1,loved it
13,"Love, Love, Love!!",1,love love love
17,We have only been using Alexa for a couple of ...,1,we have only been using alexa for couple of da...
23,I love it. It plays my sleep sounds immediatel...,1,love it it plays my sleep sounds immediately w...
46,"It's like Siri, in fact, Siri answers more acc...",0,it like siri in fact siri answers more accurat...
...,...,...,...
3092,"I love this technology. I'm older, but this is...",1,love this technology older but this is so easy...
3093,Use as my 3rd dot. Bought so my husband would ...,1,use as my 3rd dot bought so my husband would b...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3123,,1,


## Data Split

In [15]:
import pandas as pd

# Assuming your DataFrame is called "df"
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

# Get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

## Sentiment w/ LLM

### Setting up Gemini API

In [16]:
!pip install -q -U google-generativeai

[0m

In [17]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# # Used to securely store your API key
# from google.colab import userdata

In [18]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY='AIzaSyCK7ZAeFs8CwtzTvRqkEMGxxd_hUXMsR70'

genai.configure(api_key=GOOGLE_API_KEY)

In [19]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [20]:
model = genai.GenerativeModel('gemini-pro')

In [21]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 11.1 ms, sys: 10.1 ms, total: 21.2 ms
Wall time: 12.9 s


> The meaning of life is a philosophical question that has been asked for centuries. There is no single answer that is universally agreed upon, as the meaning of life is subjective and personal to each individual. However, there are many different perspectives on the meaning of life, including:
> 
> * **The pursuit of happiness:** Some people believe that the meaning of life is to be happy. This could mean pursuing personal goals, spending time with loved ones, or simply enjoying the present moment.
> * **The fulfillment of one's potential:** Others believe that the meaning of life is to fulfill one's potential. This could mean pursuing a career, achieving a goal, or simply becoming the best version of oneself.
> * **The service of others:** Many people find meaning in life through service to others. This could mean volunteering, donating to charity, or simply helping out friends and family.
> * **The quest for knowledge:** Some people find meaning in life through the pursuit of knowledge. This could mean studying a particular subject, reading books, or exploring new ideas.
> * **The connection to something greater than oneself:** Many people find meaning in life through their connection to something greater than themselves. This could be a religious faith, a spiritual practice, or simply a sense of belonging to the universe.
> 
> Ultimately, the meaning of life is something that each individual must discover for themselves. There is no right or wrong answer, and the meaning of life can change over time. The important thing is to find something that gives your life purpose and makes you feel fulfilled.

#### Single API Call

In [22]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1036,Alexa hardly came on..,0,alexa hardly came on,
2697,NOT CONNECTED TO MY PHONE PLAYLIST :(,0,not connected to my phone playlist,
1240,I haven't figured out how to make or receive c...,0,haven figured out how to make or receive calls...,
579,"great product, but useless overall. Too many u...",0,great product but useless overall too many unn...,
1232,Fun so far...still learning how it all works,1,fun so far still learning how it all works,
1236,I would love this but there is no way to stop ...,0,would love this but there is no way to stop th...,
1389,,0,,
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,
1334,What a wonderful little Alexa enabled gadget a...,1,what wonderful little alexa enabled gadget at ...,
1689,"Works fine, I just realize I don’t need this b...",0,works fine just realize don need this because ...,


In [23]:
# Convert the DataFrame to JSON using the to_json() method

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

# Print the JSON data
print(json_data)

[{"clean_reviews":"alexa hardly came on","pred_label":""},{"clean_reviews":"not connected to my phone playlist","pred_label":""},{"clean_reviews":"haven figured out how to make or receive calls device tells me need to register and do not know what to do","pred_label":""},{"clean_reviews":"great product but useless overall too many unnecessary features unless you have smart home you don need it","pred_label":""},{"clean_reviews":"fun so far still learning how it all works","pred_label":""},{"clean_reviews":"would love this but there is no way to stop the screen from constantly scrolling through things to try if amazon would make this setting would change my review to 5 stars","pred_label":""},{"clean_reviews":"","pred_label":""},{"clean_reviews":"stopped working after 2 weeks didn follow commands really fun when it was working","pred_label":""},{"clean_reviews":"what wonderful little alexa enabled gadget at like the analog clock face the best","pred_label":""},{"clean_reviews":"works fi

In [24]:
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"alexa hardly came on","pred_label":""},{"clean_reviews":"not connected to my phone playlist","pred_label":""},{"clean_reviews":"haven figured out how to make or receive calls device tells me need to register and do not know what to do","pred_label":""},{"clean_reviews":"great product but useless overall too many unnecessary features unless you have smart home you don need it","pred_label":""},{"clean_reviews":"fun so far still learning how it all works","pred_label":""},{"clean_reviews":"

In [25]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"alexa hardly came on","pred_label":0},{"clean_reviews":"not connected to my phone playlist","pred_label":0},{"clean_reviews":"haven figured out how to make or receive calls device tells me need to register and do not know what to do","pred_label":0},{"clean_reviews":"great product but useless overall too many unnecessary features unless you have smart home you don need it","pred_label":0},{"clean_reviews":"fun so far still learning how it all works","pred_label":1},{"clean_reviews":"would love this but there is no way to stop the screen from constantly scrolling through things to try if amazon would make this setting would change my review to 5 stars","pred_label":0},{"clean_reviews":"","pred_label":0},{"clean_reviews":"stopped working after 2 weeks didn follow commands really fun when it was working","pred_label":0},{"clean_reviews":"what wonderful little alexa enabled gadget at like the analog clock face the best","pred_label":1},{"clean_reviews":"works fine ju

In [26]:
import json

# Clean the data by stripping the backticks
json_data = response.text.strip("`")

# Load the cleaned data and convert to DataFrame
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,alexa hardly came on,0
1,not connected to my phone playlist,0
2,haven figured out how to make or receive calls...,0
3,great product but useless overall too many unn...,0
4,fun so far still learning how it all works,1
5,would love this but there is no way to stop th...,0
6,,0
7,stopped working after 2 weeks didn follow comm...,0
8,what wonderful little alexa enabled gadget at ...,1
9,works fine just realize don need this because ...,0


In [27]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1036,Alexa hardly came on..,0,alexa hardly came on,0
2697,NOT CONNECTED TO MY PHONE PLAYLIST :(,0,not connected to my phone playlist,0
1240,I haven't figured out how to make or receive c...,0,haven figured out how to make or receive calls...,0
579,"great product, but useless overall. Too many u...",0,great product but useless overall too many unn...,0
1232,Fun so far...still learning how it all works,1,fun so far still learning how it all works,1
1236,I would love this but there is no way to stop ...,0,would love this but there is no way to stop th...,0
1389,,0,,0
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,0
1334,What a wonderful little Alexa enabled gadget a...,1,what wonderful little alexa enabled gadget at ...,1
1689,"Works fine, I just realize I don’t need this b...",0,works fine just realize don need this because ...,0


In [28]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

array([[13,  0],
       [ 0,  7]])

### OpenAI API Config

In [43]:
!pip install openai==0.27.0

[0mCollecting openai==0.27.0
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: openai
  Attempting uninstall: openai
[0m    Found existing installation: openai 0.28.1
    Uninstalling openai-0.28.1:
      Successfully uninstalled openai-0.28.1
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llm-app 0.3.4 requires openai<0.29.0,>=0.27.8, but you have openai 0.27.0 which is incompatible.
llm-app 0.3.4 requires requests<3.0.0,>=2.31.0, but you have requests 2.28.1 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.27.0
[0m

In [52]:
import openai
# from google.colab import userdata

OPENAI_API_KEY='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'
openai.api_key  = OPENAI_API_KEY

In [53]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)

  return response.choices[0].message["content"]

In [54]:
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [28]:
chatgpt_response

"The sky appears blue to our eyes because of the way the Earth's atmosphere scatters sunlight. The molecules in the Earth's atmosphere, particularly nitrogen and oxygen, scatter shorter wavelengths of light (blue and violet) more effectively than longer wavelengths (red and yellow). This scattering causes the blue light to be more visible and gives the sky its blue color. This effect is known as Rayleigh scattering."

#### Batching API Calls (Single Shot)

In [29]:
test_set.shape

(488, 3)

In [30]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2786,Very convenient,1,very convenient,
1913,Excellent why didn’t I think it is in the begi...,1,excellent why didn think it is in the beginnin...,
2621,Love it,1,love it,
672,Bought this to go in my niece's room. You can'...,1,bought this to go in my niece room you can tel...,
2225,"The current demand for this stick, was too hig...",1,the current demand for this stick was too high...,
...,...,...,...,...
653,I wanted a white dot for my white bathroom. T...,0,wanted white dot for my white bathroom the top...,
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,
2005,Why do we need to buy a $100 hub to get it to ...,0,why do we need to buy 100 hub to get it to wor...,
1342,I am very excited and happy with this. It was ...,1,am very excited and happy with this it was bre...,


In [31]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [32]:
import time

def gpt_completion_function(batch,current_batch,total_batch,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [33]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"very convenient","pred_label":""},{"clean_reviews":"excellent why didn think it is in the beginning love it","pred_label":""},{"clean_reviews":"love it","pred_label":""},{"clean_reviews":"bought this to go in my niece room you can tell it refurbished it looks good and works like new","pred_label":""},{"clean_

In [34]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

                                        clean_reviews  pred_label
0                                     very convenient           1
1   excellent why didn think it is in the beginnin...           1
2                                             love it           1
3   bought this to go in my niece room you can tel...           1
4   the current demand for this stick was too high...           0
..                                                ...         ...
95  wanted white dot for my white bathroom the top...           0
96  stopped working after 2 weeks didn follow comm...           0
97  why do we need to buy 100 hub to get it to wor...           0
98  am very excited and happy with this it was bre...           1
99  in one word amazing best tech purchase have ev...           1

[100 rows x 2 columns]


  df_total = df_total.append(df_temp, ignore_index=True)
  df_total = df_total.append(df_temp, ignore_index=True)


In [35]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2786,Very convenient,1,very convenient,1
1913,Excellent why didn’t I think it is in the begi...,1,excellent why didn think it is in the beginnin...,1
2621,Love it,1,love it,1
672,Bought this to go in my niece's room. You can'...,1,bought this to go in my niece room you can tel...,1
2225,"The current demand for this stick, was too hig...",1,the current demand for this stick was too high...,0
...,...,...,...,...
653,I wanted a white dot for my white bathroom. T...,0,wanted white dot for my white bathroom the top...,0
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,0
2005,Why do we need to buy a $100 hub to get it to ...,0,why do we need to buy 100 hub to get it to wor...,0
1342,I am very excited and happy with this. It was ...,1,am very excited and happy with this it was bre...,1


In [36]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[46  0]
 [ 9 45]]

Accuracy: 0.91


### Batching API Calls: Gemini API

In [29]:
test_set.shape

(488, 3)

In [30]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2927,Work pretty well,1,work pretty well,
2195,It still doesn’t work. I bet if I call and le...,0,it still doesn work bet if call and let them k...,
2541,Cheap and cheap sound.,0,cheap and cheap sound,
2928,Still learning it,1,still learning it,
661,This Echo Dot is horrible. The volume on my ph...,0,this echo dot is horrible the volume on my pho...,
...,...,...,...,...
3067,The only negative we have on this product is t...,0,the only negative we have on this product is t...,
1276,Love this little dot with a screen. Super eas...,1,love this little dot with screen super easy to...,
381,It worked for a month or so then it stopped. I...,0,it worked for month or so then it stopped ve t...,
2787,Este producto llegó y a la semana se quedó sin...,1,este producto llegó la semana se quedó sin olo...,


In [31]:
batches = []
batch_size = 25

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [33]:
import time

def gemini_completion_function(batch,current_batch,total_batch):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling Gemini API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)
  response = model.generate_content(prompt)
  time.sleep(5)

  return response

In [38]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gemini_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 4
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"work pretty well","pred_label":""},{"clean_reviews":"it still doesn work bet if call and let them know that they will charge me for the 3rd time ridiculous","pred_label":""},{"clean_reviews":"cheap and cheap sound","pred_label":""},{"clean_reviews":"still learning it","pred_label":""},{"clean_reviews":"this e

In [39]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.text.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

  df_total = df_total.append(df_temp, ignore_index=True)
  df_total = df_total.append(df_temp, ignore_index=True)


ValueError: The `response.parts` quick accessor only works for a single candidate, but none were returned. Check the `response.prompt_feedback` to see if the prompt was blocked.

In [40]:
response.prompt_feedback

block_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: LOW
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: MEDIUM
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}

In [41]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

ValueError: Length of values (50) does not match length of index (100)

In [42]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

confusion_matrix(y_true, y_pred)

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['']. Make sure that the predictions provided by the classifier coincides with the true labels.

## Batching API Calls: ChatGPT (Few Shot)

In [80]:
test_set.shape

(488, 3)

In [81]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1267,I switched to google. The amazon helper is mor...,0,switched to google the amazon helper is more f...,
631,Eh. It’s works on and off. Half the time it do...,0,eh it works on and off half the time it does n...,
1742,I have been waiting for the Echo Show to go on...,1,have been waiting for the echo show to go on s...,
1563,It's like having another kid in the house; I h...,0,it like having another kid in the house have t...,
1517,Love my echo show! Great sound and picture. Do...,1,love my echo show great sound and picture does...,
...,...,...,...,...
2560,Love the echo dot it’s amaxing!!!,1,love the echo dot it amaxing,
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,
2520,Best purchase this year.,1,best purchase this year,
1200,Meh,0,meh,


In [82]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [83]:
import time

def gpt_completion_function(batch,current_batch,total_batch,train_sample,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  sample_json_data = train_sample[['clean_reviews','label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  {json_data}
  ```
  ####
  {sample_json_data}
  ####
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [84]:
train_sample = train_set.sample(4)

batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count,train_sample))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  [{"clean_reviews":"switched to google the amazon helper is more for shopping and google is the tasks assistant","pred_label":""},{"clean_reviews":"eh it work

In [85]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

                                        clean_reviews  pred_label
0   switched to google the amazon helper is more f...           0
1   eh it works on and off half the time it does n...           0
2   have been waiting for the echo show to go on s...           1
3   it like having another kid in the house have t...           0
4   love my echo show great sound and picture does...           1
..                                                ...         ...
95                       love the echo dot it amaxing           1
96  bought this for myself and didn realize it had...           0
97                            best purchase this year           1
98                                                meh           0
99  so far so good much better sound than my echo dot           1

[100 rows x 2 columns]


  df_total = df_total.append(df_temp, ignore_index=True)
  df_total = df_total.append(df_temp, ignore_index=True)


In [86]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1267,I switched to google. The amazon helper is mor...,0,switched to google the amazon helper is more f...,0
631,Eh. It’s works on and off. Half the time it do...,0,eh it works on and off half the time it does n...,0
1742,I have been waiting for the Echo Show to go on...,1,have been waiting for the echo show to go on s...,1
1563,It's like having another kid in the house; I h...,0,it like having another kid in the house have t...,0
1517,Love my echo show! Great sound and picture. Do...,1,love my echo show great sound and picture does...,1
...,...,...,...,...
2560,Love the echo dot it’s amaxing!!!,1,love the echo dot it amaxing,1
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,0
2520,Best purchase this year.,1,best purchase this year,1
1200,Meh,0,meh,0


In [87]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[49  0]
 [ 9 42]]

Accuracy: 0.91
