In [4]:
from IPython.display import clear_output
!pip install --upgrade tiktoken
!pip install langchain
!pip install openai
!pip install gdown==v4.6.3
!gdown 15TxV3umM2xQ0tSJMezc3vZO9UuS9Q-tC
clear_output()

OPENAI_API_TOKEN = ""
MOUNT_GDRIVE = False

import os
from google.colab import drive
import openai
from langchain.chat_models import ChatOpenAI
import tiktoken
from langchain.prompts import ChatPromptTemplate
import pandas as pd

df = pd.read_csv('1.2 clean_data.csv', encoding='utf-8', index_col=0)


In [5]:
len(df)

25419

In [11]:
emotions = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
emotion_list = str(emotions)

In [7]:
system_template = "You classify the emotions of this sentence into one of the following {emotion_list}. You must answer with a single word from the prior list. If you are not sure, return neutral."
human_template = "{dialogue}"

In [8]:
llm = ChatOpenAI(temperature=0.1, model = "gpt-3.5-turbo-1106")
llm.model_name

In [9]:
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("human", human_template),
])

In [21]:
len(df)

25120

In [10]:
# Estimated OpenAI cost, USD, using tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
count_tokens_human = df["dialog"].apply(lambda n: len(encoding.encode(n))).sum()
count_tokens_system = len(encoding.encode(system_template)) * len(df)
count_tokens_total = count_tokens_human + count_tokens_system
price_per_1k_tokens_usd = 0.001

approx_cost_human = count_tokens_human * (price_per_1k_tokens_usd / 1000)
approx_cost_system = count_tokens_system * (price_per_1k_tokens_usd / 1000)
approx_cost_total = count_tokens_total * (price_per_1k_tokens_usd / 1000)

pct_approx_cost_human = 100 * count_tokens_human / count_tokens_total
pct_approx_cost_system = 100 * count_tokens_system / count_tokens_total

print(f"Approximate Cost, Dialogue = \t$ {round(approx_cost_human,2)} \t({round(pct_approx_cost_human, 1)} %)")
print(f"Approximate Cost, Prompting = \t$ {round(approx_cost_system,2)} \t({round(pct_approx_cost_system, 1)} %)")
print("---")
print(f"Approximate Cost, Total = \t$ {round(approx_cost_total,2)}")

Approximate Cost, Dialogue = 	$ 0.45 	(32.1 %)
Approximate Cost, Prompting = 	$ 0.94 	(67.9 %)
---
Approximate Cost, Total = 	$ 1.39


In [14]:
df["emotion_chatgpt"] = ""

# OpenAI loop, consumes tokens from your OpenAI subscription
if OPENAI_API_TOKEN:
  for i, row in df.iterrows():
      result = llm(chat_prompt.format_messages(emotion_list=emotion_list, dialogue=row["dialog"]))
      row["emotion_chatgpt"] = result.content.lower()
      if
  file_name_cgpt_labelled = "cgpt_labelled.csv" # save file
  df.to_csv(file_name_cgpt_labelled)
  if MOUNT_GDRIVE:
    drive.mount('/content/gdrive')
    df.to_csv('/content/drive/' + file_name_cgpt_labelled)

# When the loop has already ran once, just download the previously saved result
if not OPENAI_API_TOKEN:
  !gdown 1s6rRXPvci4ITwfGGFnCIQHdHNqDw70oR
  clear_output()
  df = pd.read_csv('2.1 - cgpt_seasons_labelled.csv', index_col=0)

In [None]:
df.head()

In [16]:
df["emotion_chatgpt"].value_counts()

joy                 9798
neutral             4887
surprise            4585
fear                1961
anger               1942
sadness             1349
disgust              598
frustration           64
confusion             50
pain                  25
regret                21
concern               20
hope                  13
doubt                 12
jealous                9
disappointment         9
uncertainty            8
embarrassment          8
apology                6
impatience             6
relief                 6
jealousy               5
confused               3
curiosity              3
anxiety                2
gratitude              2
excitement             2
bored                  2
encouragement          2
envy                   2
self-deprecating       2
calm                   2
betrayal               1
determination          1
annoyance              1
appreciation           1
embarrassed            1
urgency                1
anxious                1
reassurance            1


In [22]:
len(df)

25120

In [20]:
# cut tail of unwanted emotions
df = df[df["emotion_chatgpt"].isin(emotions)]
df.to_csv("2.2 - cgpt_labelled_trim.csv")