In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import glob
import jsonlines
import json
import tiktoken
from collections import defaultdict
from openai import OpenAI
from tqdm import tqdm

In [11]:
# Add a new column to the df dataframe which contains the filename of the lyrics file
system_content = "You are a chatbot that, when prompted with song lyrics, predicts one of the emotions ('Happy', 'Sad', 'Angry', or 'Relaxed')\
    without providing any explanation. Reply with only the emotion name. You do not retain any previous information regarding the lyrics given to you. \
        You specialize in analysing the given song lyrics and predicting the emotion of the song."

In [17]:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo') 

In [None]:
data_path = 'test_promptV2.jsonl'
# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]
# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]['messages']:
    print(message)

In [19]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [20]:
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [21]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 161, 1136
mean / median: 435.1917808219178, 404.0
p5 / p95: 266.2, 639.4

#### Distribution of num_assistant_tokens_per_example:
min / max: 1, 2
mean / median: 1.5068493150684932, 2.0
p5 / p95: 1.0, 2.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [100]:
dataset[0]['messages'][2]['content']

'relaxed'

In [24]:
client = OpenAI(api_key="")

In [147]:
# Testing:
models = ['ft:gpt-3.5-turbo-1106:personal::8OVFs1AY',
          'ft:gpt-3.5-turbo-1106:personal::8Ph6V5tg',
          'ft:gpt-3.5-turbo-1106:personal::8QMX2BHf']
fine_tuned_model = models[2]
# Get new_prompt as test[0]['messages'][1]['content']
output = []
for i in range(len(dataset)):
    new_prompt = dataset[i]['messages'][1]['content']
    answer = client.chat.completions.create(
      model=fine_tuned_model,
      messages=[
          {"role": "system", "content": "You are a chatbot that when prompted with song lyrics, predict one of the emotions ('Happy', 'Sad', 'Angry', or 'Relaxed') without providing any explanation. Reply with only the emotion name."},
          {"role": "user", "content": new_prompt}
      ]
    )
    print(answer.choices[0].message, dataset[i]['messages'][2]['content'])
    output.append([answer.choices[0].message, dataset[i]['messages'][2]['content']])

ChatCompletionMessage(content='relaxed', role='assistant', function_call=None, tool_calls=None) relaxed
ChatCompletionMessage(content='relaxed', role='assistant', function_call=None, tool_calls=None) happy
ChatCompletionMessage(content='angry', role='assistant', function_call=None, tool_calls=None) angry
ChatCompletionMessage(content='sad', role='assistant', function_call=None, tool_calls=None) sad
ChatCompletionMessage(content='angry', role='assistant', function_call=None, tool_calls=None) angry
ChatCompletionMessage(content='relaxed', role='assistant', function_call=None, tool_calls=None) relaxed
ChatCompletionMessage(content='happy', role='assistant', function_call=None, tool_calls=None) happy
ChatCompletionMessage(content='happy', role='assistant', function_call=None, tool_calls=None) happy
ChatCompletionMessage(content='angry', role='assistant', function_call=None, tool_calls=None) angry
ChatCompletionMessage(content='sad', role='assistant', function_call=None, tool_calls=None) an

In [148]:
# Print the 3rd laat line of the output
print(output[-1])
# Print its corresponding line in the dataset
dataset[-1]

[ChatCompletionMessage(content='angry', role='assistant', function_call=None, tool_calls=None), 'angry']


{'messages': [{'role': 'system',
   'content': "You are a chatbot that, when prompted with song lyrics, predicts one of the emotions ('Happy', 'Sad', 'Angry', or 'Relaxed')    without providing any explanation. Reply with only the emotion name. You do not retain any previous information regarding the lyrics given to you.         You specialize in analysing the given song lyrics and predicting the emotion of the song."},
  {'role': 'user',
   'content': "[Instrumental Intro]\n\n[Verse 1]\nGenerals gathered in their masses\nJust like witches at black masses\nEvil minds that plot destruction\nSorcerers of death's construction\nIn the fields, the bodies burning\nAs the war machine keeps turning\nDeath and hatred to mankind\nPoisoning their brainwashed minds\nOh, Lord, yeah\n\n[Bridge]\nPoliticians hide themselves away\nThey only started the war\nWhy should they go out to fight?\nThey leave that all to the poor, yeah\nTime will tell on their power minds\nMaking war just for fun\nTreating pe

In [None]:
# I want to see which emotions are being predicted the most as compared to the actual emotion

# Create a dataframe with the output
df = pd.DataFrame(output, columns=['predicted', 'actual'])
# Iterate through predicted column and update the row with [i][0].content
for i in range(len(df)):
    df['predicted'][i] = df['predicted'][i].content
df

In [None]:
# Get number of values in each mood category actual
df['actual'].value_counts()

In [143]:
# NOw for each emotion in actual column, create a dictionary with 4 emotions and count the number of times each emotion is predicted
# Create a dictionary with 4 emotions
actual_happy = {'happy': 0, 'sad': 0, 'angry': 0, 'relaxed': 0}
actual_sad = {'happy': 0, 'sad': 0, 'angry': 0, 'relaxed': 0}
actual_angry = {'happy': 0, 'sad': 0, 'angry': 0, 'relaxed': 0}
actual_relaxed = {'happy': 0, 'sad': 0, 'angry': 0, 'relaxed': 0}

for i in range(len(df)):
    if df['actual'][i] == 'happy':
        actual_happy[df['predicted'][i]] += 1
    elif df['actual'][i] == 'sad':
        actual_sad[df['predicted'][i]] += 1
    elif df['actual'][i] == 'angry':
        actual_angry[df['predicted'][i]] += 1
    elif df['actual'][i] == 'relaxed':
        actual_relaxed[df['predicted'][i]] += 1

# Print eavh dictionary
print("Happy:", actual_happy)
print("Sad:", actual_sad)
print("Angry:", actual_angry)
print("Relaxed:", actual_relaxed)

Happy: {'happy': 13, 'sad': 1, 'angry': 2, 'relaxed': 8}
Sad: {'happy': 0, 'sad': 6, 'angry': 3, 'relaxed': 3}
Angry: {'happy': 1, 'sad': 3, 'angry': 14, 'relaxed': 1}
Relaxed: {'happy': 4, 'sad': 2, 'angry': 0, 'relaxed': 12}


In [149]:
# For each element in the array, check if output[i][0].content and output[i][1] are same to get accuracy
count = 0
for i in range(len(output)):
    # print(output[i][0].content, output[i][1])
    if output[i][0].content == output[i][1]:
        count += 1
    # IF not same, print both
    # else:
        # print(output[i][0].content, output[i][1])
print("Accuracy:", count/len(output))

# Calculate F1 score
from sklearn.metrics import f1_score
y_true = []
y_pred = []
for i in range(len(output)):
    y_true.append(output[i][1])
    y_pred.append(output[i][0].content)
f1_score(y_true, y_pred, average='weighted')

# Calculate precision and recall
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precision_score(y_true, y_pred, average='weighted', zero_division=1)
recall_score(y_true, y_pred, average='weighted', zero_division=1)
print("Prediction and Recall scores: ", precision_score(y_true, y_pred, average='weighted',zero_division=1), recall_score(y_true, y_pred, average='weighted',zero_division=0))

Accuracy: 0.6438356164383562
Prediction and Recall scores:  0.6616987914752874 0.6438356164383562


In [None]:
# Get confusion matrix
print(len(y_true), len(y_pred))
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred, labels=['happy', 'sad', 'angry', 'relaxed'])

In [151]:
# Get classification report
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, labels=['happy', 'sad', 'angry', 'relaxed']))

              precision    recall  f1-score   support

       happy       0.79      0.62      0.70        24
         sad       0.46      0.50      0.48        12
       angry       0.74      0.74      0.74        19
     relaxed       0.55      0.67      0.60        18

    accuracy                           0.64        73
   macro avg       0.63      0.63      0.63        73
weighted avg       0.66      0.64      0.65        73

