In [1]:
# Load the jupyter_ai IPython extension into this kernel session.
%load_ext jupyter_ai



In [133]:
import os

import argparse
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from pprint import PrettyPrinter
# Shared pretty-printer (2-space indent) for displaying nested containers.
pp = PrettyPrinter(indent=2)

In [4]:
import logging

# Notebook-wide logger that writes DEBUG-and-above records to the console.
logger = logging.getLogger('llama2_results')
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# `logging.getLogger` returns the same logger object on every call, so
# unconditionally calling `addHandler` each time this cell is re-run would
# stack handlers and print every message several times. Reuse the console
# handler when one is already attached and only create it on the first run.
consoleHandler = next(
    (h for h in logger.handlers if isinstance(h, logging.StreamHandler)),
    None,
)
if consoleHandler is None:
    consoleHandler = logging.StreamHandler()
    logger.addHandler(consoleHandler)
consoleHandler.setLevel(logging.DEBUG)
consoleHandler.setFormatter(formatter)

In [6]:
# Project root: two directory levels above the current working directory.
ROOT_DIRECTORY = Path().resolve().parent.parent
logger.info(f"Root directory: {str(ROOT_DIRECTORY)}")
# Put the root at the front of the import path, but only if it is not
# already listed, so re-running the cell does not add duplicates.
if sys.path.count(str(ROOT_DIRECTORY)) == 0:
    sys.path.insert(0, str(ROOT_DIRECTORY))

2023-08-10 20:12:59,421 - llama2_results - INFO - Root directory: /Users/glenn/PycharmProjects/zero-shot-finance


In [106]:
# Task whose LLM outputs are analysed; also used as a directory name under data/.
task_name = 'sentiment_analysis'
# Quantization settings; each one names a sub-directory of llm_prompt_outputs.
quantizations = ['fp16', 'bf16', 'int8', 'fp4']
# Hub ids of the Llama-2 chat checkpoints (7B, 13B and 70B).
model_ids = [f"meta-llama/Llama-2-{n}b-chat-hf" for n in (7, 13, 70)]
# Short model names: the part of the hub id after the organisation prefix.
model_names = [model_id.split('/')[-1] for model_id in model_ids]
# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable instead (None when the variable is unset).
# The token that used to be written here literally must be treated as leaked
# and revoked in the Hugging Face account settings.
hf_auth = os.environ.get("HF_TOKEN")

In [91]:
def extract_label(text_output, E_INST = "[/INST]"):
    """Decode the sentiment label from a model's raw text output.

    Only the text after the first occurrence of the end-of-instruction marker
    is searched, so label words that appear in the prompt itself cannot be
    mistaken for the model's answer. When the marker is absent, the whole
    string is searched.

    Args:
        text_output: Full text returned by the model (prompt + response).
        E_INST: Marker separating the instruction from the response.

    Returns:
        0 for "label: positive", 1 for "label: negative", 2 for
        "label: neutral" (case-insensitive), or -1 when none is present.
        If several labels occur, the first one in that order wins,
        regardless of where each appears in the text.
    """
    # Find the 'end of instruction' token and remove text before it
    response_pos = text_output.find(E_INST)
    if response_pos == -1:
        # `find` returns -1 when the marker is missing; slicing with
        # -1 + len(E_INST) would silently drop the start of the text.
        generated_text = text_output
    else:
        generated_text = text_output[response_pos + len(E_INST):]

    # Convert the response to lowercase for case-insensitive search
    text = generated_text.strip().lower()

    # Position in this list is the integer class id that is returned
    substrings = ["label: positive", "label: negative", "label: neutral"]

    # Iterate over the substrings and find the matching label
    for i, substring in enumerate(substrings):
        if substring in text:
            return i

    # If none of the substrings are found, return -1
    return -1

In [104]:
def compute_metrics(files, outputs_directory):
    """Score the model outputs stored in a set of CSV files.

    Each CSV is read for its ``text_output`` and ``true_label`` columns.
    The predicted label is decoded from ``text_output`` with
    ``extract_label`` and compared against ``true_label``.

    Args:
        files: File names inside ``outputs_directory``. A name may be given
            with or without its ``.csv`` extension.
        outputs_directory: Directory that contains the CSV files.

    Returns:
        Three lists with one entry per file, in the order of ``files``:
        accuracy, weighted F1 score, and the percentage of rows whose label
        could not be decoded (``generated_label == -1``).

    Raises:
        ValueError: If any row's ``text_output`` is missing or does not
            contain the substring ``'Label:'``.
    """
    acc_list = []
    f1_list = []
    missing_perc_list = []

    for file in files:
        csv_path = Path(outputs_directory) / file
        # Callers may pass bare stems (name without extension); append the
        # extension rather than replacing a suffix, so stems that contain
        # dots are left intact.
        if csv_path.suffix != ".csv":
            csv_path = csv_path.with_name(csv_path.name + ".csv")
        df = pd.read_csv(csv_path)

        # Make sure the 'Label:' was provided in all generated text.
        # na=False makes missing responses count as "no label"; without it
        # NaN is truthy and would slip through the check.
        has_label = df["text_output"].str.contains("Label:", regex=False, na=False)
        if not has_label.all():
            raise ValueError("not all responses contain the substring 'Label:'")

        # Decode the predicted label
        df["generated_label"] = df["text_output"].apply(extract_label)

        # Calculate metrics
        acc_list.append(accuracy_score(df["true_label"], df["generated_label"]))
        f1_list.append(
            f1_score(df["true_label"], df["generated_label"], average="weighted")
        )
        # Share of rows (in percent) where no label could be decoded
        missing_perc_list.append(
            (len(df[df["generated_label"] == -1]) / df.shape[0]) * 100.0
        )

    return acc_list, f1_list, missing_perc_list

In [113]:
# Map model name -> quantization -> list of output-file stems found on disk.
results = {}
for model_name in model_names:
    results[model_name] = {}
    for quantization in quantizations:
        # Define output directory
        LLM_OUTPUTS_DIRECTORY = (
            ROOT_DIRECTORY
            / "data"
            / task_name
            / "llm_prompt_outputs"
            / quantization
        )
        if LLM_OUTPUTS_DIRECTORY.is_dir():
            # Filter out relevant files. `iterdir()` yields entries in
            # arbitrary order, so sort to make the listing reproducible.
            files = sorted(
                f.stem
                for f in LLM_OUTPUTS_DIRECTORY.iterdir()
                if model_name in f.name and f.suffix == ".csv"
            )
        else:
            # A quantization that has not been run yet has no directory;
            # record it as empty instead of aborting the whole scan.
            logger.warning(f"Output directory not found: {LLM_OUTPUTS_DIRECTORY}")
            files = []
        results[model_name][quantization] = files

In [134]:
# Show the files that were collected for every model / quantization pair.
pp.pprint(results)

{ 'Llama-2-13b-chat-hf': { 'bf16': [ 'Llama-2-13b-chat-hf_944601_10_08_2023_41',
                                     'Llama-2-13b-chat-hf_78516_10_08_2023_41',
                                     'Llama-2-13b-chat-hf_5768_10_08_2023_41'],
                           'fp16': [ 'Llama-2-13b-chat-hf_78516_10_08_2023_72',
                                     'Llama-2-13b-chat-hf_5768_10_08_2023_70',
                                     'Llama-2-13b-chat-hf_944601_10_08_2023_71'],
                           'fp4': [ 'Llama-2-13b-chat-hf_5768_10_08_2023_37',
                                    'Llama-2-13b-chat-hf_78516_10_08_2023_37',
                                    'Llama-2-13b-chat-hf_944601_10_08_2023_36'],
                           'int8': [ 'Llama-2-13b-chat-hf_5768_10_08_2023_118',
                                     'Llama-2-13b-chat-hf_78516_10_08_2023_121']},
  'Llama-2-70b-chat-hf': { 'bf16': [],
                           'fp16': [],
                           'fp4': [ 'Ll

In [139]:
# Print one status line per model / quantization pair, based on how many
# output files were collected for it (three files means the pair is complete).
for model_name in model_names:
    for quantization in quantizations:
        files = results[model_name][quantization]
        n_files = len(files)
        if n_files == 0:
            status = 'EMPTY'
        elif n_files == 3:
            status = 'DONE'
        elif n_files < 3:
            status = 'WIP'
        else:
            # More files than expected
            status = 'ERROR!'
        print(status, model_name, quantization)

DONE Llama-2-7b-chat-hf fp16
DONE Llama-2-7b-chat-hf bf16
DONE Llama-2-7b-chat-hf int8
DONE Llama-2-7b-chat-hf fp4
DONE Llama-2-13b-chat-hf fp16
DONE Llama-2-13b-chat-hf bf16
WIP Llama-2-13b-chat-hf int8
DONE Llama-2-13b-chat-hf fp4
EMPTY Llama-2-70b-chat-hf fp16
EMPTY Llama-2-70b-chat-hf bf16
EMPTY Llama-2-70b-chat-hf int8
DONE Llama-2-70b-chat-hf fp4


In [105]:
# NOTE(review): `files` and `LLM_OUTPUTS_DIRECTORY` are not set in this cell;
# they are whatever earlier cells last left in the kernel namespace. Confirm
# they refer to the intended model / quantization before trusting the numbers.
acc_list, f1_list, missing_perc_list = compute_metrics(files, LLM_OUTPUTS_DIRECTORY)

# Summarise the per-file scores: mean and spread of the weighted F1, plus the
# average share of responses from which no label could be decoded.
print("f1 score mean: ", f"{np.mean(f1_list):.4f}")
print("f1 score std: ", f"{np.std(f1_list):.4f}")
print(
    "Percentage of cases when didn't follow instruction: ",
    f"{np.mean(missing_perc_list):.4f}",
    "\n",
)

f1 score mean:  0.8818
f1 score std:  0.0151
Percentage of cases when didn't follow instruction:  0.0000 

