In [1]:
import datetime
import os
import time
import pandas as pd
import pickle as pkl
import transformers
import torch
import sqlite3
import streamlit as st
from dotenv import load_dotenv

from llama_index.llms import HuggingFaceLLM
from llama_index.llms import OpenAI

load_dotenv()

CACHE_DIR = "../.cache/"
TOKEN = os.getenv("HF_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MISTRAL_7B_INSTRUCT = os.getenv("MISTRAL_7B_INSTRUCT")
FINETUNED_MISTRAL = os.getenv("FINETUNED_MISTRAL")
DPO_MISTRAL = os.getenv("DPO_MISTRAL")

MODELS = [
    "Mistral-7B-Instruct",
    "Mistral-7B-Instruct-FT",
    "Mistral-7B-Instruct-DPO",
    "GPT-3.5-Turbo",
]

MODEL_NAMES_TO_ID = {
    "Mistral-7B-Instruct": MISTRAL_7B_INSTRUCT,
    "Mistral-7B-Instruct-FT": FINETUNED_MISTRAL,
    "Mistral-7B-Instruct-DPO": DPO_MISTRAL,
    "GPT-3.5-Turbo": "OpenAI",
}


In [3]:
def get_llm(model_name, token, cache_dir):
    if model_name.lower() == "openai":
        llm = OpenAI(
            temperature=0.7,
            model="gpt-3.5-turbo",
            max_tokens=250,
        )
        return llm

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_name, use_auth_token=token, cache_dir=cache_dir
    )

    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model_config = transformers.AutoConfig.from_pretrained(
        model_name,
        use_auth_token=token,
        cache_dir=cache_dir,
        pad_token_id=tokenizer.eos_token_id,
    )

    
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=250,
        generate_kwargs={"temperature": 0.7},
        tokenizer=tokenizer,
        model_name=model_name,
        device_map="cuda:0",
        model_kwargs={
            "trust_remote_code": True,
            "config": model_config,
            "quantization_config": bnb_config,
            "use_auth_token": token,
            "cache_dir": cache_dir,
        },
    )

    return llm



In [31]:

def answer_query(model_name, llm, query):
    start = time.time()
    if model_name.lower() == "openai":
        resp = llm.complete(query).text

    else:
        query = "<s>[INST] " + query + " [/INST]"
        resp = llm.complete(query).text
    
    print(query)
    return time.time() - start, resp


In [2]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [3]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [9]:
# import pickle as pkl
# data = pkl.load(open("../data/finance_dataset.pkl", 'rb'))
# data.head()

In [10]:
# data = data.sample(20)

In [11]:
# data.loc[:, 'input'] = data.loc[:,'text'].apply(lambda x: x.split('[/INST]')[0].split("[INST]")[1].strip())
# data.loc[:, 'ground_truth'] = data.loc[:,'text'].apply(lambda x: x.split('[/INST]')[1].strip())

In [4]:
data = pd.read_csv("../data/eval_results.csv")
data.head()

Unnamed: 0,text,input,ground_truth,Mistral-7B-Instruct_time,Mistral-7B-Instruct_pred,Mistral-7B-Instruct-FT_time,Mistral-7B-Instruct-FT_pred,Mistral-7B-Instruct-DPO_time,Mistral-7B-Instruct-DPO_pred,GPT-3.5-Turbo_time,...,bleu_gpt,cos,cos_ft,cos_dpo,cos_gpt,rouge_score,rouge,rouge_ft,rouge_dpo,rouge_gpt
0,<s>[INST] How to calculate how much a large st...,How to calculate how much a large stock positi...,Something like cost = a × avg_spreadb + c × vo...,19.594621,Calculating the value of a large stock positio...,21.70946,The value of a stock is the present value of t...,20.583118,The value of a stock is the price of the stock...,24.189893,...,6.97662e-232,0.696923,0.549437,0.610912,0.740093,"[[{'rouge-1': {'r': 0.19047619047619047, 'p': ...",0.066667,0.041667,0.109091,0.067797
1,<s>[INST] How does giving to charity work?[/IN...,How does giving to charity work?,"For many people, giving to charity will have m...",18.721741,"Giving to charity involves donating money, tim...",20.429449,The charity will have a bank account. You can...,20.623353,The charity will send you a receipt for your d...,16.302453,...,6.54215e-79,0.9431,0.840219,0.823374,0.950867,"[[{'rouge-1': {'r': 0.1875, 'p': 0.29464285714...",0.066667,0.041667,0.109091,0.067797
2,<s>[INST] How to distinguish gift from payment...,How to distinguish gift from payment for the s...,"Generally, a one time thing is considered a gi...",13.274956,A gift is a voluntary act of kindness or gener...,20.437593,"If you are in the US, you can use PayPal's ""gi...",20.650854,The IRS has a publication that covers this top...,24.315017,...,1.276195e-78,0.966505,0.768648,0.869794,0.951078,"[[{'rouge-1': {'r': 0.1724137931034483, 'p': 0...",0.066667,0.041667,0.109091,0.067797
3,<s>[INST] Why are daily rebalanced inverse/lev...,Why are daily rebalanced inverse/leveraged ETF...,Fund rebalancing typically refers to changing ...,19.429889,Daily rebalanced inverse/leveraged ETFs can be...,16.946284,The problem with leveraged ETFs is that they a...,20.645292,The problem with daily rebalanced ETFs is that...,24.288378,...,2.5086559999999998e-155,0.932604,0.949547,0.786272,0.936401,"[[{'rouge-1': {'r': 0.11428571428571428, 'p': ...",0.066667,0.041667,0.109091,0.067797
4,<s>[INST] Credit card transactions for persona...,Credit card transactions for personal finances,I use mint.com for tracking my finances. It w...,19.433147,Credit card transactions for personal finances...,20.398063,"I'm not sure what you mean by ""personal financ...",20.577906,"I'm not sure what you mean by ""personal financ...",31.4598,...,2.400479e-155,0.914714,0.907364,0.889086,0.915597,"[[{'rouge-1': {'r': 0.225, 'p': 0.162162162162...",0.066667,0.041667,0.109091,0.067797


In [5]:
# model = MODELS[3]
# model_name = MODEL_NAMES_TO_ID[model]
# print(model)

In [6]:
# llm = get_llm(model_name, TOKEN, CACHE_DIR)

In [7]:
# data.loc[:, model] = data['input'].apply(lambda x: answer_query(model_name, llm, x))

In [8]:
# data[[model+'_time', model + "_pred"]] = data[model].to_list()

In [9]:
# data = data.drop(columns=[model])

In [10]:
# data

In [11]:
# data.to_csv("../data/eval_results.csv", index=False)

In [12]:
# data['ground_truth'] = data['ground_truth'].apply(lambda x: x.split("</s>")[0])

In [13]:
def text_to_embeddings(text):
   
    nlp = spacy.load("en_core_web_md") 
    doc = nlp(text)

    return doc.vector

In [14]:

# data

In [19]:
import sentence_transformers
from sentence_similarity import sentence_similarity

model_1 = "sentence-transformers/all-MiniLM-L6-v2"
def compare_sentences(sentence_1=str, sentence_2=str, model_name=model_1, embedding_type="cls_token_embedding", metric="cosine") -> str:
    """Utilizes an NLP model that calculates the similarity between 
    two sentences or phrases."""

    model = sentence_similarity(model_name=model_name, embedding_type=embedding_type)
    score = model.get_score(sentence_1, sentence_2, metric=metric)
    return score
    # return(f"Comparison Score between '{sentence_1}' and '{sentence_2}': {score}")


# sentence_1 = "rivers woods and hills"
# sentence_2 = "streams forests and mountains"
# sentence_3 = "deserts sand and shrubs"

# print(compare_sentences(sentence_1=sentence_1, sentence_2=sentence_2, model_name=model_1)) # Yields 0.84
# print(compare_sentences(sentence_1=sentence_1, sentence_2=sentence_3, model_name=model_1)) # Yields 0.631
# print(compare_sentences(sentence_1=sentence_2, sentence_2=sentence_3, model_name=model_1)) # Yields 0.576

In [17]:
pred_cols = [data.columns[i] for i in range(len(data.columns)) if data.columns[i].endswith("pred")]
pred_cols

['Mistral-7B-Instruct_pred',
 'Mistral-7B-Instruct-FT_pred',
 'Mistral-7B-Instruct-DPO_pred',
 'GPT-3.5-Turbo_pred']

In [20]:
def func(row):
    row['sim'] = compare_sentences(row[pred_cols[0]], row['ground_truth'])
    row['sim_ft'] = compare_sentences(row[pred_cols[1]], row['ground_truth'])
    row['sim_dpo'] = compare_sentences(row[pred_cols[2]], row['ground_truth'])
    row['sim_gpt'] = compare_sentences(row[pred_cols[3]], row['ground_truth'])
    
    return row

In [21]:
data = data.apply(func, axis=1)

In [23]:
data[['sim', 'sim_ft', 'sim_dpo', 'sim_gpt']].median()

sim        0.872
sim_ft     0.848
sim_dpo    0.842
sim_gpt    0.877
dtype: float64

In [149]:
def calculate_bleu_scores(answers, ground_truth):
    
    reference_tokens = ground_truth.split()
    
    answer_tokens = []
    for answer in answers:
        answer_tokens.append(answer.split())
        
    scores = []
    for answer in answer_tokens:
        scores.append(sentence_bleu([reference_tokens], 
                                    answer, 
                                    # weights=(0.25, 0.5, 0.25),
                                    auto_reweigh=True))

    return scores

def calculate_cosine_similarity(answers, ground_truth):
    
    reference_emb = text_to_embeddings(ground_truth).reshape(1, -1)
    
    answer_embs = []
    for answer in answers:
        answer_embs.append(text_to_embeddings(answer).reshape(1, -1))
        
    scores = []
    for emb in answer_embs:
        scores.append(cosine_similarity(emb, reference_emb)[0][0])

    return scores

def calculate_rouge(answers, ground_truth):
    
    rouge= Rouge()
    # reference_tokens = ground_truth.split()
    
    # answer_tokens = []
    # for answer in answers:
    #     answer_tokens.append(answer.split())
        
    scores = []
    for answer in answers:
        scores.append(rouge.get_scores(answer, ground_truth))
        
    return scores

In [51]:
pred_cols_idx = [i for i in range(len(data.columns)) if data.columns[i].endswith("pred")]
pred_cols_idx

[4, 6, 8, 10]

In [27]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/eklavya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eklavya/nltk_data...


True

In [34]:
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize

reference = data.loc[5, 'ground_truth']
hypothesis = data.loc[5, pred_cols].values

def calculate_meteor(hypothesis, reference):
    
    reference_tokens = word_tokenize(reference)
    scores = []
    for hyp in hypothesis:
        
        hypothesis_tokens = word_tokenize(hyp)
        scores.append(meteor_score.meteor_score([reference_tokens], hypothesis_tokens))
    
    return scores

In [35]:
scores

[0.25599128540305005,
 0.26106223525578365,
 0.12757127996331136,
 0.19509476031215164]

In [36]:
def meteor_func(row):
    row['meteor_score'] = calculate_meteor(row[pred_cols].values, row['ground_truth'])
    return row

data = data.apply(meteor_func, axis = 1)


In [37]:
data[['meteor', 'meteor_ft', 'meteor_dpo', 'meteor_gpt']] = data['meteor_score'].to_list()

In [39]:
data[['meteor', 'meteor_ft', 'meteor_dpo', 'meteor_gpt']].mean()

meteor        0.189042
meteor_ft     0.159577
meteor_dpo    0.150882
meteor_gpt    0.202650
dtype: float64

In [150]:
def bleu_func(row):
    row['bleu_score'] = calculate_bleu_scores(row[pred_cols].values, row['ground_truth'])
    return row

In [151]:
# data['bleu_scores'] = data.apply(lambda row: calculate_bleu_scores(row[pred_cols_idx].values, row[2]))

data = data.apply(bleu_func, axis=1)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [127]:
def cos_func(row):
    row['cos_score'] = calculate_cosine_similarity(row[pred_cols[0]], row['ground_truth'])
    return row

In [None]:
data['cos_score'].mean()

In [105]:
def rouge_func(row):
    row['rouge_score'] = calculate_rouge(row[pred_cols].values, row['ground_truth'])
    return row

data = data.apply(rouge_func, axis=1)

In [120]:
data[['rouge', 'rouge_ft', 'rouge_dpo', 'rouge_gpt']] = data['rouge_score'].to_list()

In [122]:
data.loc[:,'rouge'] = data['rouge'][0][0]['rouge-l']['f']
data.loc[:,'rouge_ft'] = data['rouge_ft'][0][0]['rouge-l']['f']
data.loc[:,'rouge_dpo'] = data['rouge_dpo'][0][0]['rouge-l']['f']
data.loc[:,'rouge_gpt'] = data['rouge_gpt'][0][0]['rouge-l']['f']

In [123]:
data

Unnamed: 0,text,input,ground_truth,Mistral-7B-Instruct_time,Mistral-7B-Instruct_pred,Mistral-7B-Instruct-FT_time,Mistral-7B-Instruct-FT_pred,Mistral-7B-Instruct-DPO_time,Mistral-7B-Instruct-DPO_pred,GPT-3.5-Turbo_time,...,bleu_gpt,cos,cos_ft,cos_dpo,cos_gpt,rouge_score,rouge,rouge_ft,rouge_dpo,rouge_gpt
0,<s>[INST] How to calculate how much a large st...,How to calculate how much a large stock positi...,Something like cost = a × avg_spreadb + c × vo...,19.594621,Calculating the value of a large stock positio...,21.70946,The value of a stock is the present value of t...,20.583118,The value of a stock is the price of the stock...,24.189893,...,6.97662e-232,0.696923,0.549437,0.610912,0.740093,"[[{'rouge-1': {'r': 0.19047619047619047, 'p': ...",0.066667,0.041667,0.109091,0.067797
1,<s>[INST] How does giving to charity work?[/IN...,How does giving to charity work?,"For many people, giving to charity will have m...",18.721741,"Giving to charity involves donating money, tim...",20.429449,The charity will have a bank account. You can...,20.623353,The charity will send you a receipt for your d...,16.302453,...,6.54215e-79,0.9431,0.840219,0.823374,0.950867,"[[{'rouge-1': {'r': 0.1875, 'p': 0.29464285714...",0.066667,0.041667,0.109091,0.067797
2,<s>[INST] How to distinguish gift from payment...,How to distinguish gift from payment for the s...,"Generally, a one time thing is considered a gi...",13.274956,A gift is a voluntary act of kindness or gener...,20.437593,"If you are in the US, you can use PayPal's ""gi...",20.650854,The IRS has a publication that covers this top...,24.315017,...,1.276195e-78,0.966505,0.768648,0.869794,0.951078,"[[{'rouge-1': {'r': 0.1724137931034483, 'p': 0...",0.066667,0.041667,0.109091,0.067797
3,<s>[INST] Why are daily rebalanced inverse/lev...,Why are daily rebalanced inverse/leveraged ETF...,Fund rebalancing typically refers to changing ...,19.429889,Daily rebalanced inverse/leveraged ETFs can be...,16.946284,The problem with leveraged ETFs is that they a...,20.645292,The problem with daily rebalanced ETFs is that...,24.288378,...,2.5086559999999998e-155,0.932604,0.949547,0.786272,0.936401,"[[{'rouge-1': {'r': 0.11428571428571428, 'p': ...",0.066667,0.041667,0.109091,0.067797
4,<s>[INST] Credit card transactions for persona...,Credit card transactions for personal finances,I use mint.com for tracking my finances. It w...,19.433147,Credit card transactions for personal finances...,20.398063,"I'm not sure what you mean by ""personal financ...",20.577906,"I'm not sure what you mean by ""personal financ...",31.4598,...,2.400479e-155,0.914714,0.907364,0.889086,0.915597,"[[{'rouge-1': {'r': 0.225, 'p': 0.162162162162...",0.066667,0.041667,0.109091,0.067797
5,<s>[INST] If I short-sell a dividend-paying st...,"If I short-sell a dividend-paying stock, do I ...",You could hold a long position in some company...,10.480493,"No, if you short-sell a dividend-paying stock,...",20.391924,"If you short a stock, you are borrowing the sh...",20.546678,"If you short-sell a stock, you are borrowing t...",6.249939,...,1.2806740000000001e-231,0.966522,0.941455,0.869839,0.886174,"[[{'rouge-1': {'r': 0.3090909090909091, 'p': 0...",0.066667,0.041667,0.109091,0.067797
6,<s>[INST] Is there any sort of tax write off f...,Is there any sort of tax write off for unfulfi...,"Unfortunately, no. Think about the numbers. I...",17.396621,"In the United States, there is no federal tax ...",20.347088,You can deduct the amount of the unpaid wages ...,20.541429,"If you are an employee, you are not responsibl...",8.85888,...,3.409104e-155,0.802372,0.703001,0.92351,0.857717,"[[{'rouge-1': {'r': 0.3235294117647059, 'p': 0...",0.066667,0.041667,0.109091,0.067797
7,<s>[INST] How can I make a profit by selling a...,How can I make a profit by selling a stock short?,"Being ""long"" - expecting the price to go up to...",19.514729,"To make a profit by selling a stock short, you...",20.377298,You can make a profit by selling a stock short...,17.113906,You can make a profit by selling a stock short...,18.74829,...,1.0440379999999999e-78,0.866876,0.753148,0.743682,0.889509,"[[{'rouge-1': {'r': 0.38333333333333336, 'p': ...",0.066667,0.041667,0.109091,0.067797
8,<s>[INST] What is the best way to make a bet t...,What is the best way to make a bet that a cert...,Specific stock advice isn't permitted on these...,8.068148,The best way to make a bet that a certain stoc...,20.30829,You can buy a call option. The option gives y...,20.532594,You can buy a call option. The call option gi...,14.906996,...,3.067778e-155,0.917332,0.910891,0.893317,0.891173,"[[{'rouge-1': {'r': 0.12643678160919541, 'p': ...",0.066667,0.041667,0.109091,0.067797
9,<s>[INST] U.S. Mutual Fund Supermarkets: Where...,U.S. Mutual Fund Supermarkets: Where are some ...,There are hundreds of entities which offer mut...,19.462602,There are several reputable online platforms w...,6.161905,I've been using Vanguard for a while now. The...,20.578992,"I've used Vanguard, Fidelity, and T. Rowe Pric...",26.220075,...,1.0281869999999999e-78,0.916215,0.809101,0.757352,0.917354,"[[{'rouge-1': {'r': 0.23140495867768596, 'p': ...",0.066667,0.041667,0.109091,0.067797


In [152]:
data[['bleu', 'bleu_ft', 'bleu_dpo', 'bleu_gpt']] = data['bleu_score'].to_list()
# data[['cos', 'cos_ft', 'cos_dpo', 'cos_gpt']] = data['cos_score'].to_list()

In [153]:
data[['bleu', 'bleu_ft', 'bleu_dpo', 'bleu_gpt']].mean()

bleu        3.384989e-79
bleu_ft     1.783040e-79
bleu_dpo    1.464163e-03
bleu_gpt    1.209390e-03
dtype: float64

In [154]:
data[['bleu', 'bleu_ft', 'bleu_dpo', 'bleu_gpt']].median()

bleu        3.807747e-155
bleu_ft     3.074880e-155
bleu_dpo    2.286604e-155
bleu_gpt    3.143742e-155
dtype: float64

In [157]:
data[['cos', 'cos_ft', 'cos_dpo', 'cos_gpt']].mean()

cos        0.898617
cos_ft     0.846727
cos_dpo    0.849236
cos_gpt    0.893007
dtype: float64

In [158]:
data[['cos', 'cos_ft', 'cos_dpo', 'cos_gpt']].median()

cos        0.916773
cos_ft     0.883412
cos_dpo    0.868868
cos_gpt    0.903385
dtype: float64

In [159]:
data[['rouge', 'rouge_ft', 'rouge_dpo', 'rouge_gpt']].mean()

rouge        0.066667
rouge_ft     0.041667
rouge_dpo    0.109091
rouge_gpt    0.067797
dtype: float64

In [160]:
data[['rouge', 'rouge_ft', 'rouge_dpo', 'rouge_gpt']].median()

rouge        0.066667
rouge_ft     0.041667
rouge_dpo    0.109091
rouge_gpt    0.067797
dtype: float64

In [161]:
time_cols = [col for col in data.columns if col.endswith("time")]
data[time_cols].mean()

Mistral-7B-Instruct_time        15.209582
Mistral-7B-Instruct-FT_time     18.874608
Mistral-7B-Instruct-DPO_time    19.094086
GPT-3.5-Turbo_time              18.141619
dtype: float64

In [162]:
data[time_cols].median()

Mistral-7B-Instruct_time        18.811991
Mistral-7B-Instruct-FT_time     20.383641
Mistral-7B-Instruct-DPO_time    20.554639
GPT-3.5-Turbo_time              16.118497
dtype: float64

In [164]:
data.to_csv("../data/eval_results.csv", index=False)