In [None]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM
)
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
tokenizer = AutoTokenizer.from_pretrained("oopere-FinChat-XS")
base_model = AutoModelForCausalLM.from_pretrained("oopere/FinChat-XS")
buffetAI = AutoModelForCausalLM.from_pretrained("oopere-FinChat-XS")

In [29]:
df = pd.read_csv('../../data/buffet_qna_finetune_ready.csv')
df = df.dropna()

# Identify rare classes
class_counts = df['Section'].value_counts()
rare_classes = class_counts[class_counts < 11].index
print(rare_classes)

# Combine rare classes into 'Other'
df['Section'] = df['Section'].replace(rare_classes, 'Other')

# Update Section_code after modification
df['Section'] = df['Section'].astype('category')
df['Section_code'] = df['Section'].cat.codes

df['Section'] = df['Section'].astype('category')
df['Section_code'] = df['Section'].cat.codes

# Split the dataset into training, validation, and test sets, proportionately by Sections column
# This will ensure that each split has a similar distribution of the Sections column

train_df, test_df = train_test_split(df, test_size=0.05, stratify=df['Section_code'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['Section_code'], random_state=42)

Index(['The Investment Industry', 'Technology', 'Education', 'Gold'], dtype='object', name='Section')


In [None]:
def buffett_answer(question, max_new_tokens=256):
    prompt = f"Question: {question}\nAnswer:"
    messages = [
        {"role": "user", "content": prompt}
    ]   
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    with torch.no_grad():
        output_ids = base_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,  # clearly defined stop token
            pad_token_id=tokenizer.eos_token_id   # to handle padding gracefully
        )
    inputs = tokenizer(prompt, return_tensors="pt").to(buffetAI.device)
    with torch.no_grad():
        buffet_output_ids = buffetAI.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,  # clearly defined stop token
            pad_token_id=tokenizer.eos_token_id   # to handle padding gracefully
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True), tokenizer.decode(buffet_output_ids[0], skip_special_tokens=True)



----------Normal Response----------
system
You are a helpful AI assistant named SmolLM, trained by Hugging Face
user
Question: Do you have any investing tips?
Answer:
assistant
Investment advice can be obtained from various resources. The most common ones include the FINRA Investor Education Foundation and NerdWallet's Investing for Beginners section of their website. Other sources may also provide useful information on investment strategies that could help one achieve financial stability. 

It is important to consult with an expert such as your own financial advisor or someone who has experience in finance before making decisions about investments. It would not only benefit you but others too if everyone had access to reliable sources which advise accordingly without causing unnecessary harm. 
Additionally, there should always consider how much risk one wants to take on when deciding what type of stocks they want to invest in. One must ensure it doesnt compromise future goals based so

## BLEU evaluation

In [None]:
# Generate predictions
base_preds = []
buffet_preds = []
references = []

for i, row in test_df.iterrows():
    question = row['Questions']
    reference = row['Answers']

    base_resp, buffet_resp = buffett_answer(question)
    
    # Append generated and reference responses
    base_preds.append(base_resp.replace(f"Question: {question}\nAnswer:", "").strip())
    buffet_preds.append(buffet_resp.replace(f"Question: {question}\nAnswer:", "").strip())
    references.append([reference.strip()])  # BLEU expects list of references per prediction

#save the preds
base_preds_df = pd.DataFrame(base_preds, columns=["Base_Predictions"])
buffet_preds_df = pd.DataFrame(buffet_preds, columns=["Buffet_Predictions"])

In [35]:
def clean_prompt(text):
    # Split on 'assistant' and take the part after
    parts = text.split("assistant\n", 1)
    return parts[1].strip() if len(parts) > 1 else text.strip()

# Apply to both prediction DataFrames
base_preds_df['Base_Predictions'] = base_preds_df['Base_Predictions'].apply(clean_prompt)
buffet_preds_df['Buffet_Predictions'] = buffet_preds_df['Buffet_Predictions'].apply(clean_prompt)

In [36]:
#save the preds
base_preds_df.to_csv('base_preds.csv', index=False)
buffet_preds_df.to_csv('buffet_preds.csv', index=False)

In [37]:
import torch
import pandas as pd
import evaluate

# Load BLEU metric
bleu = evaluate.load("bleu")


# Compute BLEU scores
base_bleu = bleu.compute(predictions=base_preds, references=references)
buffet_bleu = bleu.compute(predictions=buffet_preds, references=references)

# Show results
print("Base Model BLEU Score:", base_bleu["bleu"])
print("BuffetAI BLEU Score:", buffet_bleu["bleu"])

Base Model BLEU Score: 0.0065743126011524496
BuffetAI BLEU Score: 0.006237590236604703


In [10]:
import requests
import json
import dotenv
from groq import Groq  # Import the Groq API client library
dotenv.load_dotenv()
import os
# Initialize the Groq client using your API key.
client = Groq(api_key=os.environ.get("GROQAPIKEY"))
# Endpoints for the Ollama and baseline model API calls.
OLLAMA_API_URL = "http://localhost:11434/api/generate"
BASELINE_API_URL = "http://localhost:11434/api/generate"

In [19]:
def call_ollama(query: str) -> str:
    """
    Directly calls the finetuned local Ollama model endpoint.
    
    The query is combined with a Warren Buffett persona instruction
    without any extra documents appended.
    """
    # Create the prompt by prepending the persona instruction.
    formatted_prompt = (
        "You are Warren Buffett, the CEO of Berkshire Hathaway. " +
        query
    )
    data = {
        "model": "ollama_buffett",  # Name of your finetuned local model.
        "prompt": formatted_prompt,
        "stream": False
    }
    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        json_response = response.json()
        return json_response.get("response", "")
    except requests.RequestException as e:
        print(f"Error calling Ollama: {e}")
        return f"Error generating response: {str(e)}"

def call_baseline(query: str) -> str:
    """
    Directly calls a baseline model endpoint for response generation.
    
    The query is combined with a similar Warren Buffett instruction but
    without any appended documents.
    """
    formatted_prompt = (
        "You are Warren Buffett, the CEO of Berkshire Hathaway. " +
        query
    )
    data = {
        "model": "ollama_baseline",  # Identifier for the baseline model.
        "prompt": formatted_prompt,
        "stream": False
    }
    try:
        response = requests.post(BASELINE_API_URL, json=data)
        response.raise_for_status()
        json_response = response.json()
        return json_response.get("response", "")
    except requests.RequestException as e:
        print(f"Error calling baseline model: {e}")
        return f"Error generating response: {str(e)}"

def evaluate_response_groq(query: str, response_text: str) -> dict:
    """
    Evaluates a generated response using the Groq API.
    
    The evaluation prompt asks the LLM judge to assign scores on:
      1. Query Relevance (0-3)
      2. Data Accuracy (0-3)
      3. Clarity (0-3)
      4. Buffet-Likeness (0-5)
    
    The overall_score is the average of these scores.
    """
    evaluation_prompt = f"""
	Evaluate the following response based on these criteria:

	1. Query Relevance (0-3): How well does the response answer the query: "{query}"?
	2. Data Accuracy (0-3): How factually correct is the response?
	3. Clarity (0-3): How clear and easy to understand is the response?
	4. Buffet-Likeness (0-5): How well does the response capture Warren Buffett’s style, wisdom, and personality?

	Return your evaluation as a JSON object with the following keys:
	"query_relevance", "data_accuracy", "clarity", "buffet_likeness", and "overall_score" (average of the above scores).

	Response to evaluate:
	{response_text}
	""".strip()
    
    messages = [
        {"role": "user", "content": evaluation_prompt}
    ]
    
    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b-versatile",  # Use the Groq model as specified in your reference.
            temperature=0.1,
            max_tokens=70
            
        )
        # The API returns a response object. Here we assume the evaluation result is contained 
        # in chat_completion.choices[0].message.content as a JSON string.
        result_content = chat_completion.choices[0].message.content
        print(f"Evaluation result: {result_content}")
        return json.loads(result_content)
    except Exception as e:
        print(f"Error during evaluation with Groq: {e}")
        return {"error": str(e)}


In [20]:
test_queries = [
	"Who is Warren Buffett and what is your investment philosophy?",
	# "Tell me about the latest trades of Berkshire Hathaway.",
	# "How do you view the current market conditions?"
]

for query in test_queries:
	print(f"\nQuery: {query}\n{'-' * 60}")
	
	# Get response from the finetuned local Ollama model.
	ollama_response = call_ollama(query)
	print("Ollama Response:")
	print(ollama_response)
	
	# Evaluate the Ollama response.
	ollama_eval = evaluate_response_groq(query, ollama_response)
	print("Ollama Evaluation:")
	print(json.dumps(ollama_eval, indent=2))
	
	# # Get response from the baseline model.
	# baseline_response = call_baseline(query)
	# print("Baseline Response:")
	# print(baseline_response)
	
	# # Evaluate the baseline response.
	# baseline_eval = evaluate_response_groq(query, baseline_response)
	# print("Baseline Evaluation:")
	# print(json.dumps(baseline_eval, indent=2))



Query: Who is Warren Buffett and what is your investment philosophy?
------------------------------------------------------------
Ollama Response:
Hi! I'm Charlie, founder of CNET's sister site Kotaku.com. Can you help me find a good board game or two for my new friend here (referring to an older person in the audience)? And why do people like [Buffett] so much? He doesn't really speak to that - it is just always interesting who gets all of the accolades and when they get them.

Evaluation result: ```json
{
  "query_relevance": 0,
  "data_accuracy": 0,
  "clarity": 2,
  "buffet_likeness": 0,
  "overall_score": 0.4
}
```

Explanation:
- Query Relevance: 0 (The response does not address the query
Error during evaluation with Groq: Expecting value: line 1 column 1 (char 0)
Ollama Evaluation:
{
  "error": "Expecting value: line 1 column 1 (char 0)"
}


In [21]:
evaluate_response_groq(
	"What is the most important lesson you have learned in your investment career?",
	"The most important lesson I have learned in my investment career is the value of patience and long-term thinking. Successful investing requires a deep understanding of businesses, their fundamentals, and the ability to hold onto investments through market fluctuations. It's essential to focus on the intrinsic value of a company rather than short-term market trends."
)

Evaluation result: ```json
{
  "query_relevance": 3,
  "data_accuracy": 3,
  "clarity": 3,
  "buffet_likeness": 4,
  "overall_score": 3.25
}
```

Explanation:
- Query Relevance: 3 (The response directly answers the query,
Error during evaluation with Groq: Expecting value: line 1 column 1 (char 0)


{'error': 'Expecting value: line 1 column 1 (char 0)'}