# DSC 253 Final Project

## Financial Agent RAG 

### INSTALLING LIBRARIES

In [None]:
# Unsloth is installed straight from its GitHub repository (Colab extras).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Training-related packages, installed with --no-deps so pip does not pull in
# (or replace) their own dependencies.
!pip install --no-deps xformers trl peft accelerate bitsandbytes
# Client library for the SEC filings API used in the data-extraction section.
!pip install sec_api
# Retrieval stack: LangChain, sentence-transformers embeddings, FAISS index.
# -U upgrades each package to the newest available release (versions are not pinned).
!pip install -U langchain
!pip install -U langchain-community
!pip install -U sentence-transformers
!pip install -U faiss-gpu

### IMPORTS & CREDENTIAL SETUP

- Hugging Face access token: create one at https://huggingface.co/settings/tokens
- SEC API key (free tier): sign up at https://sec-api.io/

In [None]:
import os

# Credentials are read from the environment so that secrets never have to be
# typed into (and saved with) the notebook. Export HF_TOKEN and SEC_API_KEY
# before starting the kernel; each falls back to an empty string when unset,
# which matches the previous placeholder values.
hf_token = os.environ.get("HF_TOKEN", "")
sec_api_key = os.environ.get("SEC_API_KEY", "")

import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from sec_api import ExtractorApi, QueryApi
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

### MODEL & TOKENIZER INITIALIZATION

In [None]:
# Load the Llama 3 8B Instruct checkpoint and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    max_seq_length=2048,  # the same limit is passed to SFTTrainer in the training cell
    dtype=None,  # presumably lets Unsloth choose the dtype itself — confirm in the Unsloth docs
    load_in_4bit=True,  # request 4-bit quantized weights
    token=hf_token,  # Hugging Face token from the credentials cell
)

### APPLY LoRA ADAPTERS FOR FINE-TUNING

In [None]:
# Replace `model` with a PEFT (LoRA) version of itself. Note this rebinds the
# name, so re-running the cell without reloading the base model applies the
# wrapper to an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    # Names of the projection modules that receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same seed value as TrainingArguments(seed=...) in the training cell
)

### DATA PREPARATION & PROMPT TEMPLATE

In [None]:
# Prompt template shared by fine-tuning and inference. The three "{}" slots are
# filled positionally with: question, retrieved context, response.
# inference() passes an empty string for the response slot, and
# extract_response() searches for the literal
# "### Response: <|start_header_id|>assistant<|end_header_id|>" marker, so any
# change to the template text must be mirrored there.
ft_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below is a user question, paired with retrieved context. Write a response that answers the question, with specific details. <|eot_id|>

<|start_header_id|>user<|end_header_id|>
### Question:
{}
### Context:
{}

<|eot_id|>

### Response: <|start_header_id|>assistant<|end_header_id|>
{}"""

# End-of-sequence string appended to every training example by
# formatting_prompts_func().
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of QA records into complete training prompts.

    Args:
        examples: Batched mapping holding parallel "question", "context" and
            "answer" sequences (the form passed by ``dataset.map`` with
            ``batched=True``).

    Returns:
        dict: ``{"text": [...]}`` with one prompt per record, built from
        ``ft_prompt`` and terminated with ``EOS_TOKEN``.
    """
    records = zip(examples["question"], examples["context"], examples["answer"])
    return {
        "text": [ft_prompt.format(*record) + EOS_TOKEN for record in records]
    }

# Fetch the training split of the financial QA dataset and add the rendered
# "text" column produced by formatting_prompts_func.
dataset = load_dataset("virattt/llama-3-8b-financialQA", split="train").map(
    formatting_prompts_func,
    batched=True,
)

### TRAINING SETUP

In [None]:
# Hyperparameters for the short fine-tuning run; mixed precision uses bf16
# when is_bfloat16_supported() reports it and fp16 otherwise.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    max_steps=60,
    warmup_steps=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
)

# Supervised fine-tuning over the "text" column built in the data-preparation cell.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=2048,
    packing=False,
)

# Uncomment to train:
# trainer_stats = trainer.train()
# Note: training may prompt for a Weights & Biases API key.
# Uncomment to save the trained model, replacing the path with your own directory:
# model.save_pretrained("/content/drive/MyDrive")
# tokenizer.save_pretrained("/content/drive/MyDrive")

### INFERENCE FUNCTIONS

In [None]:
def inference(question, context, max_new_tokens=64):
    """Generate a model completion for ``question`` given retrieved ``context``.

    Args:
        question: The user's question, inserted into the first template slot.
        context: Either a single string or an iterable of text chunks (the
            list returned by ``retrieve_context``). Chunks are joined with
            blank lines so the prompt contains plain text instead of a Python
            list repr. ``None`` is treated as empty context.
        max_new_tokens: Upper bound on generated tokens (defaults to 64, the
            previously hard-coded value).

    Returns:
        list[str]: The decoded sequences from ``tokenizer.batch_decode``; each
        contains the prompt followed by the generated text, which is the form
        ``extract_response`` expects.
    """
    if context is None:
        context = ""
    elif not isinstance(context, str):
        # str.format() would otherwise render a list as "['chunk', ...]",
        # with brackets, quotes and escaped newlines leaking into the prompt.
        context = "\n\n".join(str(chunk) for chunk in context)

    # The response slot stays empty so the model continues from the assistant header.
    prompt = ft_prompt.format(question, context, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(outputs)

def extract_response(text):
    """Pull the assistant's answer out of a decoded generation.

    Args:
        text: Sequence of decoded strings as returned by ``inference``; only
            the first element is inspected.

    Returns:
        str | None: The stripped text between the response marker and the next
        ``<|eot_id|>`` token, or ``None`` when ``text`` is empty or either
        delimiter is missing.
    """
    if not text:
        return None
    decoded = text[0]
    start_token = "### Response: <|start_header_id|>assistant<|end_header_id|>"
    end_token = "<|eot_id|>"

    # Check the raw find() result *before* adding the marker length; once the
    # length is added, a "not found" result of -1 can no longer be detected.
    marker_index = decoded.find(start_token)
    if marker_index == -1:
        return None
    start_index = marker_index + len(start_token)

    # Search only after the marker: the prompt itself contains earlier
    # <|eot_id|> tokens that must not terminate the answer.
    end_index = decoded.find(end_token, start_index)
    if end_index == -1:
        return None
    return decoded[start_index:end_index].strip()

### SEC DATA EXTRACTION FUNCTIONS

In [None]:
def get_filings(ticker):
    """Fetch sections 1A and 7 of a company's most recent 10-K filing.

    Args:
        ticker: Stock ticker symbol. Surrounding whitespace is removed and the
            symbol is upper-cased before it is placed in the query.

    Returns:
        str: The text of section "1A" followed by a blank line and the text of
        section "7", as returned by the extractor API.

    Raises:
        ValueError: If ``ticker`` is blank, or the query returns no filings.
    """
    symbol = (ticker or "").strip().upper()
    if not symbol:
        raise ValueError("ticker must be a non-empty symbol, e.g. 'AAPL'")

    queryApi = QueryApi(api_key=sec_api_key)
    # Newest filing first, and only one result is requested.
    query = {
        "query": f'ticker:{symbol} AND formType:"10-K"',
        "from": "0",
        "size": "1",
        "sort": [{"filedAt": {"order": "desc"}}],
    }
    filings = queryApi.get_filings(query)

    matches = filings.get("filings") or []
    if not matches:
        # Previously surfaced as a bare IndexError/KeyError with no context.
        raise ValueError(f"No 10-K filings found for ticker {symbol!r}")
    filing_url = matches[0]["linkToFilingDetails"]

    extractorApi = ExtractorApi(api_key=sec_api_key)
    onea_text = extractorApi.get_section(filing_url, "1A", "text")
    seven_text = extractorApi.get_section(filing_url, "7", "text")
    return onea_text + "\n\n" + seven_text

### EMBEDDINGS & VECTOR DATABASE SETUP

In [None]:
# Embedding model used to vectorize the filing chunks; it is placed on the GPU
# and asked to normalize the embeddings it produces.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device':'cuda'},
    encode_kwargs={'normalize_embeddings': True}
)

# Interactive step: this cell blocks until a ticker is typed, then downloads
# the filing text through get_filings() (network call to the SEC API).
ticker = input("Enter ticker symbol (e.g. AAPL): ")
filing_data = get_filings(ticker)

# Split the filing into chunks measured in characters (length_function=len):
# chunk_size=1000 with chunk_overlap=500.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=500,
    length_function=len,
    is_separator_regex=False
)

# Build the FAISS index from the chunks and expose it as a retriever with the
# library's default search settings (no arguments are passed to as_retriever).
split_data = text_splitter.create_documents([filing_data])
db = FAISS.from_documents(split_data, embeddings)
retriever = db.as_retriever()

### RETRIEVAL FUNCTION

In [None]:
def retrieve_context(query):
    """Return the text of the stored filing chunks retrieved for ``query``.

    Args:
        query: Question text handed to the module-level ``retriever``.

    Returns:
        list[str]: ``page_content`` of each retrieved document, in the order
        the retriever returned them.
    """
    passages = []
    for document in retriever.invoke(query):
        passages.append(document.page_content)
    return passages

### INTERACTIVE QUERY LOOP

In [None]:
# Question/answer loop: retrieve context for each question, generate a reply,
# and print the parsed answer. Runs until the user enters 'x'.
while True:
    # Strip surrounding whitespace so inputs such as " x " still exit.
    user_question = input(f"Ask about {ticker}'s 10-K (type 'x' to exit): ").strip()
    if user_question.lower() == "x":
        break
    if not user_question:
        # Blank input: skip retrieval and generation and prompt again.
        continue
    context = retrieve_context(user_question)
    resp = inference(user_question, context)
    # extract_response() yields None when no complete answer could be parsed.
    parsed_resp = extract_response(resp)
    print(f"Answer: {parsed_resp}\n---")

### References

1. https://colab.research.google.com/drive/1eisDW1zTuHgHzoPS8o8AKfPThQ3rWQ2P#scrollTo=cK8V6P9uYFJz 
2. https://docs.unsloth.ai/get-started/unsloth-notebooks