In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import json
import datetime
import shutil
from openai import OpenAI
import pandas as pd
import requests
from sklearn.metrics.pairwise import cosine_similarity


### Functions to create knowledge hub (You do not need to edit this)

In [9]:
# Read the OpenAI API key from the environment instead of hardcoding it.
# The lookup raises KeyError if the OPENAI_API_KEY variable is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [10]:
# Module-level embedding client; create_knowledge_hub passes it to Chroma
# when building the vector store.
embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [11]:
def create_knowledge_hub(path_to_10k):
    """Build a persisted Chroma vector store from a 10-K PDF.

    The PDF is loaded, split into chunks, embedded with the module-level
    ``embeddings`` client and written to a directory named after the
    current timestamp.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer

    Returns:
        vectordb: The vector database with the information from the 10-K
        db_directory: The path to the vector database
    """

    # Directory name is "db_" followed by the current time down to the second.
    store_path = f"db_{datetime.datetime.now():%Y%m%d%H%M%S}"

    pages = PyPDFLoader(path_to_10k).load()

    # Split into chunks of up to 1000 characters with a 5-character overlap,
    # trying the separators in the order given.
    chunker = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=1000,
        chunk_overlap=5,
        length_function=len,
    )
    chunks = chunker.split_documents(pages)

    store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=store_path,
    )
    store.persist()

    return store, store_path

In [12]:
def delete_chroma_db(db_directory):
    """Remove a locally created Chroma DB directory (best effort).

    Failures are reported with a printed message rather than raised.

    Args:
        db_directory: The path to the vector database
    """
    try:
        shutil.rmtree(db_directory)
    except FileNotFoundError:
        # Nothing to remove: report it and carry on.
        not_found_msg = f"Chroma database '{db_directory}' not found."
        print(not_found_msg)
    except Exception as err:
        # Any other failure is reported but not propagated.
        print(f"Error deleting Chroma database: {err}")

In [4]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_strings(text1, text2):
    """Score how similar two texts are using TF-IDF cosine similarity.

    Args:
        text1: First text (e.g. the reference answer). ``None`` is treated
            as an empty string; other non-string values are converted
            with ``str``.
        text2: Second text (e.g. the model's answer). Handled like ``text1``.

    Returns:
        float: Cosine similarity between the TF-IDF vectors of the two
        texts, or 0.0 when neither text produces any token.
    """
    # The model may return no content (None) and CSV cells may not be strings;
    # the vectorizer only accepts strings.
    docs = ["" if text is None else str(text) for text in (text1, text2)]

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform(docs)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization, e.g. both texts are empty or consist only of
        # single-character tokens. There is nothing to compare in that case.
        return 0.0

    # Calculate the cosine similarity between the vectors
    similarity = cosine_similarity(vectors)
    return float(similarity[0][1])

### Function to query the model. Replace this part with your own model.

In [5]:
def query_model(path_to_10k, question):
    """Ask the chat model a question based off a local 10-K document.

    A temporary Chroma knowledge hub is built from the document, the chunks
    most similar to the question are retrieved and sent to the model as
    context, and the knowledge hub is deleted again before returning.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question to ask the model

    Returns:
        answer: The answer given by the model
    """

    db, db_dir = create_knowledge_hub(path_to_10k)

    try:
        # Search once and reuse the result; fewer than two hits simply
        # yields a shorter context instead of an IndexError.
        hits = db.similarity_search(question, k = 2)
        context = "".join(hit.page_content for hit in hits)

        ## EDIT THIS PART
        client = OpenAI()

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."},
                {"role": "user", "content": f"{context} Now, this is our question: {question}"}
            ]
        )
        ## END OF EDITING
    finally:
        # Always remove the temporary vector store, even if the search or
        # the API call fails.
        delete_chroma_db(db_dir)

    answer = completion.choices[0].message.content #You might have to edit this

    return answer



## Evaluation of the model. Run this script

In [3]:
# Evaluation dataset; run_eval reads its 'question', 'answer' and 'doc_link' columns.
path_to_csv_dataset = "Datasets/financebench_sample_150.csv" ## Replace this with the path to your copy of the dataset

In [2]:
# Local path that download_document writes each downloaded PDF to.
# run_eval passes the same path to query_model and delete_document.
filename = "download.pdf"

def download_document(url, timeout=60):
    """Download a PDF based off a URL and save it to ``filename``.

    Args:
        url: URL to the document found online
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole evaluation run.

    Returns:
        filename: the filename on your computer

    Raises:
        requests.HTTPError: If the server answers with an error status
            (4xx/5xx); nothing is written to disk in that case.
        requests.RequestException: On connection problems or timeouts.
    """

    response = requests.get(url, timeout=timeout)
    # Fail early rather than saving an HTML error page as "download.pdf".
    response.raise_for_status()

    with open(filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return filename

def delete_document(file_path):
    """Remove the document at ``file_path`` and report the outcome.

    Args:
        file_path: The full file path of the document to be deleted.
    """
    # Guard clause: nothing to delete, just report it.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    os.remove(file_path)
    print(f"Deleted file: {file_path}")

In [19]:
def run_eval(path_to_csv_dataset):
    """Evaluate the model on a CSV dataset of questions about online documents.

    For every row the linked document is downloaded, the model is queried
    with the question, and its answer is compared to the reference answer
    with compare_strings. The downloaded file is removed after each row,
    even when querying or scoring fails.

    Args:
        path_to_csv_dataset: Path to a CSV file with the columns
            'question', 'answer' and 'doc_link'.

    Returns:
        The average similarity score over all rows.

    Raises:
        ValueError: If the dataset contains no rows, since an average
            cannot be computed.
    """
    df = pd.read_csv(path_to_csv_dataset)
    if df.empty:
        # Report the real problem instead of a ZeroDivisionError at the end.
        raise ValueError(f"No rows to evaluate in '{path_to_csv_dataset}'.")

    list_of_cosine_similarity = []

    for _, row in df.iterrows():
        question = row['question']
        answer = row['answer']
        doc_link = row['doc_link']

        #download the document from URL given
        local_path = download_document(doc_link)

        try:
            #query our model
            model_answer = query_model(local_path, question)

            #compare similarity
            sim = compare_strings(answer, model_answer)

            print("answers are", answer, model_answer)

            print("sim is", sim)
        finally:
            #delete the document downloaded, even if the row failed
            delete_document(local_path)

        #add the similarity to the list
        list_of_cosine_similarity.append(sim)

    #get the average of the similarities
    return sum(list_of_cosine_similarity) / len(list_of_cosine_similarity)





    

In [20]:
# Final score for your model: run_eval returns the average cosine similarity
# between the reference answers and the model's answers.
print(run_eval(path_to_csv_dataset))

answers are $1,577.00  The FY2018 capital expenditure amount for 3M was not provided in the given information.
sim is 0.0
Deleted file: download.pdf
answers are $8.70  Based on the information provided, the net PP&E (Property, Plant, and Equipment) for 3M at the year end of FY2018 was not explicitly stated in the text. Therefore, I cannot provide the specific value in USD billions.
sim is 0.0
Deleted file: download.pdf
0.0
