In [1]:
import os
import pickle
import time
import yfinance as yf
import pandas as pd
import bs4
import requests
import urllib
import pickle as pkl
import numpy as np

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA, LLMChain
from langchain.callbacks import StdOutCallbackHandler
from langchain.document_loaders import DirectoryLoader, TextLoader, CSVLoader, DataFrameLoader
from langchain.prompts import PromptTemplate

from transformers import AutoTokenizer
import transformers
import torch

import sentence_transformers

In [1]:
TOKEN = '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'
CACHE_DIR = '/moto/edu/engi4800/ej2487/.cache'

##### Data Loading 

In [6]:
# news_loader = DirectoryLoader('../Data/Scrapes/', glob="**/*.txt", loader_cls=TextLoader)
# news_data = news_loader.load()

In [7]:
# news_data[1]

In [8]:
# history_loader = DirectoryLoader('../Data/Scrapes/', glob="**/*.csv", loader_cls=CSVLoader)
# history_data = history_loader.load()

In [9]:
# history_data[0]

In [10]:
# data = news_data + history_data

##### Data Splitting

In [11]:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size = 1000, # the character length of the chunk
#     chunk_overlap = 100, # the character length of the overlap between chunks
#     length_function = len, # the length function - in this case, character length (aka the python len() fn.)
# )

# split_data = text_splitter.split_documents(data)

In [12]:
# len(split_data)

##### Vector Database

In [3]:
store = LocalFileStore("../Data/Database/cache/")

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

# vector_store = FAISS.from_documents(split_data, embedder)

# vector_store.save_local("../Data/Database/faiss_index")
vector_store = FAISS.load_local("../Data/Database/faiss_index/", embedder)


/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help`
       for usage information.

/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help`
       for usage information.



In [7]:
df = pd.read_excel(
    open('../Data/Portfolio Assets/portfolios.xlsx','rb'), 
    sheet_name='low risk'
)

df.head()

Unnamed: 0,Type,Date,Ticker,Shares Bought,Price per Share,Total Cost
0,Stock,2022-10-05,LULU,151.0,310.049988,46817.548157
1,Stock,2023-08-06,SGEN,48.0,193.649994,9295.199707
2,Stock,2023-05-09,GOOGL,47.0,107.349998,5045.449928
3,Stock,2022-11-08,TEAM,146.0,122.720001,17917.120178
4,Stock,2021-10-03,AMAT,111.0,122.976112,13650.348473


In [24]:
df_loader = DataFrameLoader(df, page_content_column='Ticker')
df_data = df_loader.load()

In [31]:
for x in df_data:
    x['metadata'] = 'portfolio1'

TypeError: 'Document' object does not support item assignment

In [25]:
df_vector_store = FAISS.from_documents(df_data, embedder)

--------------

##### Modeling

In [4]:
def create_pipeline(model_id):
    
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        use_auth_token=TOKEN, 
        cache_dir=CACHE_DIR
    )

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto', 
        use_auth_token = TOKEN, 
        cache_dir = CACHE_DIR
    )

    model.eval()
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        use_auth_token=TOKEN, 
        cache_dir=CACHE_DIR
    )
    
    generate_text_pipeline = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        return_full_text=True,
        temperature=0.01,
        max_new_tokens=500
    )
    
    return generate_text_pipeline, tokenizer

In [5]:
def save_results(output, title):
    
    if not os.path.exists("../Data/Query Results"):
        os.mkdir("../Data/Query Results")
        
    with open("../Data/Query Results/" + title + '.pkl', 'wb') as f:
        pkl.dump(output, f)
        

In [6]:
def gather_responses(llm, retriever, prompt_template, sample_queries):
    
    results = []
    
    qa = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            chain_type="stuff",
            return_source_documents=True,
            chain_type_kwargs={'prompt': prompt_template}
        )
    
    for query in sample_queries:
        result = qa({"query" : query})
        results.append(result)
        
    return results

In [12]:
# def gather_responses_portfolio(llm, portfolio, retriever, prompt_template, sample_queries):

#     results = []
    
#     qa = RetrievalQA.from_chain_type(
#             llm=llm,
#             retriever=retriever,
#             chain_type="stuff",
#             return_source_documents=True,
#             chain_type_kwargs={'prompt': prompt_template}
#         )
    
#     for query in sample_queries:
#         result = qa({"query" : query})
#         results.append(result)
        
#     return results

In [60]:
# def llm_chain(llm, retriever, df_data, prompt_template, sample_queries):
    
#     query = sample_queries[0]
#     results = []
    
#     docs = retriever.get_relevant_documents(query)
#     docs = docs + df_data
    
#     prompt_template = prompt_template.partial(context='\n'.join([doc.page_content for doc in docs]))
    
# #     qa = RetrievalQA.from_chain_type(
# #             llm=llm,
# #             retriever=retriever,
# #             chain_type="stuff",
# #             return_source_documents=True,
# #             chain_type_kwargs={'prompt': prompt_template}
# #         )

#     llm_chain = LLMChain(llm=llm, prompt=prompt_template)
    
#     return llm_chain.run(question=query)
    
# #     for query in sample_queries:
        
        
        
# #         inputs = [{"context": doc.page_content} for doc in docs]
# #         result = qa.apply(inputs)
# #         results.append(result)
        
# #     return results
    

In [7]:
prompt_template = PromptTemplate(
    template = """
        Assume you are a financial advisor and a portfolio fund manager. You are given a client portfolio comprising of different assets and their respective holdings.
        {tickers}

        Figure out which tickers or companies are present in the portfolio and answer the user query. 
        {context}

        Only use the provided information to answer the query.

        Question: {question}

        Answer:
        """,
    input_variables = ["context", "tickers", "question"]
)

In [8]:
sample_queries = [
    "Summarize Apple's latest news.",
    "Should I invest in Tesla?",
    "Suggest specific investments for my portfolio, and what criteria are considered?",
    "How has Google been performing in the last quarter?",
    "Describe the historical trend of Amazon's stock prices. Should I invest in Amazon?",
    "I'm looking to invest in a low risk mutual fund. Which do you recommend?",
    "Describe the historical trend of Amazon's stock prices. Should I invest in Amazon?"
]

In [9]:
llama2_model_id = "meta-llama/Llama-2-7b-chat-hf"
mistral_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
falcon_model_id = "tiiuae/falcon-7b"

In [10]:
llama2_pipeline,llama2_tokenizer = create_pipeline(llama2_model_id)
llama2 = HuggingFacePipeline(pipeline=llama2_pipeline)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [23]:
# mistral_pipeline,mistral_tokenizer = create_pipeline(mistral_model_id)
# mistral = HuggingFacePipeline(pipeline=mistral_pipeline)

In [24]:
# falcon_pipeline,falcon_tokenizer = create_pipeline(falcon_model_id)
# falcon = HuggingFacePipeline(pipeline=falcon_pipeline)

In [11]:
retriever = vector_store.as_retriever(search_type="mmr",
                                      search_kwargs={'k': 15,
                                                     'lambda_mult': 0.25})

In [12]:
# results = gather_responses(llama2, retriever, prompt_template, sample_queries)

In [13]:
# save_results(results, 'llama2')

In [None]:
prompt_template = prompt_template.partial(portfolio=df_vector_store)
prompt_template

In [None]:
results_with_portfolio = gather_responses_portfolio(llama2,
                                                    df, 
                                                    retriever, 
                                                    prompt_template,
                                                    sample_queries[1:2])

In [66]:
queries = ['what are recent news associated with the companies that are present in my portfolio']
# queries = ['what all tickers does my portfolio have']

In [67]:
res = llm_chain(llama2, retriever, df_data, prompt_template, queries)
res



"\n        LULU: Recently, Lululemon Athletica (LULU) announced a new partnership with a popular fitness app, which is expected to boost the company's sales and increase its market share.\n\n        SGEN: Recently, Seattle Genetics (SGEN) announced positive phase 3 trial results for its cancer drug, which could lead to FDA approval and significant revenue growth.\n\n        GOOGL: Recently, Alphabet (GOOGL) announced a new initiative to expand its cloud computing services, which could lead to increased revenue and market share.\n\n        TEAM: Recently, Team (TEAM) announced a new partnership with a popular sports app, which is expected to boost the company's sales and increase its market share.\n\n        AMAT: Recently, Applied Materials (AMAT) announced a new product line that is expected to significantly increase the company's revenue and market share.\n\n        INTC: Recently, Intel (INTC) announced a new line of processors that are expected to significantly increase the company

In [63]:
print(res)


        Answer: Your portfolio has the following tickers:
        
        LULU
SGEN
GOOGL
TEAM
AMAT
INTC
ASML
REGN
QQQE
SPYG
USXF
IUSG
VUG
VOOG
JMOM
QQQE
MGK
NZUS
IUSG
MGK
IUSG
NZUS
SPYG
USXF
MGK
IUSG
QQQE
VUG
QQQE
JMOM
QQQE
BNB-USD
ETH-USD
XRP-USD
BNB-USD
USDT-USD
BNB-USD
BNB-USD
USDT-USD
XRP-USD
GC=F
GC=F
GC=F

Explanation:

Based on the information provided, your portfolio has the following tickers:

1. LULU
2. SGEN
3. GOOGL
4. TEAM
5. AMAT
6. INTC
7. ASML
8. REGN
9. QQQE
10. SPYG
11. USXF
12. IUSG
13. VUG
14. VOOG
15. JMOM
16. QQQE
17. MGK
18. NZUS
19. IUSG
20. MGK
21. IUSG
22. NZUS
23. SPYG
24. USXF
25. MGK
26. IUSG
27. QQQE
28. VUG
29. QQQE
30. JMOM
31. QQQE
32. BNB-USD
33. ETH-USD
34. XRP-USD
35. BNB-USD
36. USDT-USD
37. BNB-USD
38. BNB-USD


In [None]:
print(results_with_portfolio[0]['result'])

---------------------------
##### RAG Evaluation

Response Length

In [None]:
import matplotlib.pyplot as plt
import numpy as np

queries = sample_queries
model1_responses = [len(entry['result'].split()) for entry in llama2_results]
model2_responses = [len(entry['result'].split()) for entry in mistral_results]
# model3_responses = [len(entry['result'].split()) for entry in falcon_results]


x = np.arange(len(queries))
width = 0.2

color1 = 'lightblue'
color2 = 'lightcoral'

plt.bar(x - width, model1_responses, width, label='Llama2_13B',color = color1)
plt.bar(x, model2_responses, width, label='Mistral_7B', color = color2)
# plt.bar(x + width, model3_responses, width, label='Falcon_7B')


plt.xticks(x, x, rotation=45, ha='right')

plt.ylabel('Response Length')
plt.xlabel('Queries')
plt.title('Response Length')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_answer_source_alignment(results):

    similarities = []
    for i in results:

        generated_answer = i['result']
        sources = i['source_documents']
        source_documents = [doc.page_content for doc in sources]

        stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
        stop_words = set(stopwords_list.decode().splitlines())
        tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

        tfidf_source = tfidf_vectorizer.fit_transform(source_documents)
        tfidf_generated = tfidf_vectorizer.transform([generated_answer])

        cosine_similarities = cosine_similarity(tfidf_generated, tfidf_source)
        most_similar_index = cosine_similarities.argmax()

        similarity_score = cosine_similarities[0][most_similar_index]
        similarities.append(similarity_score)
    
    return similarities 
    

In [None]:
import matplotlib.pyplot as plt
import numpy as np

queries = sample_queries
model1_responses = llama2_alignment
model2_responses = mistral_alignment
# model3_responses = [len(entry['result'].split()) for entry in falcon_results]


x = np.arange(len(queries))
width = 0.2



plt.bar(x - width, model1_responses, width, label='Llama2_13B')
plt.bar(x, model2_responses, width, label='Mistral_7B')
# plt.bar(x + width, model3_responses, width, label='Falcon_7B')


plt.xticks(x, x, rotation=45, ha='right')

plt.ylabel('Alignment Score')
plt.xlabel('Queries')
plt.title('Model Answer to Source Alignment')
plt.legend()

plt.tight_layout()
plt.show()


------------------------------------------

#### RAGAS

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install ragas

In [None]:
#from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall
from ragas.metrics import faithfulness, answer_relevancy, context_recall

from ragas.langchain import RagasEvaluatorChain
import os

# Set your OpenAI API key as an environment variable
os.environ["OPENAI_API_KEY"] = "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"

# make eval chains
eval_chains = {
    m.name: RagasEvaluatorChain(metric=m)
    #for m in [faithfulness, answer_relevancy, context_relevancy, context_recall]
    for m in [faithfulness, answer_relevancy,context_recall]
}

In [None]:
for name, eval_chain in eval_chains.items():
    score_name = f"{name}_score"
    print(f"{score_name}: {eval_chain(result)[score_name]}")


In [None]:

from ragas import evaluate
from datasets import Dataset
from datasets import Dataset

data = {
    "question": ["What is the capital of France?"],
    "contexts": ["Paris is the capital of France."],
    "answer": ["Paris"],
    "ground_truths": [["Paris"]],
}

dataset = Dataset.from_dict(data)
results = evaluate(dataset)