In [1]:
# !pip install accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes

# !pip install llama-index-embeddings-huggingface

# !pip install llama-index

# !pip install llama-index-llms-huggingface

In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [2]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate




In [3]:
LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat-hf"

In [4]:
selected_model = LLAMA2_7B_CHAT

In [None]:
SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
- Generate human readable output, avoid creating output with gibberish text.
- Generate only the requested output, don't include any other language before or after the requested output.
- Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
- Generate professional language typically used in business documents in North America.
- Never generate offensive or foul language.
"""

query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="auto",
    # change these settings below depending on your GPU
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

In [None]:
import time

In [None]:
start = time.time()

print(llm.complete("In Python, make a function that prints all even numbers between 0 and 10 inclusive.").text)

print(f"Time elapsed: {time.time() - start}")

In [None]:
set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

In [None]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
document = SimpleDirectoryReader(input_files=["essay-data/" + 'essay.txt']).load_data()
index = VectorStoreIndex.from_documents(document, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm)

In [12]:
start = time.time()
r = query_engine.query("What did the author do growing up?")
# r.print_response_stream()
print(r.response)
print()
print(time.time() - start)

Based on the given context, the author grew up writing short stories and programming on the IBM 1401 computer in 9th grade. Later, he got his first microcomputer, a TRS-80, in about 1980, which he used to write simple games, a program to predict how high model rockets would fly, and a word processor that his father used to write at least one book. The author also studied philosophy in college but found it boring, so he switched to AI and spent 4 years writing a new Lisp, called Bel, in itself in Arc.

19.522247791290283


In [13]:
import requests
import json
from bs4 import BeautifulSoup
import os

In [14]:
API_TOKEN = "rxfMXxfsdsKGb8urL8shecBbBp37ZBuvU5zo39sm"
API_ENDPOINT = f"https://api.marketaux.com/v1/news/all"

payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
    "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T15:00",
    "domains": "finance.yahoo.com"
}

In [15]:
payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
#     "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T13:00",
    "published_before": "2024-05-10T14:00",
    "domains": "finance.yahoo.com"
}

In [16]:
def get_urls(payload):
    r = requests.get("https://api.marketaux.com/v1/news/all", params=payload)
    output = r.text
    json_output = json.loads(output)
    data = json_output['data']
    
    urls = []
    
    for i in range(len(data)):
        url = data[i]['url']
        if url.startswith("https://finance.yahoo.com/news"):
            urls.append(url)
        
    return urls

In [17]:
def get_article_content(url):
    r = requests.get(url)
    
    if r.status_code != 200:
        return 'Invalid Response'
    
    soup = BeautifulSoup(r.content)
    body = soup.find('div', attrs={'class': 'caas-body'})
    content = ""
    
    for text in body.find_all('p'):
        content += text.text
    
    return content

In [18]:
def write_to_file(filename, content):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [20]:
def create_documents():
    payload = {
        "api_token": API_TOKEN,
        "entity_types": "equity,index,etf", # should include industries parameter
    #     "industries": "Technology",
        "countries": "us",
#         "published_after": "2024-05-10T13:00",
#         "published_before": "2024-05-10T14:00",
        "domains": "finance.yahoo.com"
    }
    
    for add_day in range(5):
        for add_hour in range(7):
            start = "2024-05-" + str(15 + add_day) + "T" + str(13 + add_hour) + ":00"
            finish = "2024-05-" + str(15 + add_day) + "T" + str(13 + add_hour + 1) + ":00"
            payload["published_after"] = start
            payload["published_before"] = finish
            
            urls = get_urls(payload)
            for i in range(len(urls)):
                print(urls[i])
                content = get_article_content(urls[i])
                if content != "Invalid Response": 
                    filename = os.path.join("articles", "article" + str(add_day) + str(add_hour) + str(i) + "_.txt")
                    write_to_file(filename, content)
                else:
                    print("Above URL Skipped")

In [19]:
# create_documents()

In [21]:
risk_options = ["very_risk_averse", "risk_averse", "risk_tolerant", "very_risk_tolerant"]
industry_options = ["Energy", "Materials", "Industrials", "Utilities", "Healthcare", "Financials", "Consumer Discretionary",
                    "Consumer Staples", "Information Technology", "Communication Services", "Real Estate"]
return_goals = []
user_prefs = {"risk": None, "industry": None, "return_goals": None}

In [22]:
yf_documents = SimpleDirectoryReader(input_dir="articles").load_data()
yf_index = VectorStoreIndex.from_documents(yf_documents, embed_model=embed_model)
yf_query_engine = yf_index.as_query_engine(llm=llm, streaming=True)

In [21]:
# stocks = yf_query_engine.query("Based on the ingested articles, return the best stocks to invest in and give reasons why.")
# stocks.print_response_stream()

In [22]:
# response_2 = yf_query_engine.query("Based on the ingested articles, return the 5 best stocks to invest in and give reasons why.")
# response_2.print_response_stream()

In [23]:
# response_3 = yf_query_engine.query("Based on the ingested articles, return a set of stocks that would best diversify a portfolio with the best returns.")
# response_3.print_response_stream()

In [None]:
# only tech stocks except for one

In [23]:
new_prompt = "Return the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities returned can be stocks or ETFs. Ensure that the securities returned are of different industries " + \
"that can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_3 = yf_query_engine.query(new_prompt)
response_3.print_response_stream()

Based on the provided context information, the following financial securities are most mentioned in a positive sentiment across the articles:

1. Stock: NVIDIA (NVDA) - Industry: Technology
2. Stock: Advanced Micro Devices (AMD) - Industry: Technology
3. Stock: Intel (INTC) - Industry: Technology
4. ETF: SPDR S&P 500 ETF Trust (SPY) - Industry: Finance
5. Stock: Micron Technology (MU) - Industry: Technology
6. Stock: Qualcomm (QCOM) - Industry: Technology
7. ETF: iShares Semiconductor ETF (SOXX) - Industry: Technology
8. Stock: Taiwan Semiconductor Manufacturing (TSM) - Industry: Technology
9. Stock: Broadcom (AVGO) - Industry: Technology
10. Stock: Intel (INTC) - Industry: Technology

These securities are from different industries and can be put together to make a diversified portfolio. A majority of the industries are not the same, ensuring that the portfolio is well-diversified.

In [None]:
# this prompt only gave tech stocks excpet for one

In [24]:
new_prompt_2 = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 10 securities in which no more than half of the " + \
"securities are from the same industry. Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_4 = yf_query_engine.query(new_prompt_2)
response_4.print_response_stream()

Based on the provided articles, the following are the financial securities that are most mentioned in a positive sentiment:

1. NVIDIA (NVDA) - Technology
2. Advanced Micro Devices (AMD) - Technology
3. Intel (INTC) - Technology
4. Micron Technology (MU) - Technology
5. Qualcomm (QCOM) - Technology
6. Broadcom (AVGO) - Technology
7. Texas Instruments (TXN) - Technology
8. Vanguard S&P 500 ETF (VOO) - Financials
9. iShares Trust - iShares Semiconductor ETF (SOXX) - Technology
10. Micron Technology (MU) - Technology

Note: The industries listed are based on the classification of the companies in the articles and may not align with their actual industries.

This list includes a mix of technology stocks and ETFs, which are mentioned positively in the articles. No more than half of the securities are from the technology industry, providing a diversified portfolio.

In [None]:
# below prompt seems to provide best portfolio so far

In [25]:
new_prompt_risk_averse = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 12 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk averse investor who " + \
"is more focused on stability instead of rapid growth. " + \
"Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_risk = yf_query_engine.query(new_prompt_risk_averse)
response_risk.print_response_stream()

Based on the provided articles, the following are the financial securities that are most mentioned in a positive sentiment:

1. SPDR S&P 500 ETF Trust (SPY) - Financials
2. Vanguard S&P 500 ETF (VOO) - Financials
3. NextEra Energy, Inc. (NEE) - Energy
4. The Procter & Gamble Company (PG) - Consumer Goods
5. Johnson & Johnson (JNJ) - Healthcare
6. McDonald's Corporation (MCD) - Consumer Goods
7. Cisco Systems, Inc. (CSCO) - Technology
8. Intel Corporation (INTC) - Technology
9. Visa, Inc. (V) - Financials
10. Mastercard Incorporated (MA) - Financials
11. The Home Depot, Inc. (HD) - Consumer Goods
12. 3M Company (MMM) - Industrials

These securities are tailored to a very risk-averse investor who is more focused on stability instead of rapid growth. They are diversified across different industries, with no more than half of the securities coming from the same industry. This list can be used to create a diversified portfolio that provides a balance of stability and growth.

In [32]:
new_prompt_risk_tolerant = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 12 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk tolerant investor who " + \
"is more focused on rapid growth instead of stability. " + \
"Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_no_risk = yf_query_engine.query(new_prompt_risk_tolerant)
response_no_risk.print_response_stream()

Based on the provided articles, the following are the financial securities that are most mentioned in a positive sentiment:

1. SPDR S&P 500 ETF Trust (SPY) - Financials
2. Vanguard S&P 500 ETF (VOO) - Financials
3. NVIDIA Corporation (NVDA) - Technology
4. NextEra Energy, Inc. (NEE) - Energy
5. The Procter & Gamble Company (PG) - Consumer Goods
6. The Home Depot, Inc. (HD) - Consumer Goods
7. Johnson & Johnson (JNJ) - Healthcare
8. McDonald's Corporation (MCD) - Consumer Goods
9. Visa, Inc. (V) - Financials
10. Mastercard Incorporated (MA) - Financials
11. Amazon.com, Inc. (AMZN) - Technology
12. Shopify Inc. (SHOP) - Technology

Note that these securities are not necessarily the most stable or risk-free, but rather have been identified as having the most positive sentiment in the provided articles. It is important to conduct thorough research and consider other factors before making investment decisions. Additionally, it is important to diversify a portfolio to manage risk, and it is

In [56]:
prompts_dict = yf_query_engine.get_prompts()
print(list(prompts_dict.keys()))

['response_synthesizer:text_qa_template', 'response_synthesizer:refine_template']


In [57]:
prompts_dict['response_synthesizer:text_qa_template']

SelectorPromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x7f94df2645e0>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content="You are an exp

In [63]:
import re

pattern = r'\(([^)]+)\)'
matches = re.findall(pattern, response_risk.response_txt)

In [65]:
for match in matches:
    print(match) # regex needs to be alphabetical and no commas

SPY
VOO
NEE
PG
JNJ
MCD
KO
V
MA
INTC
MMM
UNH
SPY, VOO, INTC
NEE
MCD, JNJ, KO, PG
INTC
MMM, UNH
3
1
4
1
2


In [None]:
from llama_index.core import PromptTemplate

text_qa_template_str = (
    "Context information is"
    " below.\n---------------------\n{context_str}\n---------------------\nUsing"
    " both the context information and also using your own knowledge, answer"
    " the question: {query_str}\na.\n"
)
text_qa_template = PromptTemplate(text_qa_template_str)

refine_template_str = (
    "The original question is as follows: {query_str}\nWe have provided an"
    " existing answer: {existing_answer}\nWe have the opportunity to refine"
    " the existing answer (only if needed) with some more context"
    " below.\n------------\n{context_msg}\n------------\nUsing both the new"
    " context and your own knowledge, update or repeat the existing answer.\n"
)
refine_template = PromptTemplate(refine_template_str)

In [47]:
text_qa_template_str = (
    "Context information is"
    " below.\n---------------------\n{context_str}\n---------------------\nUsing"
    " both the context information and also using your own knowledge, answer"
    " the request: {query_str}\nIf the context isn't helpful, you can also"
    " answer the request on your own.\nWhen answering questions about portfolio recommendations,"
    " respond in the following format:\nSTOCK NAME (TICKER) - INDUSTRY\n and"
    " follow that format for each stock. The words in all capitals should be replaced with"
    " the actual stock name, ticker, and industry."
)
text_qa_template = PromptTemplate(text_qa_template_str)

In [48]:
# should test adding system prompt and incorporating user preferences like risk aversion and industry preferences

In [49]:
yf_query_engine_prompt = yf_index.as_query_engine(llm=llm, streaming=True, text_qa_template=text_qa_template)

In [None]:
response_risk_prompt = yf_query_engine.query(new_prompt_risk_averse)
response_risk_prompt.print_response_stream()

In [40]:
from prophet_model_forecast import gen_stock_forecast_original

ModuleNotFoundError: No module named 'prophet_model_forecast'

In [41]:
from dual_dcf_model import discounted_cash_flow

In [39]:
from dual_dcf_model import portfolio_dist

In [36]:
stock_recs, stock_weights = portfolio_dist(['X', 'AAPL', 'META', 'MA', 'BA'], 10000, "moderate")

X Unavailable Cash Flow: AccountsPayableCurrent
X Strong Sell 0
AAPL Strong Buy 5
META Strong Buy 5
MA Sell 0
BA Strong Buy 3


In [38]:
stock_recs, stock_weights

({'X': 'Strong Sell',
  'AAPL': 'Strong Buy',
  'META': 'Strong Buy',
  'MA': 'Sell',
  'BA': 'Strong Buy'},
 {'X': 0.0,
  'AAPL': 3846.153846153846,
  'META': 3846.153846153846,
  'MA': 0.0,
  'BA': 2307.6923076923076})