In [1]:
# !pip install accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes

# !pip install llama-index-embeddings-huggingface

# !pip install llama-index

# !pip install llama-index-llms-huggingface

In [35]:
import time
import re

In [2]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [3]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate




In [133]:
def get_llm(selected_model):
    SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
    - Generate human readable output, avoid creating output with gibberish text.
    - Generate only the requested output, don't include any other language before or after the requested output.
    - Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
    - Generate professional language typically used in business documents in North America.
    - Never generate offensive or foul language.
    """

    query_wrapper_prompt = PromptTemplate(
        "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
    )

    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=2048,
        generate_kwargs={"temperature": 0.1, "do_sample": True},
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name=selected_model,
        model_name=selected_model,
        device_map="auto",
        # change these settings below depending on your GPU
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )
        
    return llm

In [13]:
import requests
import json
from bs4 import BeautifulSoup
import os

In [14]:
API_TOKEN = "rxfMXxfsdsKGb8urL8shecBbBp37ZBuvU5zo39sm"
API_ENDPOINT = f"https://api.marketaux.com/v1/news/all"

payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
    "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T15:00",
    "domains": "finance.yahoo.com"
}

In [15]:
payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
#     "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T13:00",
    "published_before": "2024-05-10T14:00",
    "domains": "finance.yahoo.com"
}

In [16]:
def get_urls(payload):
    r = requests.get("https://api.marketaux.com/v1/news/all", params=payload)
    output = r.text
    json_output = json.loads(output)
    data = json_output['data']
    
    urls = []
    
    for i in range(len(data)):
        url = data[i]['url']
        if url.startswith("https://finance.yahoo.com/news"):
            urls.append(url)
        
    return urls

In [17]:
def get_article_content(url):
    r = requests.get(url)
    
    if r.status_code != 200:
        return 'Invalid Response'
    
    soup = BeautifulSoup(r.content)
    body = soup.find('div', attrs={'class': 'caas-body'})
    content = ""
    
    for text in body.find_all('p'):
        content += text.text
    
    return content

In [18]:
def write_to_file(filename, content):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [19]:
def create_documents():
    payload = {
        "api_token": API_TOKEN,
        "entity_types": "equity,index,etf", # should include industries parameter
    #     "industries": "Technology",
        "countries": "us",
#         "published_after": "2024-05-10T13:00",
#         "published_before": "2024-05-10T14:00",
        "domains": "finance.yahoo.com"
    }
    
    for add_day in range(3): # edit this number to decide how many days after the start day to get articles for
        for add_hour in range(7): 
            start = "2024-05-" + str(20 + add_day) + "T" + str(13 + add_hour) + ":00" # edit str(xx) for start day
            finish = "2024-05-" + str(20 + add_day) + "T" + str(13 + add_hour + 1) + ":00"
            payload["published_after"] = start
            payload["published_before"] = finish
            
            urls = get_urls(payload)
            for i in range(len(urls)):
                print(urls[i])
                content = get_article_content(urls[i])
                if content != "Invalid Response": 
                    filename = os.path.join("articles", "article" + str(add_day) + str(add_hour) + str(i) + "__.txt")
                    write_to_file(filename, content)
                else:
                    print("Above URL Skipped")

In [20]:
create_documents()

https://finance.yahoo.com/news/computer-technology-stocks-lagging-commvault-134012588.html?.tsrc=rss
https://finance.yahoo.com/news/chinese-ev-maker-li-auto-132326529.html
https://finance.yahoo.com/news/history-says-nvidia-stock-split-143051089.html
https://finance.yahoo.com/news/jim-cramer-destroys-starbucks-ceo-143019932.html
https://finance.yahoo.com/news/buy-snowflake-snow-stock-ahead-155900617.html?.tsrc=rss
https://finance.yahoo.com/news/googles-1-1b-boost-finnish-155809461.html?.tsrc=rss
https://finance.yahoo.com/news/billionaire-bill-ackman-10-8-155759546.html
https://finance.yahoo.com/news/microsoft-corporation-nasdaq-msft-best-164343928.html?.tsrc=rss
https://finance.yahoo.com/news/blackrock-whistleblower-alleges-cover-search-164044668.html
https://finance.yahoo.com/news/jeff-bezos-quietly-enters-residential-163142437.html
https://finance.yahoo.com/news/tesla-layoffs-continue-june-morale-175645606.html?.tsrc=rss
https://finance.yahoo.com/news/warren-buffett-says-rains-gold-17

In [128]:
def get_query_engine(llm):
    set_global_tokenizer(
        AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
    )
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    
    yf_documents = SimpleDirectoryReader(input_dir="articles").load_data()
    yf_index = VectorStoreIndex.from_documents(yf_documents, embed_model=embed_model)
    yf_query_engine = yf_index.as_query_engine(llm=llm)
    
    return yf_documents, yf_index, yf_query_engine

In [132]:
def get_stocks(llm, user_prefs):
    docs, index, query_engine = get_query_engine(llm)
    
    risk_averse_prompt = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
    f"The securities gathered can be stocks or ETFs. Return a list of {user_prefs['num_stocks']} securities in which no more than half of the " + \
    "securities are from the same industry. This list of securities should also be tailored to a very risk averse investor who " + \
    "is more focused on stability instead of rapid growth. "
    if user_prefs['industries']:
        risk_averse_prompt += f"Include multiple stocks with a strong focus on the following industry or industries: {user_prefs['industries']}. "
    risk_averse_prompt += "Create this list so that they can be put together to make a diversified portfolio. " + \
    "List the stock name, stock ticker in parentheses, and the industry associated with it" + \
    "and double check that a majority of the industries aren't the same."
    
    risk_neutral_prompt = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
    f"The securities gathered can be stocks or ETFs. Return a list of {user_prefs['num_stocks']} securities in which no more than half of the " + \
    "securities are from the same industry. "
    if user_prefs['industries']:
        risk_averse_prompt += f"Include multiple stocks with a strong focus on the following industry or industries: {user_prefs['industries']}. "
    risk_averse_prompt += "Create this list so that they can be put together to make a diversified portfolio. " + \
    "List the stock name, stock ticker in parentheses, and the industry associated with it" + \
    "and double check that a majority of the industries aren't the same."
    
    risk_tolerant_prompt = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
    f"The securities gathered can be stocks or ETFs. Return a list of {user_prefs['num_stocks']} securities in which no more than half of the " + \
    "securities are from the same industry. This list of securities should also be tailored to a very risk tolerant investor who " + \
    "is more focused on rapid growth instead of stability. This list should include growth stocks with high expected returns " + \
    "with the potential for greater risk. Create this list so that they can be put together to make a diversified portfolio. "
    if user_prefs['industries']:
        risk_tolerant_prompt += f"Include multiple stocks with a strong focus on the following industry or industries: {industries}. " 
    risk_tolerant_prompt += "List the stock name, stock ticker in parentheses, and the industry associated with it" + \
    "and double check that a majority of the industries aren't the same."
    
    prompt_dict = {
        "risk_averse": risk_averse_prompt, 
        "risk_neutral": risk_neutral_prompt,
        "risk_tolerant": risk_tolerant_prompt
    }
    prompt = None
    
    if user_prefs["risk"] == "conservative":
        prompt = prompt_dict["risk_averse"]
    elif user_prefs["risk"] == "aggressive":
        prompt = prompt_dict["risk_tolerant"]
    else:
        prompt = prompt_dict["risk_neutral"]
    
    response = query_engine.query(prompt)
    
    pattern = r'\(([A-Za-z]+)\)'
    stocks = re.findall(pattern, response.response)
    
    return stocks

In [99]:
risk_options = ["conservative", "moderate", "aggressive"]
industry_options = ["Energy", "Materials", "Industrials", "Utilities", "Healthcare", "Financials", "Consumer Discretionary",
                    "Consumer Staples", "Technology", "Communication Services", "Real Estate"]
return_goals = []
user_prefs = {
    "num_stocks": None, 
    "risk": None, 
    "industries": None, 
    "return_goals": None
}

In [33]:
LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat-hf"
selected_model = LLAMA2_7B_CHAT

In [103]:
user_prefs = {"num_stocks": 10, "risk": "conservative", "industries": None, "return_goals": None}

In [130]:
# llm = get_llm(selected_model)
get_stocks(llm, user_prefs)

['VOO', 'SPY', 'ACWI', 'EEM', 'VTI', 'IWM', 'DIA', 'REIT', 'VNQ', 'ACWX']

In [None]:
# only tech stocks except for one
# after getting new data, outputted 5 stocks with good variance

In [26]:
prompt = "Return the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities returned can be stocks or ETFs. Ensure that the securities returned are of different industries " + \
"that can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_3 = yf_query_engine.query(prompt)
response_3.print_response_stream()

Based on the provided context information, the financial securities that are most mentioned in a positive sentiment across the articles are:

1. Stock: Vanguard S&P 500 ETF (VOO) - Industry: Finance
2. Stock: Nvidia (NVDA) - Industry: Technology
3. Stock: SPDR S&P 500 ETF Trust (SPY) - Industry: Finance
4. Stock: MPLX LP (MPLX) - Industry: Energy
5. Stock: Vanguard Total Stock Market ETF (VTI) - Industry: Finance

These securities are from different industries, and a diversified portfolio can be created by investing in them. VOO and SPY are large-cap ETFs that track the S&P 500 index, which covers a wide range of industries. NVDA is a technology stock that has been a top performer in recent years. MPLX is an energy midstream company that has a high-yield dividend and has been actively acquiring and growing its business. VTI is a total stock market ETF that invests in a broad range of stocks across different industries.

Note: The information provided is based on the articles provided i

In [None]:
# this prompt only gave tech stocks excpet for one

In [27]:
prompt_2 = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 10 securities in which no more than half of the " + \
"securities are from the same industry. Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_4 = yf_query_engine.query(prompt_2)
response_4.print_response_stream()

Based on the articles provided, the following are the financial securities that are most mentioned in a positive sentiment:

1. SPDR S&P 500 ETF Trust (Stock - Industry: Financials)
2. Vanguard S&P 500 ETF (Stock - Industry: Financials)
3. MPLX LP (Stock - Industry: Energy)
4. NVIDIA Corporation (Stock - Industry: Technology)
5. Vanguard Total Stock Market ETF (ETF - Industry: Financials)
6. iShares MSCI ACWI ETF (ETF - Industry: Financials)
7. SPDR Dow Jones Industrial Average ETF (ETF - Industry: Industrials)
8. Invesco QQQ ETF (ETF - Industry: Technology)
9. Vanguard Real Estate ETF (ETF - Industry: Real Estate)
10. iShares U.S. Medical Devices ETF (ETF - Industry: Healthcare)

Notes:

* No more than half of the securities are from the same industry.
* The securities are listed in no particular order.
* The industries are listed in alphabetical order.

Double check:

* The majority of the industries are not the same.

These securities can be combined to create a diversified portfoli

In [None]:
# below prompt seems to provide best portfolio so far

In [49]:
new_prompt_risk_averse = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 10 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk averse investor who " + \
"is more focused on stability instead of rapid growth. " + \
"Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock name, stock ticker in parentheses, and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_risk = yf_query_engine.query(new_prompt_risk_averse)
response_risk.print_response_stream()

Based on the articles provided, here are 10 financial securities that are mentioned in a positive sentiment:

1. Vanguard S&P 500 ETF (VOO) - Industry: Financials
2. SPDR S&P 500 ETF Trust (SPY) - Industry: Financials
3. iShares MSCI ACWI ETF (ACWI) - Industry: Financials
4. iShares MSCI Emerging Markets ETF (EEM) - Industry: Materials
5. Vanguard Total Stock Market ETF (VTI) - Industry: Utilities
6. iShares Russell 2000 ETF (IWM) - Industry: Consumer Goods
7. SPDR Dow Jones Industrial Average ETF (DIA) - Industry: Consumer Goods
8. iShares NAASDAQ Biotechnology ETF (BBTH) - Industry: Healthcare
9. Vanguard Real Estate ETF (VNQ) - Industry: Real Estate
10. iShares TIPS Bond ETF (TIP) - Industry: Utilities

Notes:

* No more than half of the securities are from the same industry.
* The securities are tailored to a very risk-averse investor who is more focused on stability instead of rapid growth.
* The list is diversified across different industries to minimize risk.
* The securities ar

In [58]:
new_prompt_risk_tolerant = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 10 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk tolerant investor who " + \
"is more focused on rapid growth instead of stability. " + \
"Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock name, stock ticker in parentheses, and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_no_risk = yf_query_engine.query(new_prompt_risk_tolerant)
response_no_risk.print_response_stream()

Exception in thread Thread-15:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/threading.py", line 954, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.9/threading.py", line 892, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1592, in generate
    return self.sample(
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2696, in sample
    outputs = self(
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packag

Based on the articles provided, the following are the financial securities that are most mentioned in a positive sentiment:

1. Vanguard S&P 500 ETF (VOO) - Industry: Financials
2. SPDR S&P 500 ETF Trust (SPY) - Industry: Financials
3. NVIDIA Corporation (NVDA) - Industry: Technology
4. Vanguard Information Technology ETF (VIT) - Industry: Technology
5. iShares MSCI ACWI ETF (ACWI) - Industry: Financials
6. Invesco QQQ ETF (QQQ) - Industry: Technology
7. iShares PHLX SOX ETF (SOXX) - Industry: Technology
8. ProShares UltraPro QQQ (TQQQ) - Industry: Technology
9. Vanguard FTSE Europe ETF (VGK) - Industry: Financials
10. iShares MSCI Emerging Markets ETF (EEM) - Industry: Financials

Note: The industries listed are based on the primary sector associated with each security, and may not reflect the security's exact industry classification.

This list of securities is tailored to a very risk tolerant investor who is more focused on rapid growth instead of stability. The majority of the indu

In [82]:
industries = ["industrials", "communication services", "real estate"]

In [83]:
new_prompt_risk_tolerant = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 10 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk tolerant investor who " + \
"is more focused on rapid growth instead of stability. This list should include growth stocks with high expected returns " + \
"with the potential for greater risk. Create this list so that they " + \
"can be put together to make a diversified portfolio. Include multiple stocks with a strong focus on the " + \
f"following industry or industries: {industries}. " + \
"List the stock name, stock ticker in parentheses, and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_no_risk = yf_query_engine.query(new_prompt_risk_tolerant)
response_no_risk.print_response_stream()

Based on the provided articles, the following are the financial securities that are most mentioned in a positive sentiment:

1. SPDR S&P 500 ETF Trust (SPY) - Industry: Industrials
2. Vanguard S&P 500 ETF (VOO) - Industry: Industrials
3. NVIDIA Corporation (NVDA) - Industry: Technology
4. Amazon.com, Inc. (AMZN) - Industry: Technology
5. Microsoft Corporation (MSFT) - Industry: Technology
6. Alphabet Inc. (GOOGL) - Industry: Technology
7. Facebook, Inc. (FB) - Industry: Communication Services
8. Johnson & Johnson (JNJ) - Industry: Healthcare
9. Procter & Gamble Company (PG) - Industry: Consumer Goods
10. Real Estate Select Sector SPDR Fund (XLRE) - Industry: Real Estate

Note: The list of securities is tailored to a very risk-tolerant investor who is more focused on rapid growth instead of stability. The list includes growth stocks with high expected returns, but also includes some stable companies from different industries to provide diversification.

In [56]:
prompts_dict = yf_query_engine.get_prompts()
print(list(prompts_dict.keys()))

['response_synthesizer:text_qa_template', 'response_synthesizer:refine_template']


In [60]:
from llama_index.core import PromptTemplate

In [65]:
text_qa_template_str = (
    "Context information is"
    " below.\n---------------------\n{context_str}\n---------------------\nUsing"
    " both the context information and also using your own knowledge, answer"
    " the request about portfolio recommendations: {query_str}\nIf the context isn't helpful, you can also"
    " answer the request on your own.\nWhen answering,"
    " respond in the following format:\nSTOCK NAME (STOCK TICKER) - INDUSTRY\n and"
    " follow that format for each stock in the portfolio. The words in all capitals should be replaced with"
    " the actual stock name, the stock ticker, and industry, respectively. Only include the stock recommendations,"
    " do not include any filler text or notes at the beginning or end of the response."
)
text_qa_template = PromptTemplate(text_qa_template_str)

In [66]:
# should test adding system prompt and incorporating user preferences like risk aversion and industry preferences

In [67]:
yf_query_engine_prompt = yf_index.as_query_engine(llm=llm, streaming=True, text_qa_template=text_qa_template)

In [64]:
response_risk_prompt = yf_query_engine.query(new_prompt_risk_averse)
response_risk_prompt.print_response_stream()

Based on the articles provided, here are 10 financial securities that are mentioned in a positive sentiment:

1. Vanguard S&P 500 ETF (VOO) - Industry: Financials
2. SPDR S&P 500 ETF Trust (SPY) - Industry: Financials
3. iShares MSCI ACWI ETF (ACWI) - Industry: Financials
4. iShares MSCI Emerging Markets ETF (EEM) - Industry: Materials
5. Vanguard Total Stock Market ETF (VTI) - Industry: Utilities
6. iShares Core S&P Total U.S. Stock Market ETF (ITOT) - Industry: Consumer Goods
7. SPDR Dow Jones Industrial Average ETF (DIA) - Industry: Consumer Goods
8. iShares NAREIT ETF (REIT) - Industry: Real Estate
9. Vanguard Real Estate ETF (VNQ) - Industry: Real Estate
10. iShares MSCI ACWI ex US ETF (ACWX) - Industry: Healthcare

Notes:

* No more than half of the securities are from the same industry.
* The securities are tailored to a very risk-averse investor who is more focused on stability instead of rapid growth.
* The list is diversified across different industries to minimize risk.
* Th

In [68]:
response_no_risk_prompt = yf_query_engine_prompt.query(new_prompt_risk_tolerant)
response_no_risk_prompt.print_response_stream()

Based on the provided context, here are 10 stock recommendations for a diversified portfolio of growth stocks with high expected returns and potential for greater risk:

1. Alibaba Group Holding Limited (BABA) - Technology
2. Amazon.com, Inc. (AMZN) - Technology
3. Netflix, Inc. (NFLX) - Technology
4. Alphabet Inc. (GOOGL) - Technology
5. NVIDIA Corporation (NVDA) - Technology
6. Tesla, Inc. (TSLA) - Technology
7. Square, Inc. (SQ) - Finance
8. Shopify Inc. (SHOP) - E-commerce
9. Zoom Video Communications, Inc. (ZM) - Technology
10. CrowdStrike Holdings, Inc. (CRWD) - Technology

These stocks are from different industries and are expected to have high growth potential, but they also come with a higher level of risk. It's important to note that this is just a sample list and it's always recommended to do your own research and consult with a financial advisor before making any investment decisions.

In [29]:
from dual_dcf_model import discounted_cash_flow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
GE Aerospace
industrials
aerospace-defense
X Unavailable Cash Flow: AccountsPayableCurrent
X Strong Sell 0
AAPL Unavailable Cash Flow: AccountsReceivableNetCurrent
AAPL Strong Buy 3
META Strong Buy 5
MA Sell 0
BA Strong Buy 3
