In [None]:
# !pip install accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes

# !pip install llama-index-embeddings-huggingface

# !pip install llama-index

# !pip install llama-index-llms-huggingface

In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [2]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate




In [3]:
LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat-hf"

In [4]:
selected_model = LLAMA2_7B_CHAT

In [5]:
SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
- Generate human readable output, avoid creating output with gibberish text.
- Generate only the requested output, don't include any other language before or after the requested output.
- Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
- Generate professional language typically used in business documents in North America.
- Never generate offensive or foul language.
"""

query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="auto",
    # change these settings below depending on your GPU
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
import time

In [None]:
start = time.time()

print(llm.complete("In Python, make a function that prints all even numbers between 0 and 10 inclusive.").text)

print(f"Time elapsed: {time.time() - start}")

In [12]:
set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

In [7]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [8]:
document = SimpleDirectoryReader(input_files=["essay-data/" + 'essay.txt']).load_data()
index = VectorStoreIndex.from_documents(document, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm)

In [13]:
start = time.time()
r = query_engine.query("What did the author do growing up?")
# r.print_response_stream()
print(r.response)
print()
print(time.time() - start)

KeyboardInterrupt: 

In [None]:
import requests
import json
from bs4 import BeautifulSoup
import os

In [None]:
API_TOKEN = "rxfMXxfsdsKGb8urL8shecBbBp37ZBuvU5zo39sm"
API_ENDPOINT = f"https://api.marketaux.com/v1/news/all"

payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
    "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T15:00",
    "domains": "finance.yahoo.com"
}

In [None]:
payload = {
    "api_token": API_TOKEN,
    "entity_types": "equity,index,etf", # should include industries parameter
#     "industries": "Technology",
    "countries": "us",
    "published_after": "2024-05-10T13:00",
    "published_before": "2024-05-10T14:00",
    "domains": "finance.yahoo.com"
}

In [None]:
def get_urls(payload):
    r = requests.get("https://api.marketaux.com/v1/news/all", params=payload)
    output = r.text
    json_output = json.loads(output)
    data = json_output['data']
    
    urls = []
    
    for i in range(len(data)):
        url = data[i]['url']
        if url.startswith("https://finance.yahoo.com/news"):
            urls.append(url)
        
    return urls

In [None]:
def get_article_content(url):
    r = requests.get(url)
    
    if r.status_code != 200:
        return 'Invalid Response'
    
    soup = BeautifulSoup(r.content)
    body = soup.find('div', attrs={'class': 'caas-body'})
    content = ""
    
    for text in body.find_all('p'):
        content += text.text
    
    return content

In [None]:
def write_to_file(filename, content):
    with open(filename, 'w') as file:
        file.write(content)

In [None]:
def create_documents():
    payload = {
        "api_token": API_TOKEN,
        "entity_types": "equity,index,etf", # should include industries parameter
    #     "industries": "Technology",
        "countries": "us",
#         "published_after": "2024-05-10T13:00",
#         "published_before": "2024-05-10T14:00",
        "domains": "finance.yahoo.com"
    }
    
    for add_day in range(5):
        for add_hour in range(7):
            start = "2024-05-" + str(10 + add_day) + "T" + str(13 + add_hour) + ":00"
            finish = "2024-05-" + str(10 + add_day) + "T" + str(13 + add_hour + 1) + ":00"
            payload["published_after"] = start
            payload["published_before"] = finish
            
            urls = get_urls(payload)
            for i in range(len(urls)):
                print(urls[i])
                content = get_article_content(urls[i])
                if content != "Invalid Response": 
                    filename = os.path.join("articles", "article" + str(add_day) + str(add_hour) + str(i) + ".txt")
                    write_to_file(filename, content)
                else:
                    print("Above URL Skipped")

In [None]:
create_documents()

In [None]:
r = requests.get("https://finance.yahoo.com/video/apple-catching-ai-revolution-dan-142925403.html?.tsrc=rss")
s = BeautifulSoup(r.content)
body = s.find('div', attrs={'class': 'caas-body'})
body

In [None]:
type(r.status_code)

In [None]:
url = "https://finance.yahoo.com/news/q1-2024-applied-optoelectronics-inc-163026968.html?.tsrc=rss"
r = requests.get(url)
soup = BeautifulSoup(r.content)
body = soup.find('div')

In [None]:
r.reason

In [None]:
yf_documents = SimpleDirectoryReader(input_dir="articles").load_data()
yf_index = VectorStoreIndex.from_documents(yf_documents, embed_model=embed_model)
yf_query_engine = yf_index.as_query_engine(llm=llm, streaming=True)

In [None]:
stocks = yf_query_engine.query("Based on the ingested articles, return the best stocks to invest in and give reasons why.")
stocks.print_response_stream()

In [None]:
response_2 = yf_query_engine.query("Based on the ingested articles, return the 5 best stocks to invest in and give reasons why.")
response_2.print_response_stream()

In [None]:
response_3 = yf_query_engine.query("Based on the ingested articles, return a set of stocks that would best diversify a portfolio with the best returns.")
response_3.print_response_stream()

In [None]:
new_prompt = "Return the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities returned can be stocks or ETFs. Ensure that the securities returned are of different industries " + \
"that can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_3 = yf_query_engine.query(new_prompt)
response_3.print_response_stream()

In [None]:
new_prompt = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 12 securities in which no more than half of the " + \
"securities are from the same industry. Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_3 = yf_query_engine.query(new_prompt)
response_3.print_response_stream()

In [None]:
response_3.response_txt

In [None]:
new_prompt_risk_averse = "Gather the financial securities that are most mentioned across the articles in a positive sentiment." + \
"The securities gathered can be stocks or ETFs. Return a list of 12 securities in which no more than half of the " + \
"securities are from the same industry. This list of securities should also be tailored to a very risk averse investor who " + \
"is more focused on stability instead of rapid growth. " + \
"Create this list so that they " + \
"can be put together to make a diversified portfolio. List the stock and the industry associated with it" + \
"and double check that a majority of the industries aren't the same."

response_risk= yf_query_engine.query(new_prompt_risk_averse)
response_risk.print_response_stream()

In [None]:
prompts_dict = yf_query_engine.get_prompts()
print(list(prompts_dict.keys()))

In [None]:
import re

pattern = r'\(([^)]+)\)'
matches = re.findall(pattern, response_3.response_txt)

In [None]:
matches

In [None]:
from llama_index.core import PromptTemplate

text_qa_template_str = (
    "Context information is"
    " below.\n---------------------\n{context_str}\n---------------------\nUsing"
    " both the context information and also using your own knowledge, answer"
    " the question: {query_str}\nIf the context isn't helpful, you can also"
    " answer the question on your own.\n"
)
text_qa_template = PromptTemplate(text_qa_template_str)

refine_template_str = (
    "The original question is as follows: {query_str}\nWe have provided an"
    " existing answer: {existing_answer}\nWe have the opportunity to refine"
    " the existing answer (only if needed) with some more context"
    " below.\n------------\n{context_msg}\n------------\nUsing both the new"
    " context and your own knowledge, update or repeat the existing answer.\n"
)
refine_template = PromptTemplate(refine_template_str)

In [None]:
text_qa_template_str = (
    "Context information is"
    " below.\n---------------------\n{context_str}\n---------------------\nUsing"
    " both the context information and also using your own knowledge, answer"
    " the question: {query_str}\nIf the context isn't helpful, you can also"
    " answer the question on your own.\n"
)

In [None]:
print(text_qa_template_str)

In [None]:
# should test adding system prompt and incorporating user preferences like risk aversion and industry preferences