In [1]:
! pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers



In [2]:
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import dotenv
import json
import yfinance as yf
import concurrent.futures
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import numpy as np
import requests
import os

In [3]:
def get_stock_info(symbol: str) -> dict:
    """
    Look up a ticker on Yahoo Finance and return a compact company profile.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: A mapping with the keys 'Ticker', 'Name', 'Business Summary',
              'City', 'State', 'Country', 'Industry' and 'Sector'. Every field
              except 'Business Summary' falls back to the string
              'Information not available' when Yahoo Finance does not supply
              it; 'Business Summary' falls back to None.
    """
    placeholder = 'Information not available'
    info = yf.Ticker(symbol).info

    # The first three fields map to differently named Yahoo Finance keys.
    profile = {
        "Ticker": info.get('symbol', placeholder),
        'Name': info.get('longName', placeholder),
        # Deliberately no placeholder here: a missing summary stays None.
        'Business Summary': info.get('longBusinessSummary'),
    }

    # The remaining fields use the lower-cased label as the Yahoo Finance key.
    for label in ('City', 'State', 'Country', 'Industry', 'Sector'):
        profile[label] = info.get(label.lower(), placeholder)

    return profile

In [4]:
# Fetch the raw Yahoo Finance info mapping for NVIDIA and print it in full,
# to see which fields (city, sector, longBusinessSummary, ...) are available.
data = yf.Ticker("NVDA")
stock_info = data.info
print(stock_info)

{'address1': '2788 San Tomas Expressway', 'city': 'Santa Clara', 'state': 'CA', 'zip': '95051', 'country': 'United States', 'phone': '408 486 2000', 'website': 'https://www.nvidia.com', 'industry': 'Semiconductors', 'industryKey': 'semiconductors', 'industryDisp': 'Semiconductors', 'sector': 'Technology', 'sectorKey': 'technology', 'sectorDisp': 'Technology', 'longBusinessSummary': "NVIDIA Corporation provides graphics and compute and networking solutions in the United States, Taiwan, China, Hong Kong, and internationally. The Graphics segment offers GeForce GPUs for gaming and PCs, the GeForce NOW game streaming service and related infrastructure, and solutions for gaming platforms; Quadro/NVIDIA RTX GPUs for enterprise workstation graphics; virtual GPU or vGPU software for cloud-based visual and virtual computing; automotive platforms for infotainment systems; and Omniverse software for building and operating metaverse and 3D internet applications. The Compute & Networking segment co

In [5]:
# Loaded SentenceTransformer instances, keyed by model name, so that repeated
# calls reuse the already-initialised model instead of constructing it again.
_SENTENCE_TRANSFORMER_CACHE = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generates embeddings for the given text using a specified Hugging Face model.

    The model for each distinct ``model_name`` is constructed once and then
    reused on later calls, because building a SentenceTransformer on every
    call is by far the most expensive part of embedding a short text.

    Args:
        text (str): The input text to generate embeddings for.
        model_name (str): The name of the Hugging Face model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        np.ndarray: The generated embeddings as a NumPy array.
    """
    model = _SENTENCE_TRANSFORMER_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_TRANSFORMER_CACHE[model_name] = model
    return model.encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """
    Embed two sentences and report how similar they are.

    Args:
        sentence1 (str): The first sentence for similarity comparison.
        sentence2 (str): The second sentence for similarity comparison.

    Returns:
        float: The cosine similarity score between the two sentences,
               ranging from -1 (completely opposite) to 1 (identical).

    Notes:
        The score is also printed to the console, formatted to four decimals.
    """
    # Embed each sentence (first, then second) and reshape it into a
    # single-row 2-D array, the layout passed to cosine_similarity below.
    row_vectors = [
        np.array(get_huggingface_embeddings(sentence)).reshape(1, -1)
        for sentence in (sentence1, sentence2)
    ]

    # The result is a 1x1 matrix; its only entry is the score.
    similarity_score = cosine_similarity(row_vectors[0], row_vectors[1])[0][0]
    print(f"Cosine similarity between the two sentences: {similarity_score:.4f}")
    return similarity_score


# Example usage: compare two short sentences that share a structure but
# differ in activity and destination.
sentence1 = "I like walking to the park"
sentence2 = "I like running to the office"

# Prints the score and keeps it in `similarity`.
similarity = cosine_similarity_between_sentences(sentence1, sentence2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cosine similarity between the two sentences: 0.6133


In [6]:
# Fetch the formatted profile for Apple and print it to check the output of
# get_stock_info on a real ticker.
aapl_info = get_stock_info('AAPL')
print(aapl_info)

{'Ticker': 'AAPL', 'Name': 'Apple Inc.', 'Business Summary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts, as well as advertising services include third-party licensing arrangements and its own advertising platforms. In addition, the company offers various subscription-based services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experien

In [7]:
# Compare Apple's business summary against a natural-language search request.
aapl_description = aapl_info['Business Summary']

# The free-text request that a user might type into the search.
company_description = "I want to find companies that make smartphones and are headquarted in California"

# Prints the score and overwrites `similarity` from the earlier example.
similarity = cosine_similarity_between_sentences(aapl_description, company_description)

Cosine similarity between the two sentences: 0.3635


# Get all the stocks in the stock market

First, we need to get the symbols (also known as tickers) of all the stocks in the stock market.

In [8]:
def get_company_tickers():
    """
    Downloads and parses the Stock ticker symbols from the GitHub-hosted SEC company tickers JSON file.

    On success the parsed content is also written to 'company_tickers.json'
    in the current working directory.

    Returns:
        dict | None: A dictionary containing company tickers and related
        information, or None when the download fails (network error, timeout,
        or a non-200 HTTP status). A short message is printed in either case.

    Notes:
        The data is sourced from the official SEC website via a GitHub repository:
        https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json
    """
    # URL to fetch the raw JSON file from GitHub
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"

    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Transport-level failures use the same "return None" path as a bad status.
        print(f"Failed to download file. Request error: {exc}")
        return None

    # Checking if the request was successful
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # Parse the JSON content directly
    company_tickers = json.loads(response.content.decode('utf-8'))

    # Save the content to a local file for future use
    with open("company_tickers.json", "w", encoding="utf-8") as file:
        json.dump(company_tickers, file, indent=4)

    print("File downloaded successfully and saved as 'company_tickers.json'")
    return company_tickers

# Download the ticker listing once; later cells iterate over this mapping.
company_tickers = get_company_tickers()

File downloaded successfully and saved as 'company_tickers.json'


In [9]:
company_tickers

{'0': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '1': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '2': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '7': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '8': {'cik_str': 1046179,
  'ticker': 'TSM',
  'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'},
 '9': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '10': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '11': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'},
 '12': {'cik_str': 104169, 'ticker': 'WMT', 'title': 'Walmart Inc.'},
 '13': {'cik_str'

In [10]:
len(company_tickers)

9998

# Inserting stocks into Pinecone
1. Create an account on Pinecone.io

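2. Create a new index called "stocks" and set the dimensions to 768 (the size of the vectors produced by the `all-mpnet-base-v2` embedding model used in this notebook). Leave the rest of the settings as they are.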

In [11]:
# Read the Pinecone key from the Colab secrets store and export it as an
# environment variable.
# NOTE(review): presumably the Pinecone-backed vector store reads the key from
# this environment variable — confirm against the langchain_pinecone docs.
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Index and namespace that every ingestion and query cell below refers to.
index_name = "stocks"
namespace = "stock-descriptions"

# Embedding model with library defaults, shared by all ingestion cells.
hf_embeddings = HuggingFaceEmbeddings()
# Handle on the existing index; the ingestion cells below call
# PineconeVectorStore.from_documents directly rather than using this object.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)

  hf_embeddings = HuggingFaceEmbeddings()
  hf_embeddings = HuggingFaceEmbeddings()


# Sequential Processing
Processing every stock one at a time like this would take a very long time!


In [13]:
# Walk the ticker listing one entry at a time: fetch the profile, then store
# its business summary (with the whole profile as metadata) in Pinecone.
for idx, stock in company_tickers.items():
    stock_ticker = stock['ticker']
    stock_data = get_stock_info(stock_ticker)
    # May be None when Yahoo Finance has no summary for the ticker
    # (get_stock_info supplies no placeholder for this field).
    stock_description = stock_data['Business Summary']

    # `idx` is the listing's own key for the entry, used here as a progress counter.
    print(f"Processing stock {idx} / {len(company_tickers)} :", stock_ticker)

    # One from_documents call per stock, each carrying a single Document.
    vectorstore_from_documents = PineconeVectorStore.from_documents(
        documents=[Document(page_content=stock_description, metadata=stock_data)],
        embedding=hf_embeddings,
        index_name=index_name,
        namespace=namespace
    )

Processing stock 0 / 9998 : NVDA
Processing stock 1 / 9998 : AAPL
Processing stock 2 / 9998 : MSFT
Processing stock 3 / 9998 : AMZN
Processing stock 4 / 9998 : GOOGL
Processing stock 5 / 9998 : META
Processing stock 6 / 9998 : TSLA
Processing stock 7 / 9998 : BRK-B
Processing stock 8 / 9998 : TSM
Processing stock 9 / 9998 : AVGO
Processing stock 10 / 9998 : LLY
Processing stock 11 / 9998 : JPM
Processing stock 12 / 9998 : WMT
Processing stock 13 / 9998 : V
Processing stock 14 / 9998 : UNH
Processing stock 15 / 9998 : SPY
Processing stock 16 / 9998 : XOM
Processing stock 17 / 9998 : ORCL
Processing stock 18 / 9998 : MA
Processing stock 19 / 9998 : NVO
Processing stock 20 / 9998 : HD
Processing stock 21 / 9998 : COST
Processing stock 22 / 9998 : PG
Processing stock 23 / 9998 : JNJ
Processing stock 24 / 9998 : BAC
Processing stock 25 / 9998 : NFLX
Processing stock 26 / 9998 : CRM
Processing stock 27 / 9998 : ABBV
Processing stock 28 / 9998 : RCIT
Processing stock 29 / 9998 : CVX
Processin

KeyboardInterrupt: 

# Parallelizing


In [12]:
# Progress logs from earlier runs; each list stays empty when its file is absent.
successful_tickers = []
unsuccessful_tickers = []

# Restore the tickers that were already stored successfully.
try:
    with open('successful_tickers.txt', 'r') as f:
        # One ticker per line; blank lines are ignored.
        successful_tickers = [entry for entry in map(str.strip, f) if entry]
except FileNotFoundError:
    print("No existing successful tickers file found")
else:
    print(f"Loaded {len(successful_tickers)} successful tickers")

# Restore the tickers whose processing failed before.
try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [entry for entry in map(str.strip, f) if entry]
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")
else:
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")

Loaded 3294 successful tickers
Loaded 36 unsuccessful tickers


In [15]:
def process_stock(stock_ticker: str) -> str:
    """
    Fetch one stock's profile, store it in Pinecone, and log the outcome.

    Tickers already present in ``successful_tickers`` are skipped. Otherwise
    the business summary is stored as a single document with the full profile
    as metadata. The ticker is then appended to 'successful_tickers.txt' (and
    ``successful_tickers``) on success, or to 'unsuccessful_tickers.txt' (and
    ``unsuccessful_tickers``) if any exception is raised along the way.

    Args:
        stock_ticker (str): Ticker symbol to process.

    Returns:
        str: A status message: "Already processed ...", "Processed ...
             successfully", or "ERROR processing ...: <exception>".
    """
    def _record(log_path, bucket):
        # Append to the on-disk log first, then to the in-memory list.
        with open(log_path, 'a') as log_file:
            log_file.write(f"{stock_ticker}\n")
        bucket.append(stock_ticker)

    # Skip if already processed
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        profile = get_stock_info(stock_ticker)
        summary_doc = Document(page_content=profile['Business Summary'], metadata=profile)

        # Store the description in Pinecone; the returned store is not needed.
        PineconeVectorStore.from_documents(
            documents=[summary_doc],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )

        _record('successful_tickers.txt', successful_tickers)
        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Any failure above (lookup, embedding, upload, success logging) lands here.
        _record('unsuccessful_tickers.txt', unsuccessful_tickers)
        return f"ERROR processing {stock_ticker}: {e}"

def parallel_process_stocks(tickers: list, max_workers: int = 10) -> None:
    """
    Run ``process_stock`` over many tickers on a thread pool, stopping at the first failure.

    Each result message is printed as its task completes. When a task reports
    an error (its message starts with "ERROR") or raises, every task that has
    not started yet is cancelled and ``SystemExit(1)`` is raised. Tasks that
    are already running cannot be interrupted; the pool waits for them to
    finish before the exit propagates.

    Args:
        tickers (list): Ticker symbols to process.
        max_workers (int): Maximum number of worker threads. Defaults to 10.

    Raises:
        SystemExit: With code 1, as soon as any ticker fails.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_ticker = {
            executor.submit(process_stock, ticker): ticker
            for ticker in tickers
        }

        def _cancel_pending() -> None:
            # Leaving the `with` block waits for every outstanding future, so
            # queued work must be cancelled explicitly or it would all still
            # run before the exit takes effect. cancel() is a no-op on futures
            # that are running or already done.
            for pending in future_to_ticker:
                pending.cancel()

        for future in concurrent.futures.as_completed(future_to_ticker):
            ticker = future_to_ticker[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f'{ticker} generated an exception: {exc}')
                print("Stopping program due to exception")
                _cancel_pending()
                raise SystemExit(1)

            print(result)

            # Stop on error
            if result.startswith("ERROR"):
                print(f"Stopping program due to error in {ticker}")
                _cancel_pending()
                raise SystemExit(1)

# Collect every ticker symbol from the downloaded listing, in listing order.
tickers_to_process = [entry['ticker'] for entry in company_tickers.values()]

# Fan the work out across ten worker threads.
parallel_process_stocks(tickers_to_process, max_workers=10)

Processed TSM successfully
Processed GOOGL successfully
Processed META successfully
Processed NVDA successfully
Processed AMZN successfully
Processed TSLA successfully
Processed AVGO successfully
Processed AAPL successfully
Processed BRK-B successfully
Processed MSFT successfully
Processed SPY successfully
Processed V successfully
Processed XOM successfully
Processed NVO successfully
Processed LLY successfully
Processed JPM successfully
Processed UNH successfully
Processed WMT successfully
Processed HD successfully
Processed ORCL successfully
Processed MA successfully
Processed NFLX successfully
Processed RCIT successfully
Processed COST successfully
Processed PG successfully
Processed CVX successfully
Processed JNJ successfully
Processed TMUS successfully
Processed BAC successfully
Processed CRM successfully
Processed ABBV successfully
Processed SAP successfully
Processed ASML successfully
Processed KO successfully
Processed TM successfully
Processed ACN successfully
Processed WFC suc

KeyboardInterrupt: 

# Perform RAG

In [13]:
# Initialize the Pinecone client with the API key from the Colab secrets store
pc = Pinecone(api_key=userdata.get("PINECONE_API_KEY"),)

# Connect to the index named by `index_name`, which the ingestion cells also
# pass to PineconeVectorStore.from_documents
pinecone_index = pc.Index(index_name)

In [14]:
query = "What are some companies that manufacture consumer hardware?"

In [15]:
# Embed the question with the helper's default model so it can be sent to
# the index as a query vector.
raw_query_embedding = get_huggingface_embeddings(query)

In [16]:
# Retrieve the ten nearest stored vectors from the ingestion namespace,
# together with their metadata.
top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=10, include_metadata=True, namespace=namespace)

In [17]:
top_matches

{'matches': [{'id': '42744cee-417f-4ab5-b9ce-fb9d11d5a489',
              'metadata': {'Business Summary': 'Kimball Electronics, Inc. '
                                               'engages in the provision of '
                                               'electronics manufacturing, '
                                               'engineering, and supply chain '
                                               'support services to customers '
                                               'in the automotive, medical, '
                                               'and industrial end markets. '
                                               'The Company also offers '
                                               'contract manufacturing '
                                               'services, including '
                                               'engineering and supply chain '
                                               'support for the production of '
                 

In [18]:
# Pull the stored text out of each match to use as LLM context.
# NOTE(review): assumes every match's metadata has a 'text' entry holding the
# document content written at ingestion time — confirm against the vector store.
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [19]:
# Wrap up to ten context passages in <CONTEXT> tags, separated by dashed
# rules, and append the user's question after them.
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [20]:
print(augmented_query)

<CONTEXT>
Kimball Electronics, Inc. engages in the provision of electronics manufacturing, engineering, and supply chain support services to customers in the automotive, medical, and industrial end markets. The Company also offers contract manufacturing services, including engineering and supply chain support for the production of electronic assemblies and other products, including non-electronic components, medical devices, medical disposables, and precision molded plastics, as well as automation, test, and inspection equipment primarily used in automotive, medical, and industrial applications. In addition, it is also involved in the production and testing of printed circuit board assemblies, as well as clean room assembly, cold chain, and product sterilization management activities. Further, the company offers design services and support, supply chain services and support, rapid prototyping and new product introduction support, product design and process validation and qualification,

In [21]:
import openai
print(openai.__version__)

1.57.0


Setting up Groq for RAG

In [22]:
# Use the OpenAI client class against Groq's OpenAI-compatible endpoint,
# authenticating with the Groq key from the Colab secrets store.
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=userdata.get("GROQ_API_KEY")
)

In [23]:
# Nothing is interpolated into the prompt, so a plain string is used rather
# than an f-string.
system_prompt = """You are an expert at providing answers about stocks. Please answer my question provided.
"""

# Send the system prompt plus the context-augmented question to the model.
# NOTE(review): confirm that this model id is still offered by the Groq endpoint.
llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

# Keep only the text of the first returned choice.
response = llm_response.choices[0].message.content

In [24]:
print(response)

Several companies mentioned in the given context manufacture and distribute consumer hardware, including:

1. 3M Company - In their Consumer segment, they provide a range of consumer hardware such as home cleaning products, retail abrasives, picture hanging, and consumer air quality solutions.

2. Avnet, Inc. - In the Electronic Components segment, they provide embedded solutions such as technical design, integration, and assembly of embedded products, systems, and solutions.

3. Arrow Electronics, Inc. - They operate in two segments: Global Components and Global Enterprise Computing Solutions. The Global Components segment markets and distributes electronic components, including computing and memory products. The Global Enterprise Computing Solutions segment offers computing solutions, including datacenter, cloud, security, and analytics solutions.

However, please note that the primary focus of these companies may not be entirely on consumer hardware as they also cater to industrial,

In [25]:
class StockSearchEngine:
    """
    Semantic search over stock descriptions stored in a Pinecone index,
    with optional LLM analysis of the retrieved descriptions.
    """

    def __init__(self, pinecone_index, namespace="stock-descriptions"):
        """
        Args:
            pinecone_index: Index handle exposing a ``query`` method.
            namespace (str): Namespace to search within the index.
        """
        self.index = pinecone_index
        self.namespace = namespace

    def search(self, query, filters=None, top_k=10):
        """
        Search stocks based on natural language query with optional filters

        Args:
            query (str): Natural language search query
            filters (dict): Optional metadata filters, keyed by metadata field
                name (for example 'Sector' or 'Country'). A plain value is
                passed through as given; a tuple ``(low, high)`` is turned
                into an inclusive ``$gte``/``$lte`` range.
            top_k (int): Number of results to return

        Returns:
            list[dict]: One dict per match, holding the match 'score' plus
            every metadata field stored with the match.
        """
        # Get query embedding
        query_embedding = get_huggingface_embeddings(query)

        # Pinecone filters refer to metadata fields by their bare name. A
        # "metadata." prefix would look for a field literally named
        # "metadata.<key>", which never exists, so nothing would match.
        filter_conditions = {}
        if filters:
            for key, value in filters.items():
                if isinstance(value, tuple):
                    filter_conditions[key] = {"$gte": value[0], "$lte": value[1]}
                else:
                    filter_conditions[key] = value

        # Query Pinecone; an empty filter is sent as None rather than {}.
        results = self.index.query(
            vector=query_embedding.tolist(),
            top_k=top_k,
            include_metadata=True,
            namespace=self.namespace,
            filter=filter_conditions or None
        )

        # Flatten each match into a single dict: score first, then metadata.
        return [
            {'score': match['score'], **match['metadata']}
            for match in results['matches']
        ]

    def analyze_results(self, results, query, client, model="llama-3.1-70b-versatile"):
        """
        Analyze search results using LLM

        Args:
            results (list): Search results, as returned by ``search``
            query (str): Original search query
            client: OpenAI client instance
            model (str): Chat model identifier to request. Defaults to the
                model this notebook used originally.

        Returns:
            str: The text content of the first completion choice.
        """
        # Only results that carry a business summary contribute context.
        contexts = [r['Business Summary'] for r in results if 'Business Summary' in r]
        augmented_query = (
            "<CONTEXT>\n" +
            "\n\n-------\n\n".join(contexts[:10]) +
            "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query
        )

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert at providing answers about stocks."},
                {"role": "user", "content": augmented_query}
            ]
        )

        return response.choices[0].message.content


In [26]:
# Example usage: wrap the connected index (default namespace) in a search engine.
search_engine = StockSearchEngine(pinecone_index)

# Search restricted by two metadata fields, Sector and Country.
results = search_engine.search(
    "companies that build data centers",
    filters={
        'Sector': 'Technology',
        'Country': 'United States'
    }
)

# Get LLM analysis of the retrieved business summaries for the same question
analysis = search_engine.analyze_results(results, "companies that build data centers", client)

In [27]:
print(analysis)

There are several companies that specialize in building and designing data centers. Here are a few notable ones:

1. **Equinix Inc.** (NASDAQ: EQIX): Equinix is one of the largest data center operators in the world, with over 240 facilities across 66 global metros. They design, build, and operate data centers for a variety of industries, including tech, finance, and government.

2. **Digital Realty Trust** (NYSE: DLR): Digital Realty is a real estate investment trust (REIT) that focuses on building and acquiring data centers. They have over 280 facilities in 24 countries worldwide and serve a range of clients, from hyperscalers to small and medium-sized businesses.

3. **Interxion Holding NV** (NYSE: INXN): Interxion is a European data center company that operates over 50 facilities across 11 countries. They design and build data centers for a variety of industries, including finance, healthcare, and government.

4. **CoreSite Realty Corp.** (NYSE: COR): CoreSite is a data center opera