In [None]:
!pip install streamlit yfinance openai googlesearch-python gtts beautifulsoup4 requests get-all-tickers langchain_community sentence_transformers chromadb

In [1]:
import yfinance as yf
import pandas as pd
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import time
from random import uniform
import random

from langchain_huggingface import HuggingFaceEmbeddings  
from langchain_chroma import Chroma 

In [None]:

# Load  tickers
nasdaq_url = "https://www.nasdaqtrader.com/dynamic/SymDir/nasdaqlisted.txt"
nasdaq_df = pd.read_csv(nasdaq_url, sep='|')
all_tickers = nasdaq_df['Symbol'].tolist()

data = []

def fetch_info_with_retry(ticker, max_retries=2, initial_wait=2):
    retries = 0
    while retries < max_retries:
        try:
            info = yf.Ticker(ticker).info
            return {
                "Ticker": ticker,
                "Sector": info.get("sector", "N/A"),
                "Industry": info.get("industry", "N/A"),
                "Description": info.get("longBusinessSummary", "N/A")
            }
        except Exception as e:
            retries += 1
            wait = initial_wait * (2 ** (retries - 1)) + random.uniform(0, 1)
            print(f"[Retry {retries}] Error for {ticker}: {e} — retrying in {wait:.2f}s...")
            time.sleep(wait)
    print(f"[Skipped] Max retries exceeded for {ticker}")
    return None

for idx, ticker in enumerate(all_tickers, 1):
    result = fetch_info_with_retry(ticker)
    if result is not None:
        data.append(result)

    if idx % 100 == 0:
        print(f"> Processed {idx} tickers...")

# Convert to DataFrame and clean
df = pd.DataFrame(data)
df.replace("N/A", pd.NA, inplace=True)
df.dropna(inplace=True)

print(f"\n✅ Retrieval complete. Final dataset contains {len(df)} entries.")

> Processed 100 tickers...
> Processed 200 tickers...


In [3]:
# df
# df.to_csv("stock_info.csv", index=False)


df = pd.read_csv("stock_info.csv")

In [5]:




# ----------------------------------------
# 0. Load or ensure df is in memory
# ----------------------------------------
# df = pd.read_pickle("metadata.pkl")  # Uncomment if loading from file

# -----------------------------
# 1. Embedding Model
# -----------------------------
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# -----------------------------
# 2. Prepare Texts + Metadata
# -----------------------------
texts = []
metadatas = []

for _, row in df.iterrows():
    text = f"{row['Ticker']} - {row['Description']}"
    metadata = {
        "ticker": row['Ticker'],
        "sector": row['Sector'],
        "industry": row['Industry']
    }
    texts.append(text)
    metadatas.append(metadata)

# -----------------------------
# 3. Define DB Directory
# -----------------------------
persist_dir = "company_vectors"


# -----------------------------
# 4. Create & Save Vector DB
# -----------------------------
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
    persist_directory=persist_dir  # ❗ Let Chroma create this
)


print("✅ Vector DB created and saved to disk.")



✅ Vector DB created and saved to disk.


In [6]:
#vectorDB now loaded on Hugging Face Hub
#https://huggingface.co/datasets/gargumang411/company_vectors/tree/main

# -----------------------------
# 5. Run a Sample Query (Optional)

# -----------------------------

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Chroma(
    persist_directory="company_vectors",
    embedding_function=embedding_model
)
query = "What does paypal do?"
results = vectorstore.similarity_search(query, k=3)

print("\n🔍 Query Result:")
for res in results:
    print(f"\n📄 {res.page_content}\n🧾 Metadata: {res.metadata}")


🔍 Query Result:

📄 PYPL - PayPal Holdings, Inc. operates a technology platform that enables digital payments for merchants and consumers worldwide. It operates a two-sided network at scale that connects merchants and consumers that enables its customers to connect, transact, and send and receive payments through online and in person, as well as transfer and withdraw funds using various funding sources, such as bank accounts, PayPal or Venmo account balance, consumer credit products, credit and debit cards, and cryptocurrencies, as well as other stored value products, including gift cards and eligible rewards. The company provides payment solutions under the PayPal, PayPal Credit, Braintree, Venmo, Xoom, Zettle, Hyperwallet, Honey, and Paidy names. The company was founded in 1998 and is headquartered in San Jose, California.
🧾 Metadata: {'industry': 'Credit Services', 'sector': 'Financial Services', 'ticker': 'PYPL'}

📄 PYPL - PayPal Holdings, Inc. operates a technology platform that e