In [5]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [7]:
from langchain_groq import ChatGroq
llm = ChatGroq(
    temperature=1.1,
    model_name="mixtral-8x7b-32768",
    max_tokens=500,
    groq_api_key=groq_api_key) 

### (1) Load data

In [8]:
import requests

def fetch_google_news(query):
    """
    Fetches Google News articles related to a query using SERP API.
    
    Args:
        query (str): The search term (e.g., company or topic name).

    Returns:
        tuple: A tuple containing a list of article dictionaries and a list of URLs.
    """
    # SERP API endpoint for Google News
    endpoint = "https://serpapi.com/search.json"
    
    # Parameters for the API request
    params = {
        "q": query,  # Search query
        "tbm": "nws",  # News tab
        "api_key": "dd1a6f46e79d6b769e725eeb073317239ae07c95b42a443169d964b8d7864227",  # Replace with your SERP API key
    }
    
    try:
        # Send the GET request to SERP API
        response = requests.get(endpoint, params=params)
        response.raise_for_status()  # Raise an error for bad HTTP response
        
        # Parse the JSON response
        data = response.json()
        articles = []
        urls = []
        
        # Extract article information
        for result in data.get("news_results", []):
            article = {
                "title": result.get("title"),
                "link": result.get("link"),
                "snippet": result.get("snippet"),
                "source": result.get("source"),
                "date": result.get("date"),
            }
            articles.append(article)
            urls.append(result.get("link"))
        
        return articles, urls
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Google News: {e}")
        return [], []  # Return empty lists for articles and urls in case of error

# Example usage
if __name__ == "__main__":
    query = "C2C Advanced Systems"
    articles, urls = fetch_google_news(query)
    
    # Print the article details
    for i, article in enumerate(articles, 1):
        print(f"Article {i}:")
        print(f"Title: {article['title']}")
        print(f"Link: {article['link']}")
        print(f"Source: {article['source']}")
        print(f"Date: {article['date']}")
        print(f"Snippet: {article['snippet']}")
        print("-" * 80)
    
    # Print all URLs
    print("All Article URLs:")
    for url in urls:
        print(url)


Article 1:
Title: C2C Advanced Systems IPOs GMP Indicates 2X Listing Gains Ahead Of Nov. 22 Opening
Link: https://www.ndtvprofit.com/ipos/c2c-advanced-systems-ipo-opens-on-nov-22-latest-gmp-indicates-2x-listing-gain-check-details
Source: NDTV Profit
Date: 1 day ago
Snippet: Ahead of its launch, the GMP of C2C Advanced Systems IPO indicated a 
listing gain of 99.56% on the upper end of the price band.
--------------------------------------------------------------------------------
Article 2:
Title: C2C Advanced Systems' IPO Launch: A Glimpse into Defence Innovation
Link: https://www.devdiscourse.com/article/technology/3164020-c2c-advanced-systems-ipo-launch-a-glimpse-into-defence-innovation
Source: Devdiscourse
Date: 7 hours ago
Snippet: C2C Advanced Systems, a defence solutions provider, is launching an IPO 
with a price band of Rs 214-226 per equity share.
--------------------------------------------------------------------------------
Article 3:
Title: C2C Advanced Systems IPO: Plans

In [9]:
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from typing import List, Union, Tuple

def fetch_and_process_urls(urls: List[str]) -> Tuple[List[Document], List[str]]:
    """
    Fetch webpage content and convert to Langchain Document format
    
    Args:
        urls (List[str]): List of URLs to fetch
    
    Returns:
        Tuple of:
        - List of Langchain Documents
        - List of error messages
    """
    documents = []
    errors = []

    for url in urls:
        try:
            # Fetch webpage content
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract main content (you might need to adjust this based on specific website structures)
            # This is a basic extraction - you may want to customize for specific sites
            main_content = soup.get_text(separator=' ', strip=True)

            # Create Langchain Document
            doc = Document(
                page_content=main_content,
                metadata={
                    "source": url,
                    "title": soup.title.string if soup.title else "No Title",
                }
            )
            
            documents.append(doc)

        except Exception as e:
            error_msg = f"Error fetching or processing {url}: {str(e)}"
            errors.append(error_msg)
            print(error_msg)

    return documents, errors

# Fetch and process documents
data, errors = fetch_and_process_urls(urls)

# Check and handle errors
if errors:
    for error in errors:
        print(error)
else:
    print(f"Successfully loaded {len(data)} documents.")

# If you want to use with further Langchain processing
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# texts = text_splitter.split_documents(data)

Error fetching or processing https://www.business-standard.com/markets/news/c2c-advanced-systems-ipo-zooms-100-ahead-of-launch-key-details-from-rhp-124111900759_1.html: 403 Client Error: Forbidden for url: https://www.business-standard.com/markets/news/c2c-advanced-systems-ipo-zooms-100-ahead-of-launch-key-details-from-rhp-124111900759_1.html
Error fetching or processing https://www.moneycontrol.com/news/business/ipo/defense-electronics-solutions-provider-c2c-advanced-systems-ipo-to-open-for-subscription-on-november-22-12867910.html: 403 Client Error: Forbidden for url: https://www.moneycontrol.com/news/business/ipo/defense-electronics-solutions-provider-c2c-advanced-systems-ipo-to-open-for-subscription-on-november-22-12867910.html
Error fetching or processing https://www.business-standard.com/markets/news/c2c-advanced-systems-ipo-zooms-100-ahead-of-launch-key-details-from-rhp-124111900759_1.html: 403 Client Error: Forbidden for url: https://www.business-standard.com/markets/news/c2c-a

In [10]:
data

[Document(metadata={'source': 'https://www.ndtvprofit.com/ipos/c2c-advanced-systems-ipo-opens-on-nov-22-latest-gmp-indicates-2x-listing-gain-check-details', 'title': "C2C Advanced Systems IPO's GMP Indicates 2X Listing Gains Ahead Of Nov. 22 Opening"}, page_content="C2C Advanced Systems IPO's GMP Indicates 2X Listing Gains Ahead Of Nov. 22 Opening View All Search Results English Hindi Menu English Hindi Hello Reader Sign In / Register LIVE TV Latest Trending Business Business News Personal Finance ESG Investing Opinion Davos Markets Markets News Financial Terms Earnings IPOs Mutual Funds Premium Exclusive Profit Insights Research Reports Bloomberg View Premium Stories Media Videos Podcasts Economy Economy News Global Economy Law & Policy Law & Policy News Legal Library GST Technology Technology News Cryptocurrency Elections Our Products Learning Edge Newsletters More from NDTV Profit Sports Pursuits Nation World Politics Web Stories More from NDTV Profit Sports Pursuits Nation World Po

### (2) Split data to create chunks

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [12]:
len(docs)

64

In [13]:
docs[0]

Document(metadata={'source': 'https://www.ndtvprofit.com/ipos/c2c-advanced-systems-ipo-opens-on-nov-22-latest-gmp-indicates-2x-listing-gain-check-details', 'title': "C2C Advanced Systems IPO's GMP Indicates 2X Listing Gains Ahead Of Nov. 22 Opening"}, page_content="C2C Advanced Systems IPO's GMP Indicates 2X Listing Gains Ahead Of Nov. 22 Opening View All Search Results English Hindi Menu English Hindi Hello Reader Sign In / Register LIVE TV Latest Trending Business Business News Personal Finance ESG Investing Opinion Davos Markets Markets News Financial Terms Earnings IPOs Mutual Funds Premium Exclusive Profit Insights Research Reports Bloomberg View Premium Stories Media Videos Podcasts Economy Economy News Global Economy Law & Policy Law & Policy News Legal Library GST Technology Technology News Cryptocurrency Elections Our Products Learning Edge Newsletters More from NDTV Profit Sports Pursuits Nation World Politics Web Stories More from NDTV Profit Sports Pursuits Nation World Pol

### (3) Create embeddings for these chunks and save them to FAISS index

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def get_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
embeddings = get_embeddings()
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [20]:
# Pass the documents and embeddings inorder to create FAISS vector index
vectorindex = FAISS.from_documents(docs, embeddings)
vectorindex

<langchain_community.vectorstores.faiss.FAISS at 0x22f5b835c90>

In [21]:
# Storing vector index create in local
file_path="vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex, f)

In [22]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)

### (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [23]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorIndex.as_retriever())
chain



In [24]:
query = "what is the size of c2c ipo"
# query = "what are the main features of punch iCNG?"

langchain.debug=True

chain({"question": query}, return_only_outputs=True)

  chain({"question": query}, return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the size of c2c ipo"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "a fresh issue of 43.84 lakh shares. The company is preparing to make its debut on NSE SME. Retail investors need to invest a minimum of Rs 135,600. HNI investors require a minimum investment of Rs 271,200 for 1,200 shares, which is 2 lots. Link Intime India Pvt Ltd is acting as the registrar for the issue. Mark Corporate Advisors Pvt Ltd and Beeline Capital Advisors Pvt Ltd are the book running lead managers in this IPO. The market maker for the same is Spread X Securities. C2C Systems IPO GMP D

Token indices sequence length is longer than the specified maximum sequence length for this model (2057 > 1024). Running this sequence through the model will result in indexing errors


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what is the size of c2c ipo",
  "summaries": "Content: The size of the C2C Advanced Systems IPO is not explicitly stated in the provided text. However, it can be calculated using the given information.\n\nThe price band for the IPO is Rs 214 to Rs 226 per share, and the lot size is 600 shares. Therefore, the IPO size can be calculated as follows:\n\nMinimum IPO size = Price band low end * Lot size\nMinimum IPO size = Rs 214 * 600\nMinimum IPO size = Rs 128,400\n\nMaximum IPO size = Price band high end * Lot size\nMaximum IPO size = Rs 226 * 600\nMaximum IPO size = Rs 135,600\n\nTherefore, the size of the C2C Advanced Systems IPO is between Rs 128,400 and Rs 135,600.\nSource: https://www.equitypandit.com/c2c-advanced-systems-ipo-gmp-today-lot-size-issue-date-financials/\n\nContent: The NSE SME issue comprises an entir

{'answer': 'The size of the C2C Advanced Systems IPO is approximately Rs 9.37 - 10.01 crore, based on the fresh issuance of 43.84 lakh shares and the price band of Rs 214 to Rs 226 per share. However, there is another source that indicates the size of the IPO is over Rs 99 crore, and yet another source stating that the size of the IPO is Rs 28.23 crore, which is the amount expected to be contributed by anchor investors. Due to the inconsistency in the sources, it is difficult to provide a definitive answer to the question.\n\n',
 'sources': ''}