In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Union
import re
import nltk
from urllib.parse import urljoin, urlparse
import warnings
import time
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

# Download all required NLTK data at startup
def download_nltk_data():
    """Download the NLTK resources this script relies on.

    Each package is fetched independently so that a failure on one does not
    prevent the others from being attempted. ``punkt_tab`` is requested in
    addition to ``punkt`` because recent NLTK releases load the sentence
    tokenizer from ``punkt_tab``; on older releases that request simply fails
    and is reported as a warning.

    Failures never propagate: they are printed as warnings, and sentence
    splitting elsewhere in this file has a regex fallback.
    """
    packages = (
        'punkt',
        'punkt_tab',
        'averaged_perceptron_tagger',
        'maxent_ne_chunker',
        'words',
    )
    for package in packages:
        try:
            # nltk.download signals most failures through a falsy return
            # value rather than an exception, so check it explicitly.
            if not nltk.download(package):
                print(f"Warning: NLTK download failed: {package}")
        except Exception as e:
            print(f"Warning: NLTK download failed: {str(e)}")

# Fetch the NLTK resources once, at import time, before any tokenization runs.
download_nltk_data()

# Suppress every Python warning for the rest of the session (this hides
# library deprecation notices as well as noise).
warnings.filterwarnings('ignore')

class WebsiteRAGSystem:
    """Crawl a website, index its text, and answer queries by similarity search.

    Typical use: ``crawl_website`` -> ``extract_content`` + ``process_content``
    for every crawled page -> ``create_embeddings`` -> ``get_answer``.

    State kept on the instance:
      * ``chunks`` / ``metadata`` - parallel lists; ``metadata[i]`` describes
        ``chunks[i]`` (source url, page title, chunk type).
      * ``embeddings`` / ``index`` - the encoded chunks and the FAISS L2 index
        built from them; both are ``None`` until ``create_embeddings`` runs.
      * ``visited_urls`` - pages fetched successfully by any earlier crawl.
    """

    # Display order and section label for each chunk type in format_response.
    _TYPE_ORDER = {'headings': 1, 'list': 2, 'text': 3, 'table': 4}
    _TYPE_LABELS = {
        'headings': 'Headings',
        'list': 'List items',
        'text': 'Content',
        'table': 'Table data',
    }

    def __init__(self):
        """Load the sentence-embedding model and start with an empty index."""
        self.embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        self.chunks = []
        self.embeddings = None
        self.index = None
        self.metadata = []
        self.visited_urls = set()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def simple_sentence_split(self, text: str) -> List[str]:
        """Fallback sentence splitter if NLTK fails.

        Splits on whitespace that follows '.', '!' or '?' and drops empty
        pieces, so an empty input yields an empty list.
        """
        pieces = re.split(r'(?<=[.!?])\s+', text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def get_sentences(self, text: str) -> List[str]:
        """Split text into sentences with NLTK, falling back to the regex splitter."""
        try:
            return nltk.sent_tokenize(text)
        except Exception:
            # Deliberately broad: any tokenizer problem (for example missing
            # NLTK data) should degrade to the simple splitter, not abort.
            return self.simple_sentence_split(text)

    @staticmethod
    def _resolve_link(page_url: str, href: str) -> str:
        """Return href as an absolute URL relative to page_url, without '#fragment'."""
        absolute = urljoin(page_url, href)
        return urlparse(absolute)._replace(fragment='').geturl()

    @staticmethod
    def _is_in_scope(candidate: str, base_url: str) -> bool:
        """Return True if candidate lies under base_url on the same host.

        The prefix test alone would accept ``https://site.org.evil.com`` for a
        base of ``https://site.org``, so the host part is compared as well.
        """
        return (candidate.startswith(base_url)
                and urlparse(candidate).netloc == urlparse(base_url).netloc)

    def crawl_website(self, base_url: str, max_pages: int = 10,
                      delay: float = 1.0) -> List[str]:
        """Breadth-first crawl starting at base_url.

        Args:
            base_url: Start page; only links that begin with this URL and share
                its host are followed.
            max_pages: Stop after this many pages were fetched successfully.
            delay: Seconds to wait after each request that completed.

        Returns:
            The URLs that answered with HTTP 200, in crawl order. URLs already
            in ``self.visited_urls`` are skipped and not returned again.
        """
        from collections import deque

        pending = deque([base_url])
        queued = {base_url}  # everything ever enqueued during this crawl
        crawled_urls = []

        while pending and len(crawled_urls) < max_pages:
            url = pending.popleft()
            if url in self.visited_urls:
                continue

            try:
                response = requests.get(url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    self.visited_urls.add(url)
                    crawled_urls.append(url)

                    for link in soup.find_all('a', href=True):
                        # Relative links are relative to the page they appear
                        # on, not to the crawl's start URL.
                        new_url = self._resolve_link(url, link['href'])
                        if (self._is_in_scope(new_url, base_url)
                                and new_url not in self.visited_urls
                                and new_url not in queued):
                            queued.add(new_url)
                            pending.append(new_url)

                if delay > 0:
                    time.sleep(delay)
            except Exception as e:
                # Best-effort crawl: report the page and move on.
                print(f"Error crawling {url}: {str(e)}")

        return crawled_urls

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Removes every character that is not a word character, whitespace or
        one of ``. , ! ? -``, then collapses whitespace runs to one space.
        Symbols are stripped first so that removing one between two spaces
        cannot leave a double space. ``None`` or empty input returns ''.
        """
        if not text:
            return ''
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_content(self, url: str) -> Union[Dict, None]:
        """Fetch url and pull out its title, description, text, tables, lists and headings.

        Returns:
            A dict with the keys 'url', 'title', 'meta_description',
            'text_content', 'tables', 'lists' and 'headings', or ``None`` when
            the response is not HTTP 200 or anything fails along the way.
        """
        from io import StringIO

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup.find_all(['script', 'style', 'nav', 'footer']):
                element.decompose()

            # Find and extract tables
            tables = []
            for table in soup.find_all('table'):
                try:
                    # Wrap the markup in StringIO: passing literal HTML to
                    # read_html is deprecated in recent pandas versions.
                    df = pd.read_html(StringIO(str(table)))[0]
                    tables.append(df.to_dict())
                except Exception:
                    # A table pandas cannot parse is skipped, not fatal.
                    continue

            # Extract lists
            lists = []
            for list_elem in soup.find_all(['ul', 'ol']):
                items = [self.clean_text(item.get_text()) for item in list_elem.find_all('li')]
                if items:
                    lists.append(items)

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text:
                    headings.append(heading_text)

            # Extract main content
            text_content = self.clean_text(soup.get_text(separator=' '))

            # Extract metadata. get_text() is used instead of .string because
            # .string is None when <title> is empty or contains nested tags.
            title = self.clean_text(soup.title.get_text() if soup.title else '')

            meta_description = ''
            meta_description_tag = soup.find('meta', attrs={'name': 'description'})
            if meta_description_tag:
                meta_description = self.clean_text(meta_description_tag.get('content', ''))

            return {
                'url': url,
                'title': title,
                'meta_description': meta_description,
                'text_content': text_content,
                'tables': tables,
                'lists': lists,
                'headings': headings
            }

        except Exception as e:
            print(f"Error extracting content from {url}: {str(e)}")
            return None

    def _add_chunk(self, text: str, content: Dict, chunk_type: str) -> None:
        """Append one chunk and its metadata entry; blank text is ignored."""
        if not text or not text.strip():
            return
        self.chunks.append(text)
        self.metadata.append({
            'url': content['url'],
            'title': content['title'],
            'type': chunk_type
        })

    def process_content(self, content: Dict, chunk_size: int = 1000):
        """Turn one extracted page into chunks appended to self.chunks/self.metadata.

        Chunks are added in this order: all headings joined into one chunk,
        body text grouped sentence by sentence until a group reaches
        ``chunk_size`` characters, one chunk per table, one chunk per list.

        Args:
            content: Dict as returned by ``extract_content``; a falsy value is
                ignored.
            chunk_size: Minimum number of characters that closes a text chunk.
        """
        if not content:
            return

        # Process headings and metadata first
        self._add_chunk(' '.join(content['headings']), content, 'headings')

        # Process main text content
        current_chunk = []
        current_size = 0
        for sentence in self.get_sentences(content['text_content']):
            current_chunk.append(sentence)
            current_size += len(sentence)
            if current_size >= chunk_size:
                self._add_chunk(' '.join(current_chunk), content, 'text')
                current_chunk = []
                current_size = 0
        # Whatever is left over forms the final, shorter chunk.
        self._add_chunk(' '.join(current_chunk), content, 'text')

        # Process tables
        for table in content['tables']:
            self._add_chunk(str(table), content, 'table')

        # Process lists (items that cleaned down to nothing are dropped)
        for list_items in content['lists']:
            kept_items = [item for item in list_items if item]
            self._add_chunk('. '.join(kept_items), content, 'list')

    def create_embeddings(self):
        """Encode every chunk and rebuild the FAISS L2 index from scratch.

        Does nothing when there are no chunks, leaving any existing index
        untouched.
        """
        if not self.chunks:
            return

        self.embeddings = self.embed_model.encode(self.chunks)
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.embeddings.astype('float32'))

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Return up to k chunks nearest to query, closest first.

        Each result is a dict with 'chunk', 'metadata' and 'score' (the
        distance reported by the index; smaller means closer). Returns an
        empty list when nothing has been indexed yet.
        """
        if self.index is None or not self.chunks:
            return []

        # Never ask for more neighbours than there are stored vectors.
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        query_embedding = self.embed_model.encode([query])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if idx < 0:
                # FAISS pads missing neighbours with -1; indexing with that
                # would silently return the last chunk.
                continue
            results.append({
                'chunk': self.chunks[idx],
                'metadata': self.metadata[idx],
                'score': float(distance)
            })
        return results

    def format_response(self, query: str, relevant_chunks: List[Dict]) -> str:
        """Render search results as text, grouped by source URL.

        Within each URL the chunks appear as headings, lists, text, tables (in
        that order); chunks of any other type are omitted. The title shown for
        a URL is taken from its first chunk in ``relevant_chunks``.
        """
        parts = [f"Query: {query}\n\nRelevant information:\n\n"]

        # Group chunks by URL, keeping first-seen URL order
        url_chunks = {}
        for chunk in relevant_chunks:
            url_chunks.setdefault(chunk['metadata']['url'], []).append(chunk)

        # Format response by URL
        for url, chunks in url_chunks.items():
            parts.append(f"From {url}:\n")
            parts.append(f"Title: {chunks[0]['metadata']['title']}\n\n")

            # Stable sort: chunks of the same type keep their relevance order.
            ordered = sorted(
                chunks,
                key=lambda item: self._TYPE_ORDER.get(item['metadata']['type'], 999),
            )
            for chunk in ordered:
                label = self._TYPE_LABELS.get(chunk['metadata']['type'])
                if label is not None:
                    parts.append(f"{label}:\n{chunk['chunk']}\n\n")

            parts.append("-" * 80 + "\n")

        return ''.join(parts)

    def get_answer(self, query: str) -> str:
        """Search for query and return the formatted results, or a not-found message."""
        relevant_chunks = self.search(query)
        if not relevant_chunks:
            return "I couldn't find relevant information to answer your question."

        return self.format_response(query, relevant_chunks)

def main():
    """Run the interactive console session.

    Repeatedly asks for a website, crawls and indexes it with a single
    ``WebsiteRAGSystem`` instance, then answers questions until the user types
    'new' (pick another website) or 'quit' (leave). Blank input is ignored.
    Any error while processing a website is reported and the URL prompt is
    shown again.
    """
    print("Welcome to the Website RAG System!")
    print("This system will help you extract and query information from websites.")

    rag_system = WebsiteRAGSystem()

    while True:
        print("\nEnter a website URL to analyze (or 'quit' to exit):")
        url = input().strip()

        if url.lower() == 'quit':
            break
        if not url:
            continue

        # Bare host names are assumed to be reachable over HTTPS.
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            print(f"\nCrawling {url}...")
            crawled_urls = rag_system.crawl_website(url)
            print(f"Found {len(crawled_urls)} pages")
            if not crawled_urls:
                print("Please try a different website or check your internet connection.")
                continue

            print("Extracting and processing content...")
            # A separate name keeps the user-entered url from being overwritten.
            for page_url in crawled_urls:
                content = rag_system.extract_content(page_url)
                if content:
                    rag_system.process_content(content)

            print("Creating embeddings...")
            rag_system.create_embeddings()
            if rag_system.index is None:
                # Nothing was indexed, so there is nothing to query.
                print("No text could be extracted from the website.")
                print("Please try a different website or check your internet connection.")
                continue

            print("\nYou can now ask questions about the website!")
            while True:
                print("\nEnter your question (or 'new' for new website, 'quit' to exit):")
                query = input().strip()

                if query.lower() == 'quit':
                    return
                if query.lower() == 'new':
                    break
                if not query:
                    continue

                answer = rag_system.get_answer(query)
                print("\nAnswer:", answer)

        except Exception as e:
            print(f"Error processing website: {str(e)}")
            print("Please try a different website or check your internet connection.")

# Start the interactive session only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C (or a kernel interrupt) ends the session with a friendly message.
        print("\nProgram terminated by user. Goodbye!")
    except Exception as e:
        # Last-resort handler: report the error instead of showing a traceback.
        print(f"\nAn unexpected error occurred: {str(e)}")


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ursti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ursti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ursti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ursti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


Welcome to the Website RAG System!
This system will help you extract and query information from websites.

Enter a website URL to analyze (or 'quit' to exit):
hitam.org

Crawling https://hitam.org...
Found 10 pages
Extracting and processing content...
Creating embeddings...

You can now ask questions about the website!

Enter your question (or 'new' for new website, 'quit' to exit):
what is the website used for

Answer: Query: what is the website used for

Relevant information:

From https://hitam.org/international-partnerships/:
Title: INTERNATIONAL PARTNERSHIPS  Hyderabad Institute of Technology and Management

List items:
It builds the academic strengths of the students with an emphasis on hands-on learning and research collaborations.. It helps to approach an interdisciplinary education allowing students to explore diverse fields and broaden their perspectives.. It equips students with relevant skills to align with industry demands.. It offers workshops, seminars, and conferences, 

In [None]:
# import requests
# from bs4 import BeautifulSoup
# import numpy as np
# from sentence_transformers import SentenceTransformer
# import faiss
# from typing import List, Dict, Union
# import re
# import nltk
# from urllib.parse import urljoin, urlparse
# import warnings
# import time
# from concurrent.futures import ThreadPoolExecutor
# import pandas as pd

# # Download required NLTK data
# try:
#     nltk.data.find('tokenizers/punkt')
# except LookupError:
#     nltk.download('punkt')

# warnings.filterwarnings('ignore')

# class WebsiteRAGSystem:
#     def __init__(self):
#         self.embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
#         self.chunks = []
#         self.embeddings = None
#         self.index = None
#         self.metadata = []
#         self.visited_urls = set()
#         self.headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
#         }

#     def crawl_website(self, base_url: str, max_pages: int = 10) -> List[str]:
#         urls_to_visit = [base_url]
#         crawled_urls = []
        
#         while urls_to_visit and len(crawled_urls) < max_pages:
#             url = urls_to_visit.pop(0)
#             if url in self.visited_urls:
#                 continue
                
#             try:
#                 response = requests.get(url, headers=self.headers, timeout=10)
#                 if response.status_code == 200:
#                     soup = BeautifulSoup(response.text, 'html.parser')
#                     self.visited_urls.add(url)
#                     crawled_urls.append(url)
                    
#                     for link in soup.find_all('a', href=True):
#                         new_url = urljoin(base_url, link['href'])
#                         if (new_url.startswith(base_url) and 
#                             new_url not in self.visited_urls and 
#                             new_url not in urls_to_visit):
#                             urls_to_visit.append(new_url)
                            
#                 time.sleep(1)
#             except Exception as e:
#                 print(f"Error crawling {url}: {str(e)}")
                
#         return crawled_urls

#     def extract_content(self, url: str) -> Dict:
#         try:
#             response = requests.get(url, headers=self.headers, timeout=10)
#             if response.status_code != 200:
#                 return None
                
#             soup = BeautifulSoup(response.text, 'html.parser')
            
#             # Remove unwanted elements
#             for element in soup.find_all(['script', 'style', 'nav', 'footer']):
#                 element.decompose()
                
#             # Extract structured data
#             structured_data = {}
            
#             # Find and extract tables
#             tables = []
#             for table in soup.find_all('table'):
#                 try:
#                     df = pd.read_html(str(table))[0]
#                     tables.append(df.to_dict())
#                 except:
#                     continue
            
#             # Extract lists
#             lists = []
#             for list_elem in soup.find_all(['ul', 'ol']):
#                 items = [item.get_text(strip=True) for item in list_elem.find_all('li')]
#                 lists.append(items)
            
#             # Extract headings
#             headings = []
#             for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
#                 headings.append(heading.get_text(strip=True))
            
#             # Extract main content
#             text_content = soup.get_text(separator=' ', strip=True)
            
#             # Extract metadata
#             title = soup.title.string if soup.title else ''
#             meta_description = ''
#             meta_description_tag = soup.find('meta', attrs={'name': 'description'})
#             if meta_description_tag:
#                 meta_description = meta_description_tag.get('content', '')
                
#             return {
#                 'url': url,
#                 'title': title,
#                 'meta_description': meta_description,
#                 'text_content': text_content,
#                 'tables': tables,
#                 'lists': lists,
#                 'headings': headings
#             }
            
#         except Exception as e:
#             print(f"Error extracting content from {url}: {str(e)}")
#             return None

#     def process_content(self, content: Dict, chunk_size: int = 1000):
#         if not content:
#             return
            
#         # Process headings and metadata first
#         if content['headings']:
#             heading_text = ' '.join(content['headings'])
#             self.chunks.append(heading_text)
#             self.metadata.append({
#                 'url': content['url'],
#                 'title': content['title'],
#                 'type': 'headings'
#             })
            
#         # Process main text content
#         text = content['text_content']
#         sentences = nltk.sent_tokenize(text)
        
#         current_chunk = []
#         current_size = 0
        
#         for sentence in sentences:
#             current_chunk.append(sentence)
#             current_size += len(sentence)
            
#             if current_size >= chunk_size:
#                 chunk_text = ' '.join(current_chunk)
#                 self.chunks.append(chunk_text)
#                 self.metadata.append({
#                     'url': content['url'],
#                     'title': content['title'],
#                     'type': 'text'
#                 })
#                 current_chunk = []
#                 current_size = 0
                
#         if current_chunk:
#             chunk_text = ' '.join(current_chunk)
#             self.chunks.append(chunk_text)
#             self.metadata.append({
#                 'url': content['url'],
#                 'title': content['title'],
#                 'type': 'text'
#             })
            
#         # Process tables
#         for table in content['tables']:
#             table_str = str(table)
#             self.chunks.append(table_str)
#             self.metadata.append({
#                 'url': content['url'],
#                 'title': content['title'],
#                 'type': 'table'
#             })
            
#         # Process lists
#         for list_items in content['lists']:
#             list_str = '. '.join(list_items)
#             self.chunks.append(list_str)
#             self.metadata.append({
#                 'url': content['url'],
#                 'title': content['title'],
#                 'type': 'list'
#             })

#     def create_embeddings(self):
#         if not self.chunks:
#             return
            
#         self.embeddings = self.embed_model.encode(self.chunks)
#         dimension = self.embeddings.shape[1]
#         self.index = faiss.IndexFlatL2(dimension)
#         self.index.add(self.embeddings.astype('float32'))

#     def search(self, query: str, k: int = 5) -> List[Dict]:
#         query_embedding = self.embed_model.encode([query])
#         distances, indices = self.index.search(query_embedding.astype('float32'), k)
        
#         results = []
#         for i, idx in enumerate(indices[0]):
#             results.append({
#                 'chunk': self.chunks[idx],
#                 'metadata': self.metadata[idx],
#                 'score': float(distances[0][i])
#             })
#         return results

#     def format_response(self, query: str, relevant_chunks: List[Dict]) -> str:
#         response = f"Query: {query}\n\nRelevant information:\n\n"
        
#         # Group chunks by URL
#         url_chunks = {}
#         for chunk in relevant_chunks:
#             url = chunk['metadata']['url']
#             if url not in url_chunks:
#                 url_chunks[url] = []
#             url_chunks[url].append(chunk)
        
#         # Format response by URL
#         for url, chunks in url_chunks.items():
#             response += f"From {url}:\n"
#             response += f"Title: {chunks[0]['metadata']['title']}\n\n"
            
#             # Sort chunks by type (headings first, then lists, then text, then tables)
#             type_order = {'headings': 1, 'list': 2, 'text': 3, 'table': 4}
#             chunks.sort(key=lambda x: type_order.get(x['metadata']['type'], 999))
            
#             for chunk in chunks:
#                 chunk_type = chunk['metadata']['type']
#                 content = chunk['chunk']
                
#                 if chunk_type == 'headings':
#                     response += f"Headings:\n{content}\n\n"
#                 elif chunk_type == 'list':
#                     response += f"List items:\n{content}\n\n"
#                 elif chunk_type == 'text':
#                     response += f"Content:\n{content}\n\n"
#                 elif chunk_type == 'table':
#                     response += f"Table data:\n{content}\n\n"
                    
#             response += "-" * 80 + "\n"
            
#         return response

#     def get_answer(self, query: str) -> str:
#         relevant_chunks = self.search(query)
#         if not relevant_chunks:
#             return "I couldn't find relevant information to answer your question."
            
#         return self.format_response(query, relevant_chunks)

# def main():
#     print("Welcome to the Website RAG System!")
#     print("This system will help you extract and query information from websites.")
    
#     rag_system = WebsiteRAGSystem()
    
#     while True:
#         print("\nEnter a website URL to analyze (or 'quit' to exit):")
#         url = input().strip()
        
#         if url.lower() == 'quit':
#             break
            
#         if not url.startswith(('http://', 'https://')):
#             url = 'https://' + url
            
#         try:
#             print(f"\nCrawling {url}...")
#             crawled_urls = rag_system.crawl_website(url)
#             print(f"Found {len(crawled_urls)} pages")
            
#             print("Extracting and processing content...")
#             for url in crawled_urls:
#                 content = rag_system.extract_content(url)
#                 if content:
#                     rag_system.process_content(content)
                    
#             print("Creating embeddings...")
#             rag_system.create_embeddings()
            
#             print("\nYou can now ask questions about the website!")
#             while True:
#                 print("\nEnter your question (or 'new' for new website, 'quit' to exit):")
#                 query = input().strip()
                
#                 if query.lower() == 'quit':
#                     return
#                 if query.lower() == 'new':
#                     break
                    
#                 answer = rag_system.get_answer(query)
#                 print("\nAnswer:", answer)
                
#         except Exception as e:
#             print(f"Error processing website: {str(e)}")
#             print("Please try a different website or check your internet connection.")

# if __name__ == "__main__":
#     try:
#         main()
#     except KeyboardInterrupt:
#         print("\nProgram terminated by user. Goodbye!")
#     except Exception as e:
#         print(f"\nAn unexpected error occurred: {str(e)}")
