In [1]:
import sys
import langchain
import json

### Get data from the target website

In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class CustomScraper:
    def __init__(self, base_url, storage_dir="./pages"):
        self.base_url = base_url
        self.storage_dir = storage_dir
        self.ensure_storage_dir_exists()

    def ensure_storage_dir_exists(self):
        """Ensures the storage directory exists."""
        if not os.path.exists(self.storage_dir):
            os.mkdir(self.storage_dir)

    def format_url_for_saving(self, url):
        """Adjust URL to a safe filename format."""
        clean_url = url.replace("https://", "").replace("http://", "")
        forbidden_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
        for char in forbidden_chars:
            clean_url = clean_url.replace(char, "_")
        return clean_url

    def fetch_page_and_store(self, url, filename=None):
        """Fetch a page content and store it locally."""
        try:
            result = requests.get(url)
            if filename is None:
                filename = self.format_url_for_saving(url)
            filepath = os.path.join(self.storage_dir, f"{filename}.html")
            with open(filepath, "wb") as file:
                file.write(result.content)
            print(f"Downloaded: {url} -> {filepath}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")

    def scrape_and_download_links(self):
        """Scrape all links on the base page and download their content."""
        try:
            response = requests.get(self.base_url)
            soup = BeautifulSoup(response.content, "html.parser")
            links = soup.find_all('a')
            
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    full_url = urljoin(self.base_url, href)
                    self.fetch_page_and_store(full_url)
        except requests.exceptions.RequestException as e:
            print(f"Failed to access {self.base_url}: {e}")

# Example usage
if __name__ == "__main__":
    base_url1 = "http://support.apexsystemsinc.com/kb/faq.php?cid=1"
    scraper = CustomScraper(base_url1)
    scraper.scrape_and_download_links()

    base_url2 = "http://support.apexsystemsinc.com/kb/faq.php?cid=2"
    scraper = CustomScraper(base_url2)
    scraper.scrape_and_download_links()



Downloaded: http://support.apexsystemsinc.com/index.php -> ./pages/support.apexsystemsinc.com_index.php.html
Downloaded: http://support.apexsystemsinc.com/index.php -> ./pages/support.apexsystemsinc.com_index.php.html
Downloaded: http://support.apexsystemsinc.com/kb/index.php -> ./pages/support.apexsystemsinc.com_kb_index.php.html
Downloaded: http://support.apexsystemsinc.com/open.php -> ./pages/support.apexsystemsinc.com_open.php.html
Downloaded: http://support.apexsystemsinc.com/view.php -> ./pages/support.apexsystemsinc.com_view.php.html
Downloaded: http://support.apexsystemsinc.com/kb/faq.php?id=1 -> ./pages/support.apexsystemsinc.com_kb_faq.php_id=1.html
Downloaded: http://support.apexsystemsinc.com/kb/faq.php?id=11 -> ./pages/support.apexsystemsinc.com_kb_faq.php_id=11.html
Downloaded: http://support.apexsystemsinc.com/kb/faq.php?id=12 -> ./pages/support.apexsystemsinc.com_kb_faq.php_id=12.html
Downloaded: http://support.apexsystemsinc.com/kb/faq.php?id=13 -> ./pages/support.apex

In [3]:
import os
from bs4 import BeautifulSoup
import pandas as pd

class DataExtractor:
    def __init__(self, storage_dir):
        self.storage_dir = storage_dir

    def _extract_link_from_filename(self, filename):
        # Reconstruct the original link from the saved filename
        base_url = "http://support.apexsystemsinc.com/"
        parts = filename.split('_')
        reconstructed_path = '/'.join(parts[1:]).replace('.html', '').replace('_', '=').replace('-', '&')
        return base_url + reconstructed_path

    def _parse_html_file(self, filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            title_text =''
            info_text=''
            soup = BeautifulSoup(file, 'html.parser')
            
            # Extracting the article title
            title = soup.find('div', class_='article-title')
            info = soup.find('div', class_='thread-body')
            if title and info:
                title_text = title.get_text(strip=True)
                info_text = ' '.join(info.stripped_strings)
            
            return title_text, info_text

    def extract_data_to_dataframe(self):
        data = []
        for filename in os.listdir(self.storage_dir):
            if not filename.endswith(".html"):
                continue
                
            filepath = os.path.join(self.storage_dir, filename)
            title, info = self._parse_html_file(filepath)
            if info == '':
                continue
            link = self._extract_link_from_filename(filename)
            
            data.append({
                'Article Title': title,
                'Information': info,
                'Link': link
            })
        
        return pd.DataFrame(data)

# Usage
storage_dir = './pages'  # Update this path to where your HTML files are stored
extractor = DataExtractor(storage_dir)
df = extractor.extract_data_to_dataframe()
print(df)

                                        Article Title  \
0                                Create an TA Account   
1                                  Unauthorized Error   
2             Entering Expenses & Submitting Reciepts   
3                                      Direct Deposit   
4                                   Time Portal Links   
5                           Correct Contractors Hours   
6                            My Apex Mobile App Guide   
7   Creating a My Apex Account/ Already Registered...   
8                                      Updating Email   
9                                   Zero Hours Worked   
10                          Setting Up New Primary TA   
11  Updating Timecard (Accidental Submittal, Corre...   
12                      Updating Personal Information   
13              Setting up an Alternate Time Approver   
14                                          W4 Update   
15                         Unauthorized Error Message   
16                            C

In [4]:
from langchain_community.document_loaders import DataFrameLoader
loader = DataFrameLoader(df, page_content_column="Information")

In [5]:
documents = loader.load()

In [6]:
from langchain import OpenAI
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

from dotenv import load_dotenv, find_dotenv



In [7]:
load_dotenv(find_dotenv())
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

  warn_deprecated(


In [8]:
import pinecone
pc = pinecone.Pinecone(api_key=os.getenv('PINECONE_API_KEY'),environment=os.getenv('PINECONE_ENV'))
index_name = "apex"

  from tqdm.autonotebook import tqdm


In [9]:
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [10]:
# initialize the LLM
llm = OpenAI(model_name="gpt-4", temperature=0)
# the non-streaming LLM for questions
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)



In [11]:
# astreaming llm for the docs
streaming_llm = OpenAI(
    streaming=True, 
    callback_manager=CallbackManager([
        StreamingStdOutCallbackHandler()
    ]), 
    verbose=True,
    temperature=0
)
doc_chain = load_qa_chain(streaming_llm, chain_type="stuff", prompt=QA_PROMPT)

# initialize ConversationalRetrievalChain chabot
qa = ConversationalRetrievalChain(
    retriever=docsearch.as_retriever(), combine_docs_chain=doc_chain, question_generator=question_generator)

  warn_deprecated(


In [14]:
# create an array to store the chat history.
chat_history = []
question = input("Hi! Ask me a question about Apex FAQ. ")

# create a loop to ask the chatbot questions 
while True:
    result = qa(
        {"question": question, "chat_history": chat_history}
    )
    print("\n")
    chat_history.append((result["question"], result["answer"]))
    question = input()

 To get your W2 online, please follow the steps outlined in the context provided. If you need assistance with updating your W4, please log into your My Apex Time Management account and follow the instructions provided. If you have any further questions or need additional help, please reach out to Employee Services at [email protected] or call 866 923 2739.