In [1]:
### Lainchain + Llama 3.1


In [3]:
import argparse

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embed and store
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings import OllamaEmbeddings # We can also try Ollama embeddings

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests

### Retreive Article Links Per Industry-Sub-industry

In [6]:
# Get Relevant Links for Each Indsutry
def get_deloitte_url(industry, sub_industry=None):
    urls = {
        "Energy, Resources & Industrials": {
            "focus_link": "https://www.deloitte.com/global/en/Industries/energy-resources-industrials.html",
            "Mining & Metals": [
                "https://www.deloitte.com/global/en/Industries/mining-metals/about.html?icid=ln_energy,resources-industrials_about",
                "https://www.deloitte.com/global/en/Industries/mining-metals.html"
            ],
            "Industrial Construction": [
                "https://www.deloitte.com/global/en/Industries/industrial-construction/about.html?icid=ln_energy,resources-industrials_about",
                "https://www.deloitte.com/global/en/Industries/industrial-construction.html"
            ],
            "Energy & Chemicals": [
                "https://www.deloitte.com/global/en/Industries/energy-chemicals/about.html?icid=ln_energy,resources-industrials_about",
                "https://www.deloitte.com/global/en/Industries/energy-chemicals.html"
            ],
            "Power, Utilities & Renewables": [
                "https://www.deloitte.com/global/en/Industries/power-utilities-renewables/about.html?icid=ln_energy,resources-industrials_about",
                "https://www.deloitte.com/global/en/Industries/power-utilities-renewables.html"
            ]
        },
        "Financial Services": {
            "focus_link": "https://www.deloitte.com/global/en/Industries/financial-services.html",
            "Banking & Capital Markets": [
                "https://www.deloitte.com/global/en/Industries/banking-capital-markets/about.html?icid=ln_financialservices_about",
                "https://www.deloitte.com/global/en/Industries/banking-capital-markets.html"
            ],
            "Insurance": [
                "https://www.deloitte.com/global/en/Industries/insurance/about.html?icid=ln_financialservices_about",
                "https://www.deloitte.com/global/en/Industries/insurance.html"
            ],
            "Investment Management": [
                "https://www.deloitte.com/global/en/Industries/investment-management/about.html?icid=ln_financialservices_about",
                "https://www.deloitte.com/global/en/Industries/investment-management.html"
            ],
            "Real Estate": [
                "https://www.deloitte.com/global/en/Industries/real-estate/about.html?icid=ln_financialservices_about",
                "https://www.deloitte.com/global/en/Industries/real-estate.html"
            ]
        },
        "Government and Public Services": {
            "focus_link": "https://www.deloitte.com/global/en/Industries/government-public-services.html",
            "Central Government": [
                "https://www.deloitte.com/global/en/Industries/central-government/about.html?icid=ln_government-publicservices_about",
                "https://www.deloitte.com/global/en/Industries/central-government.html"
            ],
            "Defense, Security & Justice": [
                "https://www.deloitte.com/global/en/Industries/defense-security-justice/about.html?icid=ln_government-publicservices_about",
                "https://www.deloitte.com/global/en/Industries/defense-security-justice.html"
            ],
            "Health & Human Services": [
                "https://www.deloitte.com/global/en/Industries/health-human-services/about.html?icid=ln_government-publicservices_about",
                "https://www.deloitte.com/global/en/Industries/health-human-services.html"
            ],
            "Infrastructure": [
                "https://www.deloitte.com/global/en/Industries/infrastructure/about.html?icid=ln_government-publicservices_about",
                "https://www.deloitte.com/global/en/Industries/infrastructure.html"
            ]
        },
        "Life Sciences & Healthcare": {
            "focus_link": "https://www.deloitte.com/global/en/Industries/life-sciences-healthcare.html",
            "Health Care": [
                "https://www.deloitte.com/global/en/Industries/health-care/about.html?icid=ln_lifesciences-healthcare_about",
                "https://www.deloitte.com/global/en/Industries/health-care.html"
            ],
            "Life Sciences": [
                "https://www.deloitte.com/global/en/Industries/life-sciences/about.html?icid=ln_lifesciences-healthcare_about",
                "https://www.deloitte.com/global/en/Industries/life-sciences.html"
            ]
        },
        "Technology, Media, & Telecommunications": {
            "focus_link": "https://www.deloitte.com/global/en/Industries/technology-media-telecommunications.html",
            "Technology": [
                "https://www.deloitte.com/global/en/Industries/technology/about.html?icid=ln_technology,media-telecommunications_about",
                "https://www.deloitte.com/global/en/Industries/technology.html"
            ],
            "Telecom, Media & Entertainment": [
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/about.html?icid=ln_technology,media-telecommunications_about",
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment.html"
            ],
            "Sports Hub": [
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/sports-hub.html?icid=ln_technology,media-telecommunications_sports-hub",
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/sports-hub.html"
            ],
            "Semiconductor": [
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/semiconductor.html?icid=ln_technology,media-telecommunications_semiconductor",
                "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/semiconductor.html"
            ]
        }
    }

    try:
        if sub_industry:
            return urls[industry][sub_industry]
        else:
            return urls[industry]["focus_link"]
    except KeyError:
        return "Invalid industry or sub-industry. Please check your inputs."
    




In [14]:
## Remove other sub-industry links than the one selected
def remove_sub_links(article_links, industry):
    main_sub_links = {
    "Energy, Resources & Industrials": {
        "Mining & Metals": "https://www.deloitte.com/global/en/Industries/mining-metals.html",
        "Industrial Construction": "https://www.deloitte.com/global/en/Industries/industrial-construction.html",
        "Energy & Chemicals": "https://www.deloitte.com/global/en/Industries/energy-chemicals.html",
        "Power, Utilities & Renewables": "https://www.deloitte.com/global/en/Industries/power-utilities-renewables.html"
    },
    "Financial Services": {
        "Banking & Capital Markets": "https://www.deloitte.com/global/en/Industries/banking-capital-markets.html",
        "Insurance": "https://www.deloitte.com/global/en/Industries/insurance.html",
        "Investment Management": "https://www.deloitte.com/global/en/Industries/investment-management.html",
        "Real Estate": "https://www.deloitte.com/global/en/Industries/real-estate.html"
    },
    "Government and Public Services": {
        "Central Government": "https://www.deloitte.com/global/en/Industries/central-government.html",
        "Defense, Security & Justice": "https://www.deloitte.com/global/en/Industries/defense-security-justice.html",
        "Health & Human Services": "https://www.deloitte.com/global/en/Industries/health-human-services.html",
        "Infrastructure": "https://www.deloitte.com/global/en/Industries/infrastructure.html"
    },
    "Life Sciences & Healthcare": {
        "Health Care": "https://www.deloitte.com/global/en/Industries/health-care.html",
        "Life Sciences": "https://www.deloitte.com/global/en/Industries/life-sciences.html"
    },
    "Technology, Media, & Telecommunications": {
        "Technology": "https://www.deloitte.com/global/en/Industries/technology.html",
        "Telecom, Media & Entertainment": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment.html",
        "Sports Hub": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/sports-hub.html",
        "Semiconductor": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/semiconductor.html"
    }
    }


    about_links = {
    "Energy, Resources & Industrials": {
        "Mining & Metals": "https://www.deloitte.com/global/en/Industries/mining-metals/about.html",
        "Industrial Construction": "https://www.deloitte.com/global/en/Industries/industrial-construction/about.html",
        "Energy & Chemicals": "https://www.deloitte.com/global/en/Industries/energy-chemicals/about.html",
        "Power, Utilities & Renewables": "https://www.deloitte.com/global/en/Industries/power-utilities-renewables/about.html"
    },
    "Financial Services": {
        "Banking & Capital Markets": "https://www.deloitte.com/global/en/Industries/banking-capital-markets/about.html",
        "Insurance": "https://www.deloitte.com/global/en/Industries/insurance.html",
        "Investment Management": "https://www.deloitte.com/global/en/Industries/investment-management/about.html",
        "Real Estate": "https://www.deloitte.com/global/en/Industries/real-estate/about.html"
    },
    "Government and Public Services": {
        "Central Government": "https://www.deloitte.com/global/en/Industries/central-government/about.html",
        "Defense, Security & Justice": "https://www.deloitte.com/global/en/Industries/defense-security-justice/about.html",
        "Health & Human Services": "https://www.deloitte.com/global/en/Industries/health-human-services/about.html",
        "Infrastructure": "https://www.deloitte.com/global/en/Industries/infrastructure/about.html"
    },
    "Life Sciences & Healthcare": {
        "Health Care": "https://www.deloitte.com/global/en/Industries/health-care/about.html",
        "Life Sciences": "https://www.deloitte.com/global/en/Industries/life-sciences/about.html"
    },
    "Technology, Media, & Telecommunications": {
        "Technology": "https://www.deloitte.com/global/en/Industries/technology/about.html",
        "Telecom, Media & Entertainment": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/about.html",
        "Sports Hub": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/sports-hub/about.html",
        "Semiconductor": "https://www.deloitte.com/global/en/Industries/telecom-media-entertainment/collections/semiconductor/about.html"
    }
    }

    sub_about_links = about_links[industry].values()
    sub_industry_links = main_sub_links[industry].values()

    unwanted_links_set = set(list(sub_industry_links) + list(sub_about_links))
    article_links_set = set(article_links)

    # Remove unwanted links using set difference
    filtered_links_set = article_links_set - unwanted_links_set

    # Convert the set back to a list if needed
    filtered_links = list(filtered_links_set)

    return filtered_links

In [17]:

# Get all the possible Article url listed on a page
def fetch_article_links(url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "a"))
        )
        page_content = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_content, 'html.parser')
    links = soup.find_all('a', class_='cmp-promo-tracking cmp-promo-curated')
    
    article_links = ['https://www.deloitte.com' + link.get('href') for link in links if link.get('href')]

    industries_links = list(filter(lambda link: 'Industries' in link, article_links))
    

    return industries_links

fetch_article_links('https://www.deloitte.com/global/en/Industries/insurance.html')



['https://www.deloitte.com/global/en/Industries/life-sciences-health-care/perspectives/future-of-health/the-health-equity-institutes.html',
 'https://www.deloitte.com/global/en/Industries/financial-services/perspectives/global-ifrs17-insurance-survey.html',
 'https://www.deloitte.com/global/en/Industries/financial-services/analysis/impact-of-covid-19-on-savings.html',
 'https://www.deloitte.com/global/en/Industries/financial-services/about/insurance-ifrs.html',
 'https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html',
 'https://www.deloitte.com/global/en/Industries/banking-capital-markets.html',
 'https://www.deloitte.com/global/en/Industries/investment-management.html',
 'https://www.deloitte.com/global/en/Industries/real-estate.html']

### Obtain Content of Articles

In [16]:
# Define the Document class (assuming a simple structure for demonstration)
class Document:
    def __init__(self, metadata, page_content):
        self.metadata = metadata
        self.page_content = page_content 
        
# Function to extract main content from HTML
def extract_main_content(page_content):
    soup = BeautifulSoup(page_content, 'html.parser')
    
    # Try to find a specific section that contains the main content
    main_section = soup.find('div', {'class': 'main-content'})
    
    if not main_section:
        main_section = soup.find('article')
    
    if not main_section:
        main_section = soup
    
    headers = [header.get_text(strip=True) for header in main_section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
    paragraphs = [p.get_text(strip=True) for p in main_section.find_all('p')]
    return headers + paragraphs

# Function to get articles
def get_content(industry, sub_industry):
    urls = list(set(get_deloitte_url(industry, sub_industry) + [get_deloitte_url(industry)]))
    if "Invalid" in urls:
        print(urls)
        return

    # print(f"Using URL: {urls}")

    # Fetch article links
    article_links = list(map(lambda url: fetch_article_links(url), urls))
    article_links = list({url for sublist in article_links for url in sublist})
    article_links = remove_sub_links(article_links, industry)
    


    # List to store the results for each article
    articles = []

    for article_url in article_links:
        print(article_url)
        response = requests.get(article_url)
        if response.status_code == 200:
            document = response.text
            soup = BeautifulSoup(document, 'html.parser')
            
            # Extract metadata
            title = soup.title.string if soup.title else 'No title'
            content = extract_main_content(document)
            
            articles.append({
                'link': article_url,
                'title': title,
                'content': content
            })
        else:
            print(f"Failed to load URL: {article_url}")

    # Display the results
    for article in articles:
        print(f"Link: {article['link']}")
        print(f"Title: {article['title']}\n")
        print("Content:\n")
        for item in article['content']:
            print(item)
        print("\n" + "-"*80 + "\n")

   

    return articles

#Example usage
industry = "Financial Services"
sub_industry = "Insurance"
industry_content = get_content(industry, sub_industry)
industry_content

https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html
https://www.deloitte.com/global/en/Industries/real-estate/about.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/global-ifrs17-insurance-survey.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/ecosystem-imperative.html
https://www.deloitte.com/global/en/Industries/financial-services/research/building-a-future-ready-investment-firm.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/underwriting-our-planet.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/generative-ai-in-insurance.html
https://www.deloitte.com/global/en/Industries/financial-services/analysis/impact-of-covid-19-on-savings.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/pushing-through-undercurrents.html
https://www.deloitte.com/global/en/In

[{'link': 'https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html',
  'title': 'The Future of Small Business Insurance',
  'content': ['World Impact | Deloitte Global',
   'Our history',
   'Sustainability and Climate',
   'Operate services',
   'The Deloitte Global Boardroom Program',
   'The Deloitte Health Equity Institutes',
   'Inclusion at Deloitte',
   'WorldClass: Empowering 100 million people',
   'The future of small business insurance',
   'What do customers want?',
   'Thanks for your feedback',
   'Your feedback is important to us',
   'Get in touch',
   'Mark Patterson',
   'Technology Driven Systemic Risks and the Continued Need for Innovation',
   'IFRS 17',
   'Technology Driven Systemic Risks and the Continued Need for Innovation',
   'The Evolution Of The Payments Value Model',
   "Let's connect",
   'Follow us',
   '',
   '',
   'Select your location',
   '',
   'No results found',
   'Small business ins

### Apply Ollama and Langchain For Information Retrieval for Selected Industry

In [18]:
industry_content

[{'link': 'https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html',
  'title': 'The Future of Small Business Insurance',
  'content': ['World Impact | Deloitte Global',
   'Our history',
   'Sustainability and Climate',
   'Operate services',
   'The Deloitte Global Boardroom Program',
   'The Deloitte Health Equity Institutes',
   'Inclusion at Deloitte',
   'WorldClass: Empowering 100 million people',
   'The future of small business insurance',
   'What do customers want?',
   'Thanks for your feedback',
   'Your feedback is important to us',
   'Get in touch',
   'Mark Patterson',
   'Technology Driven Systemic Risks and the Continued Need for Innovation',
   'IFRS 17',
   'Technology Driven Systemic Risks and the Continued Need for Innovation',
   'The Evolution Of The Payments Value Model',
   "Let's connect",
   'Follow us',
   '',
   '',
   'Select your location',
   '',
   'No results found',
   'Small business ins

In [20]:
def main(industry, sub_industry):
    article_contents = get_content(industry, sub_industry)
    # Convert articles to LangChain Document format
    documents = []
    for article in article_contents:
        for paragraph in article['content']:
            documents.append(Document(page_content=paragraph, metadata={'source': article['link'], 'title': article['title']}))

    # Create a LangChain retriever and add documents
    retriever = SimpleRetriever(documents=documents)
    return retriever

# Define a function to answer questions using LangChain and OpenAI GPT
def answer_question(question, retriever):
    # Retrieve relevant documents from LangChain
    relevant_docs = retriever.get_relevant_documents(question)
    
    # Concatenate the content of relevant documents
    context = ' '.join([doc.page_content for doc in relevant_docs])
    
    # Use OpenAI's GPT model to generate an answer
    response = openai.Completion.create(
        engine="davinci",  # You can use other engines like "curie" or "babbage"
        prompt=f"Context: {context}\n\nQuestion: {question}\n\nAnswer:",
        max_tokens=150
    )
    
    return response.choices[0].text.strip()






### TO DO: Create Document Retrieval Process and Apply RAG System to Deloittes's Articles 

In [24]:
from langchain import SimpleRetriever
from openai import OpenAI
import openai

ImportError: cannot import name 'SimpleRetriever' from 'langchain' (/Users/jordan/Documents/Data_Science_project/insights-rags/ollama/lib/python3.11/site-packages/langchain/__init__.py)

In [21]:
from langchain import LangChain, Document, SimpleRetriever
from openai import OpenAI
import openai

# # Initialize OpenAI with your API key
# openai.api_key = '...'


if __name__ == '__main__':
    industry = "Financial Services"
    sub_industry = "Insurance"
    retriever = main(industry, sub_industry)
    question = "What are the key findings from the Deloitte survey on Insurance?"
    answer = answer_question(question, retriever)
    print(answer)

https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html
https://www.deloitte.com/global/en/Industries/real-estate/about.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/global-ifrs17-insurance-survey.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/ecosystem-imperative.html
https://www.deloitte.com/global/en/Industries/financial-services/research/building-a-future-ready-investment-firm.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/underwriting-our-planet.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/generative-ai-in-insurance.html
https://www.deloitte.com/global/en/Industries/financial-services/analysis/impact-of-covid-19-on-savings.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/pushing-through-undercurrents.html
https://www.deloitte.com/global/en/In

NameError: name 'SimpleRetriever' is not defined

In [None]:
from langchain import LangChain, Document, SimpleRetriever
from openai import OpenAI
import openai

# Initialize OpenAI with your API key



if __name__ == '__main__':
    industry = "Financial Services"
    sub_industry = "Insurance"
    retriever = main(industry, sub_industry)
    question = "What are the key findings from the Deloitte survey on Insurance?"
    answer = answer_question(question, retriever)
    print(answer)

https://www.deloitte.com/global/en/Industries/financial-services/analysis/the-future-of-small-business-insurance.html
https://www.deloitte.com/global/en/Industries/real-estate/about.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/global-ifrs17-insurance-survey.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/ecosystem-imperative.html
https://www.deloitte.com/global/en/Industries/financial-services/research/building-a-future-ready-investment-firm.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/underwriting-our-planet.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/generative-ai-in-insurance.html
https://www.deloitte.com/global/en/Industries/financial-services/analysis/impact-of-covid-19-on-savings.html
https://www.deloitte.com/global/en/Industries/financial-services/perspectives/pushing-through-undercurrents.html
https://www.deloitte.com/global/en/In

NameError: name 'SimpleRetriever' is not defined