# Define functions to scrape websites

Install the required dependencies to scrape websites

In [1]:
!pip install beautifulsoup4
!pip install requests



In [2]:
import json
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
from collections import deque

VALID_GET_RESPONSE = 200


def soup_page(url: str) -> BeautifulSoup:
    """
    Function to make a HTTP request by a given url
    """
    response = requests.get(url)
    if response.status_code == VALID_GET_RESPONSE:
        soup = BeautifulSoup(response.text, "html.parser")
    else:
        soup = BeautifulSoup("")
    return soup


def get_links(soup: BeautifulSoup, url: str, parentFolder: str) -> list:
    """
    Function to get all links from a page
    """
    links = [
        a["href"] for a in soup.find_all("a", href=True) if parentFolder in a["href"] and not "contact-us" in a["href"]
    ]
    return links


def download_page(soup: BeautifulSoup, url: str, savePath: str) -> str:
    """
    Function to download a page
    """
    paragraphsList = [p.get_text() for p in soup.find_all("p")]
    pageTitle = [t.get_text() for t in soup.find_all("title")]
    if paragraphsList:
        siteData = {"title": pageTitle, "url": url, "content": paragraphsList}

        urlPath = url.replace("https://", "").replace("http://", "").replace("/", "_")
        filePath = os.path.join(savePath, urlPath + ".json")
        write_json_from_data(siteData, filePath)
        return filePath
    else:
        return ""


def write_json_from_data(data: dict, filePath: str, indentSize: int = 4) -> None:
    """
    Function to write a json file with its data
    """
    with open(filePath, "w") as f:
        json.dump(data, f, ensure_ascii=False, indent=indentSize)
    return


def download_all_pages(
    rootUrl: str, parentFolder: str, savePath: str, maxPages: int = 100
) -> list[str]:
    """
    Recursive function to download pages and subpages
    """
    visitedUrls = set()
    stack = deque([rootUrl])

    jsonFiles = []
    while stack and len(jsonFiles) < maxPages:
        url = stack.pop()

        if url in visitedUrls:
            continue
        visitedUrls.add(url)

        soup = soup_page(url)
        jsonFileFromSite = download_page(soup, url, savePath)
        print(f"JSON file {jsonFileFromSite} created from url {url}")
        jsonFiles.append(jsonFileFromSite)

        links = get_links(soup, url, parentFolder)
        for link in links:
            fullLink = urljoin(url, link)
            if fullLink not in visitedUrls and "http" in fullLink:
                stack.append(fullLink)

    return jsonFiles


def most_common_sentences_in_file(
    jsonFilePath: str, alreadyCommonWords: set = set(), frequencyThreshold: int = 1
) -> list[str]:
    with open(jsonFilePath, "r") as f:
        data = json.load(f)

    if type(data) == list:
        contentList = [c.get("content", "") for c in data]
        content = []
        for contentPage in contentList:
            content.extend(contentPage)
    else:
        content = data.get("content", "")

    frequencies = {}
    if content:
        for sentence in content:
            frequencies[sentence] = frequencies.get(sentence, 0) + 1

    for sentence, freq in frequencies.items():
        if freq > frequencyThreshold:
            alreadyCommonWords.add(sentence)
    return alreadyCommonWords


def list_json_files_in_folder(
    jsonFilesFolder: str, jsonFilesToExclude: str | list[str]
) -> list[str]:
    if type(jsonFilesToExclude) == str:
        jsonFilesToExclude = [jsonFilesToExclude]

    filesToExclude = set(jsonFilesToExclude)
    fileList = [
        os.path.join(jsonFilesFolder, file)
        for file in os.listdir(jsonFilesFolder)
        if file.endswith(".json") and file not in filesToExclude
    ]
    return fileList


def clean_json_file(
    jsonFilePath: str, mostCommonRows: set[str] = set(), overwrite: bool = True
) -> str:
    with open(jsonFilePath, "r") as f:
        data = json.load(f)

    if type(data) == list:
        for i, page in enumerate(data):
            indexesToRemove = set()
            for j, row in enumerate(page.get("content", "")):
                if row in mostCommonRows:
                    indexesToRemove.add(j)

            data[i]["content"][:] = [
                c for k, c in enumerate(data[i]["content"]) if not k in indexesToRemove
            ]
    else:
        indexesToRemove = set()
        for i, row in enumerate(data.get("content", "")):
            if row in mostCommonRows:
                indexesToRemove.add(i)

        data["content"][:] = [
            c for k, c in enumerate(data["content"]) if k not in indexesToRemove
        ]

    if overwrite:
        jsonCleanedFilePath = jsonFilePath
    else:
        jsonCleanedFilePath = jsonFilePath.replace(".json", "_cleaned.json")

    write_json_from_data(data, jsonCleanedFilePath)
    return jsonCleanedFilePath


def clean_json_files(
    jsonFilesFolder: str, filesToExclude: str | list[str], overwrite: bool = True
) -> list[str]:
    """
    Function to remove unneded text from a json file containing web page content
    """
    fileList = list_json_files_in_folder(jsonFilesFolder, filesToExclude)

    mostCommonRows = set()  # it is possible to define custom common words here
    for file in fileList:
        mostCommonRows = most_common_sentences_in_file(file, mostCommonRows)

    cleanedFiles = []
    for file in fileList:
        jsonCleanedFilePath = clean_json_file(file, mostCommonRows, overwrite)
        cleanedFiles.append(jsonCleanedFilePath)
    return cleanedFiles


def merge_json_files(jsonFilesFolder: str, targetFile: str = "_merged.json") -> str:
    """
    Function to merge multiple json files into a single one
    """
    fileList = list_json_files_in_folder(jsonFilesFolder, targetFile)
    if fileList:
        resultFileContent = []
        for file in fileList:
            with open(file, "r") as f:
                jsonData = json.load(f)
                resultFileContent.append(jsonData)

        resultFilePath = os.path.join(jsonFilesFolder, targetFile)
        write_json_from_data(resultFileContent, resultFilePath)
        return resultFilePath
    return ""


def json_to_txt(savePath: str, jsonFilePath: str, targetName: str = "") -> str:
    if targetName == "":
        targetName = os.path.basename(jsonFilePath).replace(".json", ".txt")

    with open(jsonFilePath, "r") as f:
        data = json.load(f)

    resultFilePath = os.path.join(savePath, targetName)
    with open(resultFilePath, "w") as f:
        for numPage, webpage in enumerate(data):
            pageTitle = webpage.get("title", "")
            f.write(
                f'The page number {numPage+1} with name "{pageTitle[0]}" has the following content:\n'
            )

            pageContent = webpage.get("content", "")
            for row in pageContent:
                f.write(f"{row} \n")

            f.write(
                f'The page number {numPage+1} with name "{pageTitle[0]}" ends here.\n\n'
            )

    return resultFilePath


def text_from_file(filePath: str) -> str | list[str]:
    with open(filePath, "r", encoding="utf-8") as f:
        textString = f.read()
        f.seek(0)
        textList = f.readlines()
    return textString, textList


def get_text_from_webpages(
    root: str, parentFolder: str, savePath: str = "crawled", numPages: int = 5
) -> str:
    os.makedirs(savePath, exist_ok=True)

    jsonFiles = download_all_pages(root, parentFolder, savePath, numPages)

    mergedJsonName = "_merged.json"
    jsonFilesClean = clean_json_files(savePath, mergedJsonName)
    mergedJsonPath = merge_json_files(savePath, mergedJsonName)

    mostCommonWordsMergedJson = most_common_sentences_in_file(
        mergedJsonPath, frequencyThreshold=len(jsonFilesClean) - 1
    )
    mergedJsonPath = clean_json_file(mergedJsonPath, mostCommonWordsMergedJson)

    plainTextFile = json_to_txt(savePath, mergedJsonPath)
    plainText, textString = text_from_file(plainTextFile)
    return plainText

# Call the function to scrape the website

Function arguments of **get_text_from_webpages()** are defined as:
* root : initial webpage, to be scraped and whose links will be scraped, as well
* parentFolder : website folder to inspect, external webpages in the site structure will be skipped
* savePath : folder where to save .json files, containing the content of the webpages
* numPages : maximum number of webpages to download, starting from the initial page (root argument)

In [3]:
parent_folder = "/products-and-solutions/"
initial_url = "https://www.hitachienergy.com/products-and-solutions/"
save_folder = "crawled"
num_pages_to_download = 10
parsed_text = get_text_from_webpages(
    root=initial_url, parentFolder=parent_folder, savePath=save_folder,numPages=num_pages_to_download
)

JSON file crawled/www.hitachienergy.com_products-and-solutions_.json created from url https://www.hitachienergy.com/products-and-solutions/
JSON file crawled/www.hitachienergy.com_products-and-solutions_cybersecurity_alerts-and-notifications.json created from url https://www.hitachienergy.com/products-and-solutions/cybersecurity/alerts-and-notifications
JSON file crawled/www.hitachienergy.com_products-and-solutions_cybersecurity.json created from url https://www.hitachienergy.com/products-and-solutions/cybersecurity
JSON file crawled/www.hitachienergy.com_products-and-solutions_cybersecurity_reporting.json created from url https://www.hitachienergy.com/products-and-solutions/cybersecurity/reporting
JSON file  created from url https://www.hitachienergy.com/products-and-solutions/cybersecurity/reporting/report-a-vulnerability-web-form
JSON file crawled/www.hitachienergy.com_products-and-solutions_digitalization.json created from url https://www.hitachienergy.com/products-and-solutions/di

The variable **parsedText** contains the merged content from all the pages.

To distinguish between pages and help Gemini recognizing them, two sentences are added:
* Before the content of each webpage
* After the content of each webpage

# Start a chat with Gemini

In [4]:
# API key got here: https://ai.google.dev/tutorials/setup

import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("GEMINI_API_KEY")
genai.configure(api_key = api_key)

model = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')

chat = model.start_chat()

# example how to send prompts
# response = chat.send_message('Recommend me ways to fetch information from a website and forward it to you')
# print(response.text)

# Pass the content of the website to Gemini

In [5]:
system_prompt = "You are given the content of a website.\nThe content for each webpage of the website is after the sentence \"The page number X with name PAGENAME has the following content: \" and before the sentence \"The page number X with name PAGENAME ends here.\" where X is the number of the page and PAGENAME is the name of the webpage.\n"
print(f"The system prompt is:\n{system_prompt}")

The system prompt is:
You are given the content of a website.
The content for each webpage of the website is after the sentence "The page number X with name PAGENAME has the following content: " and before the sentence "The page number X with name PAGENAME ends here." where X is the number of the page and PAGENAME is the name of the webpage.



In [6]:
user_prompt = "Use the content of the website to illustrate the products of the company.\nHighlight the possible business areas in which the product can be sold competitively.\n"

In [7]:
information = f"The content of the website is:\n{parsed_text}"

In [8]:
response = chat.send_message(system_prompt + user_prompt + information)
print(response.text)

Hitachi Energy offers a suite of products and solutions centered around digitalization and optimization of the energy sector. Here's a breakdown based on the provided website content:

**Products and Solutions:**

* **Cybersecurity Solutions:** Automated, evolving, and resilient cybersecurity solutions designed to meet international standards and protect against emerging threats. This includes incident reporting and vulnerability management.
* **Digitalization Solutions:**  Focuses on managing the increasingly complex energy landscape through digital tools and data analysis.  This applies to asset management, renewable energy integration, and optimization of operations for various industries like mining, metals, oil and gas, and transportation.  The e-mesh™ portfolio plays a key role here.
* **Capacity Expansion Solution:** Software for resource planning, capacity expansion, and emissions compliance planning.  Helps manage long-term resource plans, analyze renewable portfolio standards

In [9]:
usage_metadata = response.usage_metadata
print(f"Token count: {usage_metadata.total_token_count}")

Token count: 3741
