In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

In [2]:
def fetch_website_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Kaster en exception hvis responsen er en 4xx eller 5xx feilkode.

        soup = BeautifulSoup(response.text, 'html.parser')

        # Fjern alle <script>, <style>, og <link> tags
        for script in soup(["script", "style", "link"]):
            script.extract()  # fjerner taggen fra soup

        text = soup.get_text()

        cleaned_text = text.replace('\n', ' ')


        return str(cleaned_text)

    except requests.RequestException as e:
        return f"Feil ved henting av nettside: {e}"

In [3]:
def get_internal_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        base_url = urlparse(url).scheme + '://' + urlparse(url).netloc
        internal_links = set()
        
        for a_tag in soup.find_all('a', href=True):
            href = a_tag.attrs['href']
            full_url = urljoin(base_url, href)
            if base_url in full_url:  # Dette sjekker om lenken er intern
                internal_links.add(full_url)
        
        return list(internal_links)
    
    except requests.RequestException as e:
        return []

In [15]:
import pandas as pd

base_url = 'https://www.coax.no/'

In [44]:
def scrape_all_pages_to_dataframe(base_url):
    # Liste for å holde url og tekst
    data_list = []

    # Hent interne lenker fra base_url
    internal_links = get_internal_links(base_url)

    # For hver lenke, hent innholdet og legg til i listen
    for link in internal_links:
        text_content = fetch_website_content(link)
        data_list.append([link, text_content])

    # Konverter listen til en pandas DataFrame
    df = pd.DataFrame(data_list, columns=['URL', 'Text'])

    return df

In [45]:
df_crape = scrape_all_pages_to_dataframe(base_url)

In [46]:
df_crape

Unnamed: 0,URL,Text
0,https://www.coax.no/referanser/,"COAX Norge ⎜Vannvarmer for bolig, hytte og k..."
1,https://www.coax.no/valg-av-modell/,COAX Norge ⎜Varmtvannsbereder uten tank ⎜Fin...
2,https://www.coax.no/hytterogfritidsboliger/,COAX Norge ⎜Vannvarmer uten tank ⎜Finn din f...
3,https://www.coax.no/,"COAX Norge ⎜Spar strøm, miljø og plass ⎜Varm..."
4,https://www.coax.no/våre-produkter/,COAX Norge ⎜El-vannvarmer tilpasset forbruke...
5,https://www.coax.no/kontakt/,COAX Norge⎜Varmtvannsbereder uten tank⎜Konta...
6,https://www.coax.no/valg-av-modell/#ws-block-c...,COAX Norge ⎜Varmtvannsbereder uten tank ⎜Fin...
7,https://www.coax.no/ofte-stilte-spørsmål/,COAX Norge ⎜Spar miljø og penger med tankfri...


In [47]:
def estimate_tokens(text: str) -> int:
    byte_count = len(text.encode('utf-8'))
    return int(byte_count / 4)


In [48]:
estimate_tokens_data = df_crape['Text'].apply(estimate_tokens)
print(df_crape["URL"] + estimate_tokens_data)

0     214
1    1058
2     865
3     649
4    1650
5     269
6    1058
7    2354
Name: Text, dtype: int64


In [11]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

# Path to your service account key file
key_path = 'ml-for-fun-393018-5adeefa67764.json' 

credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

if credentials.expired:
    credentials.refresh(Request())
    
PROJECT_ID = 'ml-for-fun-393018'
REGION = 'us-central1'

In [42]:
#%pip install google-cloud-aiplatform

In [43]:
import vertexai
vertexai.init(project=PROJECT_ID, location=REGION, credentials = credentials)
from vertexai.language_models import TextEmbeddingModel
embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")