In [6]:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from tqdm import tqdm
import json
import requests as r
import glob

In [None]:

BASE_URL = 'https://www.dansketaler.dk/'

In [25]:

driver = webdriver.Chrome()

driver.get("https://www.dansketaler.dk/taler/frederiksen-mette")
time.sleep(2)
res = driver.page_source
soup = BeautifulSoup(res, features='lxml')

In [22]:
speeches = []
for a in soup.find_all("a"):
    if a["href"][0:5] == "/tale":
        speeches.append(a["href"])

In [32]:
for speech in speeches:
    driver.get(f"{BASE_URL}{speech}")
    time.sleep(1)
    title = speech.split("/")[2]
    speech_soup = BeautifulSoup(driver.page_source)
    speech_html = speech_soup.find("div", attrs={'id': 'speech'})
    with open(f"taler/{title}.txt", "w") as f:
        f.write(speech_html.get_text("\n"))


In [2]:
from sentence_transformers import SentenceTransformer
import psycopg2 as pg
import regex as re

In [3]:
model_name = 'Snowflake/snowflake-arctic-embed-l-v2.0'
model = SentenceTransformer(model_name)

In [4]:
def embed_documents(speech, document, embedding):
    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO speeches_embeddings (speech, context, embedding) VALUES (%s, %s, %s)",
        (speech, document, embedding),
    )
    cur.close()
    conn.close()


In [7]:
total = 0
files = glob.glob("*", root_dir="taler")
with tqdm(total = len(files), desc="Processing Files") as fpbar:
    for file_name in glob.glob("*", root_dir="taler"):
        speech_name = file_name.replace(".txt", "")
        with open(f"taler/{file_name}", "r") as f: 
            lines = f.readlines()
            total += len(lines)
        context_splits = []
        
        char_split = False
        for line in lines:
            if re.search('[a-zA-Z]', line) is None and "\n" == line:
                char_split = True
                break

        if char_split:
            context = []
            for line in lines:
                if re.search('[a-zA-Z]', line) is None and "\n" == line:
                    context_splits.append(context)
                    context = []
                else:
                    context.append(line)
            if context != []:
                context_splits.append(context)
        else:
            chunk_size = 4  # group size
            overlap = 2  # overlap size
            context_splits = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size-overlap)]

        with tqdm(total = total, desc="Embedding and Saving context") as pbar:
            for context in context_splits:
                for line in context:
                    embedding = model.encode(line)
                    str_context = "\n".join(context)
                    embed_documents(speech_name, str_context, str(embedding.tolist()))
                    pbar.update(1)
    fpbar.update(1)
                



Processing Files:   0%|                                                                                | 0/153 [00:00<?, ?it/s]
[Aedding and Saving context:   0%|                                                                    | 0/144 [00:00<?, ?it/s]
[Aedding and Saving context:   1%|▍                                                           | 1/144 [00:05<13:58,  5.86s/it]
[Aedding and Saving context:   1%|▊                                                           | 2/144 [00:07<07:49,  3.31s/it]
[Aedding and Saving context:   2%|█▎                                                          | 3/144 [00:07<04:36,  1.96s/it]
[Aedding and Saving context:   3%|█▋                                                          | 4/144 [00:08<03:48,  1.63s/it]
[Aedding and Saving context:   3%|██                                                          | 5/144 [00:09<02:32,  1.10s/it]
[Aedding and Saving context:   4%|██▌                                                         | 6/144 [

KeyboardInterrupt: 