In [1]:
from pathlib import Path

In [2]:
dataset_path = Path.cwd().joinpath("datasets", "halifax_site")

In [3]:
import pandas as pd

In [4]:
training_data = pd.read_csv(
    dataset_path.joinpath("training_data.csv"), sep="|",  lineterminator='\n')

In [5]:
test_data = pd.read_csv(dataset_path.joinpath(
    "test_data.csv"), sep="|",  lineterminator='\n')

In [6]:
training_data.head()

Unnamed: 0,title,content,url,content_cleaned
0,First time buyer calculator,First time buyer calculator Looking to buy yo...,https://www.halifax.co.uk/mortgages/mortgage-c...,Looking to buy your first home? Use our mortga...
1,Home mover calculator,Home mover calculator Ready to move home? Use...,https://www.halifax.co.uk/mortgages/mortgage-c...,Ready to move home? Use our home mover calcula...
2,Remortgage calculator,Remortgage calculator Got a mortgage with a d...,https://www.halifax.co.uk/mortgages/mortgage-c...,Got a mortgage with a different provider? See ...
3,Switching your mortgage,Switching your mortgage If your existing Hali...,https://www.halifax.co.uk/mortgages/mortgage-c...,If your existing Halifax mortgage is coming to...
4,Borrowing more calculator,Borrowing more calculator If you want to borr...,https://www.halifax.co.uk/mortgages/mortgage-c...,"If you want to borrow more on your mortgage, y..."


In [7]:
training_data = training_data[~training_data.content.isna()
                           & ~training_data.url.isna()]

In [8]:
test_data = test_data[~test_data.content.isna() & ~test_data.url.isna()]

In [9]:
test_data.head()

Unnamed: 0,index,title,content,url,content_cleaned
0,42,Already bank with us?,Already bank with us? ...,https://www.halifax.co.uk/mortgages/help-and-a...,Upgrading your account Overdrafts Payments & t...
1,55,Already borrowing with us?,Already borrowing with us? ...,https://www.halifax.co.uk/mortgages/help-and-a...,Existing credit card customers Existing loan c...
2,66,Already with us?,Already with us? ...,https://www.halifax.co.uk/mortgages/help-and-a...,Existing customers Existing customers Manage y...
3,79,Already saving with us?,Already saving with us? ...,https://www.halifax.co.uk/mortgages/help-and-a...,Existing customers Manage your ISA Transfer yo...
4,94,Already investing with us?,Already investing with us? ...,https://www.halifax.co.uk/mortgages/help-and-a...,Sign in to Share Dealing Introducing the new R...


In [10]:
test_data.shape

(377, 5)

In [11]:
from haystack import Document

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
train_documents = [
    Document(content=str(example.content_cleaned), meta={"url": example.url}) for example in training_data.itertuples()
]

In [13]:
test_documents = [
    Document(content=str(example.content_cleaned), meta={"url": example.url}) for example in test_data.itertuples()
]

In [14]:
all_documents = train_documents + test_documents

In [15]:
embedding_model_id = "dunzhang/stella_en_400M_v5"

In [16]:
### need to come back here, and download the stuff.

from haystack import Pipeline

In [17]:
from haystack import Pipeline

In [18]:
indexing_pipeline = Pipeline()

In [19]:
all_documents[0]

Document(id=3396b04a094294cd788f45005d66a5b9a562b37f937d66303b278d85b857fa0c, content: 'Looking to buy your first home? Use our mortgage calculator to work out how much you could borrow to...', meta: {'url': 'https://www.halifax.co.uk/mortgages/mortgage-calculator.html?WT.ac=hp/help/mortgage-calc'})

In [20]:
from haystack.components.writers import DocumentWriter

In [21]:
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

In [22]:
from src.rag.shared.database import postgres_uri

In [23]:
import os

In [24]:
os.environ["PG_CONN_STR"] = postgres_uri

In [25]:
document_store = PgvectorDocumentStore(
    embedding_dimension=1024,
    vector_function="cosine_similarity",
    recreate_table=True,
    search_strategy="hnsw",
)

In [26]:
from haystack.document_stores.types import DuplicatePolicy

In [27]:
document_writer = DocumentWriter(
    document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

In [28]:
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

In [29]:
embedder_component = SentenceTransformersDocumentEmbedder(
    model=embedding_model_id,
    normalize_embeddings=True,
    trust_remote_code=True,

)

In [30]:
embedder_component.warm_up()

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
indexing_pipeline.add_component(
    "embedder", embedder_component)
indexing_pipeline.add_component("writer", document_writer)


indexing_pipeline.connect("embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x1671e8d00>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

In [32]:
indexing_pipeline.run(data={"documents": all_documents[1:]})

Batches: 100%|██████████| 57/57 [01:18<00:00,  1.38s/it]


{'writer': {'documents_written': 1818}}