I will try to create embeddings for the congonews dataset.

The dataset has around 87k documents. I will create embeddings for those documents and ingest them into a PostgreSQL database.

#### Connecting to PostgreSQL

In [None]:
from pathlib import Path

In [None]:
from os import getenv
from dotenv import load_dotenv, find_dotenv

In [None]:
from urllib.parse import quote

In [None]:
# Load variables from a .env file into the environment, then read the
# PostgreSQL connection settings. getenv() returns None for unset variables.
load_dotenv()
database_user = getenv('POSTGRES_USER')
database_password = getenv('POSTGRES_PASSWORD')
database_host = getenv('POSTGRES_HOST')
database_port = getenv('POSTGRES_PORT')
database_name = getenv('POSTGRES_DB')


In [None]:
# Connection URI for the database. The user and password are percent-encoded
# with safe="" because quote() leaves "/" unescaped by default, and reserved
# characters in the credentials must not be read as URI delimiters.
postgres_uri = (
    f'postgresql://{quote(database_user, safe="")}:{quote(database_password, safe="")}'
    f'@{database_host}:{database_port}/{database_name}'
)

In [None]:
from psycopg2 import connect
from pgvector.psycopg2 import register_vector

In [None]:
# Open a psycopg2 connection using the settings read from the environment.
database_connection = connect(
    user=database_user,
    password=database_password,
    host=database_host,
    port=database_port,
    database=database_name
)

In [None]:
# Autocommit: each statement is committed as soon as it executes, so no
# explicit commit() is needed after the DDL and inserts that follow.
database_connection.set_session(autocommit=True)

In [None]:
# Enable the pgvector extension; IF NOT EXISTS makes this safe to re-run.
with database_connection.cursor() as cursor:
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

In [None]:
# Register pgvector's vector type with this psycopg2 connection.
register_vector(database_connection)

In [None]:
# DDL for the chunk table: one row per article chunk, holding the chunk text
# and its 768-dimensional vector, with a foreign key to the `article` table.
table_creation_string = """
CREATE TABLE IF NOT EXISTS article_embeddings (
    id SERIAL PRIMARY KEY,
    article_id INTEGER,
    chunk TEXT,
    chunk_vector VECTOR(768),
    CONSTRAINT fk_article_id FOREIGN KEY (article_id) REFERENCES article(id)
);
"""

# Insert statement with named (pyformat) placeholders.
insert_statement_string = """
INSERT INTO article_embeddings (article_id, chunk, chunk_vector)
values (%(article_id)s, %(chunk)s, %(chunk_vector)s)
"""

# Upsert clause. The final assignment must not be followed by a comma:
# a trailing comma in the SET list is a SQL syntax error.
update_statement_string = """
ON CONFLICT (id) DO UPDATE SET
article_id = EXCLUDED.article_id,
chunk = EXCLUDED.chunk,
chunk_vector = EXCLUDED.chunk_vector
"""

In [None]:
from typing import List, Any, Optional

In [None]:
def execute_query(database_connection, query, params=None) -> Optional[List[Any]]:
    """Execute *query* and return its rows, or ``None`` if it yields none.

    Args:
        database_connection: DB-API connection whose ``cursor()`` result
            can be used as a context manager.
        query: SQL text to execute.
        params: Optional parameters bound to the placeholders in *query*.

    Returns:
        The list returned by ``cursor.fetchall()`` when the statement
        produced a result set, otherwise ``None``.

    Raises:
        Any exception raised by ``execute()`` or ``fetchall()``; errors are
        no longer silently swallowed.
    """
    with database_connection.cursor() as cursor:
        cursor.execute(query, params)
        # DB-API: description is None when the statement returned no result
        # set (DDL, INSERT without RETURNING, ...). Checking it avoids
        # calling fetchall() just to catch the resulting error.
        if cursor.description is None:
            return None
        return cursor.fetchall()

In [None]:
# Create the article_embeddings table if it does not exist yet.
with database_connection.cursor() as cursor:
    cursor.execute(table_creation_string)
    

### Insert data in the database

This is the trickiest part: we need to load the articles, split each article into chunks, compute the embedding for each chunk, and then save each chunk together with its embedding vector in the database!

I will start simple by querying the database, loading around 60 documents and saving them. Then we will scale the process.

#### Load Dataset

In this section we will load the news article table from the database into a Hugging Face dataset.

In [None]:
from datasets import Dataset

In [None]:
from datasets import Value, Features

"""
    id             | integer                     |           | not null | nextval('article_id_seq'::regclass)
 title          | character varying(250)      |           | not null |
 content        | text                        |           | not null |
 summary        | text                        |           |          |
 posted_at      | timestamp without time zone |           |          |
 website_origin | character varying(250)      |           |          |
 url            | character varying(250)      |           |          |
 author         | character varying(250)      |           |          |
 saved_at       | timestamp with time zone
"""

# Schema used when reading the `article` table: the id column is int32 and
# every other column, timestamps included, is read as a string.
features = Features(
    dict(
        [('id', Value('int32'))]
        + [
            (column, Value('string'))
            for column in (
                'title',
                'content',
                'summary',
                'posted_at',
                'website_origin',
                'url',
                'author',
                'saved_at',
            )
        ]
    )
)

In [None]:
# Read the `article` table through the connection URI into a Dataset,
# using the explicit column schema declared in `features`.
congo_news_dataset = Dataset.from_sql(
    'article', postgres_uri, features=features)

In [None]:
from haystack import Document

In [None]:
# Wrap every article in a Haystack Document, reusing the article's database
# id as the document id and leaving the metadata empty.
# NOTE(review): example["id"] is an integer (int32 in `features`) — confirm
# that Document and the custom document store accept non-string ids.
haystack_documents = [
    Document(content=example['content'], id=example["id"], meta={}) for example in congo_news_dataset
]

In [None]:
from haystack.components.preprocessors import DocumentCleaner


# Strip the "already been read N times" footer from each article.
# The pattern contains `\d+`, so it has to go through `remove_regex`:
# `remove_substrings` matches literal text only and would never match it.
# keep_id=True keeps the original document ids on the cleaned documents.
document_cleaner = DocumentCleaner(
    remove_regex=r"This post has already been read \d+ times!",
    keep_id=True,
)

In [None]:
import random

# Pick a random valid start index into haystack_documents.
# randrange() excludes its upper bound, so the index always points at an
# existing document. randint(1, len(...)) could return len(...), which gives
# an empty slice, and could never select the first document.
random_document_id = random.randrange(len(haystack_documents))
print(random_document_id)

In [None]:
clean_documents = document_cleaner.run(haystack_documents[random_document_id:random_document_id+5])

In [None]:
clean_documents.get("documents")[0].content

### Trying text splitter

In [None]:
from src.retriever.document_splitter import RecursiveCharacterTextSplitterComponent
recursive_text_splitter = RecursiveCharacterTextSplitterComponent(
    chunk_size=300, chunk_overlap=50)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
split_documents = recursive_text_splitter.run(haystack_documents[:5])

In [None]:
split_documents

The recursive character splitter offers better retrieval accuracy, for the reasons specified here: https://www.reddit.com/r/LangChain/comments/1bjxvov/what_is_the_advantage_of_overlapping_in_chunking/

### Let us build the document Store

I will come back to this and create the document store.

In [None]:
from src.retriever.document_store import MyPgVectorDocumentStore

In [None]:
import os
os.environ["PG_CONN_STR"] = postgres_uri

In [None]:
from haystack.utils.auth import Secret

In [None]:
# Document store on top of the article_embeddings table created above.
# embedding_dimension matches the VECTOR(768) column, recreate_table=False
# leaves the existing table in place, and the connection string is taken
# from the PG_CONN_STR environment variable. The custom insert/update SQL
# strings are handed to the store.
document_store = MyPgVectorDocumentStore(
    embedding_dimension=768,
    vector_function="cosine_similarity",
    recreate_table=False,
    table_name="article_embeddings",
    connection_string=Secret.from_env_var("PG_CONN_STR"),
    sql_insert_string=insert_statement_string,
    sql_update_string=update_statement_string,
    language="french"
)

### Creating the embedding model

In [None]:

# NOTE(review): camembert-base looks like a plain language-model checkpoint
# rather than a model trained for sentence embeddings — confirm the vectors
# it yields are suitable for retrieval.
model_id = "camembert-base"

In [None]:
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

In [None]:
# Embedder for the document chunks, with embedding normalization requested.
embedder_component = SentenceTransformersDocumentEmbedder(
    model=model_id,
    normalize_embeddings=True,
    
)
# warm_up() must be called before run() so the model is loaded.
embedder_component.warm_up()

In [None]:
document_with_embeddings = embedder_component.run(split_documents.get("documents"))

In [None]:
document_with_embeddings

Not sure if the model is working, but I will come back here to check whether it was working.

### Writing Documents to the Database

In [None]:
from haystack.components.writers import DocumentWriter

In [None]:
document_writer = DocumentWriter(document_store=document_store)

In [None]:
document_writer.run(document_with_embeddings.get("documents"))

### Testing the Retriever

In [None]:
from psycopg.sql import Literal as SQLLiteral

In [None]:
# Full-text keyword search over the stored chunks: build a tsquery from the
# plain-text query, keep the chunks whose tsvector matches it, rank them with
# ts_rank_cd and return the top `limit` rows. The language, query and limit
# are bound as named parameters.
select_by_keyword_query_string = """
select article_id, chunk, ts_rank_cd(to_tsvector(%(language)s, chunk), query) as score
from article_embeddings, plainto_tsquery(%(language)s, %(query)s) query
where to_tsvector(%(language)s, chunk) @@ query 
order by score desc 
limit %(limit)s
"""

In [None]:
select_by_keyword_query_string

In [None]:
results = execute_query(database_connection, select_by_keyword_query_string, {"language": "french", "query": "francophonie", "limit": 5})

In [None]:
results

At this point, we have tested that we can write documents to our datastore; let us now write all the documents to the store.

In [None]:
from haystack import Pipeline


In [None]:
index_pipeline = Pipeline()

In [None]:
# Register the four indexing stages under the names used when wiring them:
# cleaning, splitting, embedding and writing to the store.
index_pipeline.add_component("text_cleaner", document_cleaner)
index_pipeline.add_component("text_splitter", recursive_text_splitter)
index_pipeline.add_component("embedder", embedder_component)
index_pipeline.add_component("writer", document_writer)

In [None]:
# Chain the stages linearly: cleaner -> splitter -> embedder -> writer.
index_pipeline.connect("text_cleaner", "text_splitter")
index_pipeline.connect("text_splitter", "embedder")
index_pipeline.connect("embedder", "writer")

In [None]:
# Run the indexing pipeline over every loaded article.
# NOTE(review): this passes all documents in a single call — confirm memory
# and runtime are acceptable for the full dataset.
index_pipeline.run( {"documents": haystack_documents})