In this notebook, we will implement the retrieval! We will query our Postgres database, which holds our text and its context. Then we will try to implement ranking capabilities.

#### Connecting to PostgreSQL

In [None]:
from pathlib import Path

In [None]:
from os import getenv
from dotenv import load_dotenv, find_dotenv

In [None]:
from urllib.parse import quote

In [None]:
# load_dotenv() runs first so that the POSTGRES_* variables below can be read
# from the process environment.
load_dotenv()
(
    database_user,
    database_password,
    database_host,
    database_port,
    database_name,
) = map(
    getenv,
    (
        'POSTGRES_USER',
        'POSTGRES_PASSWORD',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
    ),
)


In [None]:
# Connection URI assembled from the environment settings. The password is
# percent-encoded with safe='' because quote() leaves '/' untouched by default,
# and an unescaped '/' inside the password would corrupt the URI.
postgres_uri = (
    f'postgresql://{database_user}:{quote(database_password, safe="")}'
    f'@{database_host}:{database_port}/{database_name}'
)

In [None]:
from psycopg2 import connect
from pgvector.psycopg2 import register_vector

In [None]:
# Open the PostgreSQL connection from the settings read from the environment.
# ``dbname`` is the documented keyword; ``database`` is a deprecated alias.
database_connection = connect(
    user=database_user,
    password=database_password,
    host=database_host,
    port=database_port,
    dbname=database_name,
)

In [None]:
# Run every statement in autocommit mode (no explicit commit calls are made
# anywhere in this notebook).
database_connection.autocommit = True

In [None]:
from typing import List, Any, Optional, Tuple

In [None]:
def execute_query(database_connection, query, params=None) -> Optional[List[Any]]:
    """Run ``query`` on ``database_connection`` and return its rows, if any.

    Args:
        database_connection: Open connection whose ``cursor()`` is usable as a
            context manager.
        query: SQL text, optionally containing driver placeholders.
        params: Values bound to the placeholders, or ``None``.

    Returns:
        All rows of the result set, or ``None`` when the statement produced no
        result set (for example DDL statements).

    Raises:
        Any error raised by the driver while executing or fetching; errors are
        no longer swallowed silently.
    """
    with database_connection.cursor() as cursor:
        cursor.execute(query, params)
        # A DB-API cursor reports ``description`` as None when the last
        # statement returned no result set, so fetchall() is skipped instead of
        # being attempted and having its failure hidden by a bare ``except``.
        if cursor.description is None:
            return None
        return cursor.fetchall()

### Disable accented characters!

French is a language with diacritics (accents). My queries didn't work with accented characters, which is why I had to find a way to remove the accents from the characters. To achieve that, I followed what this guide in [the Postgres documentation recommends](https://www.postgresql.org/docs/current/unaccent.html).

In [None]:
# Enable the unaccent extension; "if not exists" keeps this cell safe to re-run.
execute_query(database_connection, "create extension if not exists unaccent")

In [None]:
# CREATE TEXT SEARCH CONFIGURATION has no IF NOT EXISTS form, so look the name
# up in the pg_ts_config catalog first. Without this guard, re-running the
# cell fails because the configuration already exists.
unaccent_config_exists = execute_query(
    database_connection,
    "SELECT 1 FROM pg_ts_config WHERE cfgname = %(name)s",
    {'name': 'unaccent_french'},
)
if not unaccent_config_exists:
    execute_query(database_connection, "CREATE TEXT SEARCH CONFIGURATION unaccent_french ( COPY = french );")

In [None]:
# Map the hword, hword_part and word token types of unaccent_french to the
# unaccent dictionary followed by french_stem.
execute_query(database_connection, "ALTER TEXT SEARCH CONFIGURATION unaccent_french ALTER MAPPING FOR hword, hword_part, word WITH unaccent, french_stem;")

###  Searching and Ranking

In [None]:
from random import randint

# Pick an article id at random, both bounds included.
# NOTE(review): assumes ids 1..1000 all exist in the article table — confirm.
random_id = randint(1, 1000)

In [None]:
# Fetch the content of the randomly chosen article. The id is passed as a
# bound parameter, like the other queries in this notebook, instead of being
# interpolated into the SQL text.
random_article = execute_query(
    database_connection,
    'SELECT content FROM article WHERE id = %(article_id)s',
    {'article_id': random_id},
)

In [None]:
from unicodedata import normalize as unicode_normalize

In [None]:
# Apply Unicode NFKD normalization to the first column of the first returned row.
# NOTE(review): raises IndexError when the query matched no article.
normalize_text = unicode_normalize('NFKD', random_article[0][0])

In [None]:
# Sample French questions used to exercise the search functions below.
questions = [
    "Quand l’Ordonnance présidentielle a-t-elle été lue sur le plateau de la Radiotélévision nationale congolaise (RTNC)?",
    "Qui a été nommé pour remplacer Emmanuel Ramazani Shadary au poste de vice-Premier ministre et ministre de l’Intérieur et sécurité?",
    "Où et quand Henri Mova Sakanyi est-il né?",
    "Quelle est la carrière politique de Henri Mova Sakanyi en République démocratique du Congo?",
    "Quel est le poste actuel de Henri Mova Sakanyi au sein du Parti du peuple pour la Reconstruction et la Démocratie (PPRD)?",
]

In [None]:
# Model identifier passed to both SentenceTransformer and CrossEncoder below.
# NOTE(review): "camembert-base" looks like a base language-model checkpoint
# rather than one trained for sentence similarity or re-ranking — confirm this
# is intended.
model_id = "camembert-base"

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder

In [None]:
from itertools import chain as itertools_chain

In [None]:
# Display the first sample question.
questions[0]

In [None]:
# Show how the unaccent_french configuration turns the first question into a
# phrase tsquery. The question is bound as a parameter rather than pasted into
# the SQL text a second time.
execute_query(
    database_connection,
    "select * from phraseto_tsquery('unaccent_french', %(question)s)",
    {'question': questions[0]},
)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_sentence_transformer(name: str) -> "SentenceTransformer":
    """Build the SentenceTransformer for ``name`` once and reuse it afterwards."""
    return SentenceTransformer(name)


def semantic_search(conn, query: str, limit: int = 5) -> List[Any]:
    """Return the chunks whose stored vectors are closest to ``query``.

    Args:
        conn: Open database connection handed to ``execute_query``.
        query: Natural-language text to embed and search for.
        limit: Maximum number of rows to return (defaults to 5, the value
            that was previously hard-coded in the SQL).

    Returns:
        Rows of ``(id, chunk)`` from ``article_embeddings`` ordered by the
        ``<=>`` distance between ``chunk_vector`` and the query embedding.
    """
    # The model is cached so that calling this in a loop does not reload it.
    embedding = _load_sentence_transformer(model_id).encode(query)
    semantic_search_query = (
        'SELECT id, chunk FROM article_embeddings '
        'ORDER BY chunk_vector <=> %(embedding)s LIMIT %(limit)s'
    )
    # The embedding is sent in its textual list form, e.g. "[0.1, 0.2, ...]".
    return execute_query(conn, semantic_search_query, {
        'embedding': str(embedding.tolist()),
        'limit': limit,
    })

def keyword_search(conn, query: str, limit: int = 5,
                   language: str = 'unaccent_french') -> List[Any]:
    """Run a full-text search for ``query`` over the stored chunks.

    Args:
        conn: Open database connection handed to ``execute_query``.
        query: Search text in ``websearch_to_tsquery`` syntax.
        limit: Maximum number of rows to return (defaults to 5, as before).
        language: Text search configuration used for both the query and the
            chunks (defaults to ``'unaccent_french'``, as before).

    Returns:
        Rows of ``(article_id, chunk)`` from ``article_embeddings`` that match
        the query, ordered by ``ts_rank_cd`` from best to worst.
    """
    keyword_search_query_string = """
        SELECT article_id, chunk
        FROM article_embeddings, websearch_to_tsquery(%(language)s, %(query)s) query
        WHERE to_tsvector(%(language)s, chunk) @@ query
        ORDER BY ts_rank_cd(to_tsvector(%(language)s, chunk), query) DESC
        LIMIT %(limit)s;
    """
    return execute_query(
        conn,
        keyword_search_query_string,
        {'language': language, 'query': query, 'limit': limit},
    )


from functools import lru_cache


@lru_cache(maxsize=4)
def _load_cross_encoder(name: str) -> "CrossEncoder":
    """Build the CrossEncoder for ``name`` once and reuse it afterwards."""
    return CrossEncoder(name)


def rerank(query: str, results: List[Tuple[int, str]]) -> List[Any]:
    """Order the distinct chunk texts in ``results`` by relevance to ``query``.

    Args:
        query: The question the chunks are scored against.
        results: Rows whose second element is the chunk text.

    Returns:
        The de-duplicated chunk texts, highest cross-encoder score first.
        An empty input yields an empty list without loading the model.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # outcome no longer depends on set iteration order.
    chunks = list(dict.fromkeys(result[1] for result in results))
    if not chunks:
        return []
    scores = _load_cross_encoder(model_id).predict(
        [(query, chunk) for chunk in chunks]
    )
    # Sort on the score only; equal scores keep their incoming order instead
    # of falling through to a comparison of the chunk strings.
    ranked = sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in ranked]

In [None]:
divider = '-' * 19
for question in questions:
    print(f'Question: {question}')
    # Only keyword hits are shown in this first pass: semantic search and
    # re-ranking are left out here.
    semantic_results = []
    keyword_results = keyword_search(database_connection, question)
    results = [*semantic_results, *keyword_results]
    for row in results:
        print(f'Article: {row[1]}')
    print(divider)

### Attempt to improve keyword search

In [None]:
import spacy
# Load the spaCy pipeline named 'fr_core_news_md'; it is reused by the cells below.
nlp = spacy.load('fr_core_news_md')

In [None]:
# Run the first sample question through the spaCy pipeline.
spacy_doc = nlp(questions[0])

In [None]:
# Display the noun chunks found in the parsed question.
list(spacy_doc.noun_chunks)

In [None]:
from textacy import extract

In [None]:
# Display the first sample question again for comparison with the key terms below.
questions[0]

In [None]:
 # Rank the key terms of the parsed question with TextRank, normalized to lemmas, keeping the top 10.
 extract.keyterms.textrank(spacy_doc, normalize="lemma", topn=10)

This improvement to the keyword search will retrieve the top three keywords from a question. Then we will use those keywords to perform a keyword search in Postgres.

In [None]:
def perform_keyword_extraction(text: str, topn: int = 3) -> str:
    """Extract the top key terms of ``text`` as a web-search style OR query.

    The text is parsed with spaCy and its key terms are ranked with textacy's
    TextRank implementation, normalized to lemmas.

    Args:
        text: The text (typically a question) to extract key terms from.
        topn: How many key terms to keep (defaults to 3, as before).

    Returns:
        The key terms, each wrapped in double quotes and joined with ``or``,
        ready to be passed to ``keyword_search``. Empty when no term is found.
    """
    spacy_doc = nlp(text)
    term_keys = extract.keyterms.textrank(spacy_doc, normalize="lemma", topn=topn)
    # A double quote inside a term would close the quoted phrase early, so it
    # is replaced by a space before the term is wrapped.
    phrases = (term[0].replace('"', ' ').strip() for term in term_keys)
    return " or ".join(f'"{phrase}"' for phrase in phrases if phrase)

In [None]:
divider = '-' * 19
for question in questions:
    print(f'Question: {question}')
    # Search with the extracted key terms instead of the raw question.
    keywords = perform_keyword_extraction(question)
    results = keyword_search(database_connection, keywords)
    print(f'Keywords: {keywords}', divider, sep='\n')
    for row in results:
        print(f'Article: {row[1]}')
    print(divider)

In [None]:
## This is where async/await code comes into consideration

In [None]:
for question in questions:
    print(f'Question: {question}')
    # Gather candidates from both retrievers.
    semantic_results = semantic_search(database_connection, question)
    keywords = perform_keyword_extraction(question)
    keyword_results = keyword_search(database_connection, keywords)
    results = semantic_results + keyword_results
    reranked_results = rerank(question, results)
    # rerank() returns the de-duplicated chunk texts, best match first. Print
    # those rather than the raw rows, otherwise the re-ranking is never shown.
    for i, chunk in enumerate(reranked_results, 1):
        print(f'Article {i}: {chunk}')
    print(19 * '-')

Looking at the results, I can see that the model needs some fine-tuning on the text to generate better results. But that is a step for another day.

With this, the retrieval part is complete. The next step will be to use a small model called Fusion-in-Decoder to perform generative question answering.