In [36]:
import os
from typing import Optional

import psycopg
from dotenv import load_dotenv
from openai import OpenAI
from psycopg import Cursor

In [37]:
# Load variables from a local .env file *before* reading them: the OpenAI
# client and the connection string below capture the environment as it is when
# this cell runs, so loading .env in a later cell would be too late on a fresh
# kernel (Restart & Run All).
load_dotenv()

# Path to the raw transcript, relative to the notebook's working directory.
conversation_file_path = '../data/conversation.txt'
# Shared OpenAI client; the key comes from the OPENAI_API_KEY environment variable.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# libpq-style connection string; the password comes from DB_PASSWORD.
db_connection_str = f"dbname=chatbot_rag user=postgres password={os.getenv('DB_PASSWORD')} host=localhost port=5432"

In [38]:
## Dataset source: https://www.info.univ-tours.fr/~antoine/parole_publique/
## The transcript file is pre-processed here: markup lines (starting with "<")
## and blank lines are dropped, and the five-space indent is removed.

def create_conversation_list(file_path: str) -> list[str]:
    """Read a transcript file and return its usable lines.

    Args:
        file_path: Path to the transcript, read with the ``latin-1`` encoding.

    Returns:
        The lines of the file, in order, excluding
        * lines that start with ``"<"`` (markup), and
        * empty or whitespace-only lines (they carry no text to embed, and the
          embeddings endpoint refuses an empty input string).
        A leading run of exactly five spaces is removed from each kept line.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, encoding="latin-1") as file:
        raw_lines = file.read().split("\n")

    utterances = []
    for raw_line in raw_lines:
        if raw_line.startswith("<"):
            continue
        # Covers the "" produced by a trailing newline as well as spacer lines.
        if not raw_line.strip():
            continue
        utterances.append(raw_line.removeprefix("     "))
    return utterances

def calculate_embedding(corpus: str, client: OpenAI, model: str = "text-embedding-ada-002") -> list[float]:
    """Return the embedding vector of ``corpus`` computed by the OpenAI API.

    Args:
        corpus: Text to embed.
        client: OpenAI client used to issue the request.
        model: Name of the embedding model. Defaults to the model this notebook
            has always used; a different model must produce vectors whose
            dimension matches the column they are stored in.

    Returns:
        The embedding of the first (and only) item of the response, as floats.
    """
    response = client.embeddings.create(
        model=model,
        input=corpus,
        encoding_format="float",
    )
    # A single input string yields a single entry in ``data``.
    return response.data[0].embedding

def save_embedding(corpus: str, embedding: list[float], cursor: Cursor) -> None:
    """Insert one text/embedding pair into the ``embeddings`` table.

    The statement is only executed on ``cursor``; committing the transaction
    is left to the caller.

    Args:
        corpus: Text the embedding was computed from.
        embedding: Embedding vector of ``corpus``.
        cursor: Open database cursor used to run the INSERT.
    """
    insert_statement = """
            INSERT INTO embeddings (corpus, embedding) VALUES (%s, %s)
        """
    row_values = (corpus, embedding)
    cursor.execute(insert_statement, row_values)

def retrieve_similar_corpus(input_corpus: str, client: OpenAI, db_connection_str: str) -> Optional[tuple[int, str, str]]:
    """Fetch the stored row whose embedding is closest to ``input_corpus``.

    The input text is embedded with ``calculate_embedding`` and compared with
    the ``embedding`` column using the ``<=>`` operator; the single
    best-ranked row is returned.

    Args:
        input_corpus: Text to look up (for example the user's question).
        client: OpenAI client used to embed ``input_corpus``.
        db_connection_str: Connection string passed to ``psycopg.connect``.
            A new connection is opened and closed on every call.

    Returns:
        ``(id, corpus, embedding)`` for the closest row, or ``None`` when the
        table holds no rows. In this notebook the embedding comes back in its
        textual form (``'[0.1,0.2,...]'``), not as a list of floats.
    """
    query_embedding = calculate_embedding(corpus=input_corpus, client=client)
    nearest_neighbour_sql = """
                SELECT id, corpus, embedding
                FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT 1;
            """
    with psycopg.connect(db_connection_str) as conn:
        with conn.cursor() as cur:
            # The Python list is cast to a vector on the server side.
            cur.execute(nearest_neighbour_sql, [query_embedding])
            return cur.fetchone()

def generate_response(input_corpus: str, client: OpenAI=openai_client, db_connection_str: str=db_connection_str):
    """Answer ``input_corpus`` by rephrasing the closest stored transcript line.

    The closest line is retrieved with ``retrieve_similar_corpus`` and sent to
    the chat model together with a fixed instruction prompt.

    Args:
        input_corpus: The user's message.
        client: OpenAI client. The default is the module-level client as it
            was when this function was defined; re-run this cell after
            changing the configuration.
        db_connection_str: Database connection string (same remark as above
            for the default).

    Returns:
        The text content of the first completion choice.

    Raises:
        LookupError: If the ``embeddings`` table returned no row to rephrase.
    """
    nearest_row = retrieve_similar_corpus(input_corpus=input_corpus, client=client, db_connection_str=db_connection_str)
    if nearest_row is None:
        # Previously this surfaced as "'NoneType' object is not subscriptable".
        raise LookupError("No similar corpus found: the 'embeddings' table is empty.")
    similar_text = nearest_row[1]

    # NOTE(review): the prompt below contains typos ("extaites", duplicated
    # "cohérente") and only the retrieved line -- not the user's question -- is
    # forwarded to the model. Left untouched here because changing either
    # alters the model's output; confirm the intent before editing.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "developer", "content": "Vous êtes un assistant chatbot serviable travaillant dans le service d'accueil d'une université. Vous devez reformuler des réponses extaites d'une base de données de manière cohérente cohérente et compréhensible pour l'utilisateur"},
            {
                "role": "user",
                "content": similar_text
            }
        ]
    )
    return completion.choices[0].message.content

In [28]:
import psycopg
import numpy as np
import os
from dotenv import load_dotenv

# Loads .env into the process environment. Values that were already read into
# `openai_client` / `db_connection_str` are not refreshed by this call.
load_dotenv()

# Rebuild the vector store from scratch: drop the table, make sure the
# `vector` extension is available, recreate the table, then embed and insert
# every usable line of the transcript.
with psycopg.connect(db_connection_str) as conn:
    with conn.cursor() as cur:
        # IF EXISTS keeps the very first run (no table yet) from failing.
        cur.execute("""
            DROP TABLE IF EXISTS embeddings
        """)
        cur.execute("""
            CREATE EXTENSION IF NOT EXISTS vector;
        """)

        # 1536 is the dimension of the vectors stored in the `embedding` column.
        cur.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                id SERIAL PRIMARY KEY,
                corpus TEXT,
                embedding vector(1536)
            );
        """)

        corpus_list = create_conversation_list(file_path=conversation_file_path)
        for corpus in corpus_list:
            # Blank lines carry nothing to retrieve and the embeddings
            # endpoint rejects an empty input string.
            if not corpus.strip():
                continue
            embedding = calculate_embedding(corpus=corpus, client=openai_client)
            save_embedding(corpus=corpus, embedding=embedding, cursor=cur)

        # One commit for the whole rebuild.
        conn.commit()

Objectif : Créer un chatbot qui répond à des questions sur le texte de la conversation.

Recrée la table `embeddings` de la base de données, qui contient l'embedding de chaque phrase de la conversation.
Les embeddings sont calculés avec l'API d'OpenAI.
La colonne `embedding` stocke maintenant des vecteurs plutôt que des flottants.

In [47]:
# Sample question, used to inspect which stored line the vector search returns.
user_message = "Quel est le numéro de l'atelier ?"
retrieve_similar_corpus(
    input_corpus=user_message,
    client=openai_client,
    db_connection_str=db_connection_str,
)

(24,
 "c: d'accord donc ça c'est le numéro de l'atelier",
 '[-0.017514918,-0.006865848,-0.017285632,-0.0245591,-0.038290795,0.013578838,-0.018839683,-0.0016559559,-0.022228023,-0.00017992598,0.031310305,0.022979572,-0.0021654807,-0.018699564,-0.026444342,-0.002232356,0.023170644,0.0046717064,0.026979342,-0.020431947,0.006235311,0.024788385,0.003506168,-0.0063181086,0.0066429307,-0.008903948,0.0041876575,-0.0038819427,-0.002574693,-0.008528173,0.017183727,0.0112859765,2.4779629e-05,-0.022559214,-0.0032243372,0.0037832223,-0.0051939692,-0.006518734,-0.0004908158,0.01662325,0.043921046,-0.0037768532,0.00271322,-0.02761625,0.0074900156,0.016725155,-0.0106745465,0.0016734708,-0.0161392,0.017451227,-0.008400791,-0.0020253614,-0.027921963,-0.0158717,-0.0033564952,0.0034361084,-0.016088247,0.021310879,-0.0066174543,-0.008623709,0.0014234851,-0.029807206,-0.009502639,0.011712703,-0.01933647,0.001506283,0.020814091,0.0181773,-0.0049678674,0.016202891,0.013731696,0.025246957,0.015374913,-0.017744

In [48]:
# End-to-end check: retrieve the closest line and let the chat model rephrase it.
generate_response(
    input_corpus=user_message,
)

"Oui, c'est bien le numéro attribué à l'atelier. Puis-je vous aider avec autre chose concernant cet atelier ?"