In [3]:
import psycopg
from psycopg import Cursor
from openai import OpenAI
import os

In [4]:
conversation_file_path = '../data/domotik.txt'
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
db_connection_str = f"dbname=chatbot_rag_domotique user=postgres password={os.getenv('DB_PASSWORD')} host=localhost port=5432"

In [5]:
## Ici on traite le fichier. On l'ouvre, on supprime les lignes qui commencent par "<" et on enlève les espaces inutiles.

def create_conversation_list(file_path: str) -> list[str]:
    with open(file_path, encoding="latin-1") as file:
        conversation = file.read()
        text_list = conversation.split("\n")
        filtered_list = [chaine.removeprefix("     ") for chaine in text_list if not chaine.startswith("<")]
        return filtered_list

def calculate_embedding(corpus: str, client: OpenAI) -> list[float]:
    embeddings = client.embeddings.create(
        model="text-embedding-ada-002",
        input=corpus,
        encoding_format="float"
    ).data
    return embeddings[0].embedding

def save_embedding(corpus: str, embedding: list[float], cursor: Cursor) -> None:
    cursor.execute("""
            INSERT INTO embeddings (corpus, embedding) VALUES (%s, %s)
        """, (corpus, embedding))

def retrieve_similar_corpus(input_corpus: str, client: OpenAI, db_connection_str: str) -> tuple[int,str, list[float]]:
    input_corpus_embedding = calculate_embedding(corpus=input_corpus, client=client)
    with psycopg.connect(db_connection_str) as conn:
        with conn.cursor() as cur:
            query = """
                SELECT id, corpus, embedding
                FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT 1;
            """
            cur.execute(query, [input_corpus_embedding])
            result = cur.fetchone()
            return result

def generate_response(input_corpus: str, client: OpenAI=openai_client, db_connection_str: str=db_connection_str):
    similar_text = retrieve_similar_corpus(input_corpus=input_corpus, client=client, db_connection_str=db_connection_str)[1]
    completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "developer", "content": "Tu es un assistant domotique qui répond une ligne de commande en python lorsque l'utilsiateur te pose une question"},
        {
            "role": "user",
            "content": similar_text
        }
    ]
)
    return completion.choices[0].message.content

In [7]:
import psycopg
import numpy as np
import os
from dotenv import load_dotenv

load_dotenv()

with psycopg.connect(db_connection_str) as conn:
    with conn.cursor() as cur:
        cur.execute("""
            DROP TABLE IF EXISTS embeddings
        """)
        cur.execute("""
            CREATE EXTENSION IF NOT EXISTS vector;
        """)

        cur.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                id SERIAL PRIMARY KEY,
                corpus TEXT,
                embedding vector(1536)
            );
        """)

        corpus_list = create_conversation_list(file_path=conversation_file_path)
        for corpus in corpus_list:
            embedding = calculate_embedding(corpus=corpus, client=openai_client)
            save_embedding(corpus=corpus, embedding=embedding, cursor=cur)

        conn.commit()

Objectif : Créer un chatbot qui répond à des questions sur le texte de la conversation.

Recrée une base de données qui contient les embeddings de chaque phrase de la conversation.
Les embeddings sont calculés avec l'API de OpenAI.
La colonne embedding stoque maintenant des vecteurs plutôt que des flottants.

In [8]:
user_message = "allume la lumière de mon salon"
retrieve_similar_corpus(input_corpus=user_message, client=openai_client, db_connection_str=db_connection_str)

(9,
 'h: je vais voir',
 '[-0.012731301,-0.0115277795,-0.008561263,-0.011970155,-0.0035455078,0.021012826,-0.0119831655,-0.018957082,-0.012269408,-0.019880865,0.019399457,0.014234075,0.016406918,-0.013199698,0.0005749252,-0.03786212,0.032527596,0.012549146,0.025059259,-0.035051737,0.0022118764,-0.0009522453,0.03198113,-0.01133912,-0.0056858235,-0.008242493,0.0051458655,-0.00827502,7.461017e-05,-0.000993718,0.037263613,0.016471975,-0.002385899,-0.01615971,-0.0012913455,-0.003057594,-0.0014653681,-0.0013970602,0.007487852,0.003958608,0.020739594,-0.008183943,0.006316859,-0.02531948,0.005324767,0.019633656,-0.007032466,-0.016471975,-0.0035227386,0.005718351,0.01843664,0.0041732905,-0.02356299,-0.008125393,-0.008379108,0.018137386,0.002894956,0.013687612,0.009374453,0.0012490596,0.019360425,-0.022795338,-0.038929027,0.012334464,-0.0017581165,0.0003537376,-0.0016369512,-0.0006318485,-0.014390208,0.005178393,0.0049539525,0.028312018,0.007182093,-0.009868872,0.04965012,0.015092804,-0.02621724

In [None]:
generate_response(input_corpus=user_message)