In [9]:
pip install sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-4.0.0-py3-none-any.whl.metadata (16 kB)
Downloading pinecone_client-4.0.0-py3-none-any.whl (214 kB)
   ---------------------------------------- 0.0/214.5 kB ? eta -:--:--
   ----- --------------------------------- 30.7/214.5 kB 660.6 kB/s eta 0:00:01
   ---------------------------------------- 214.5/214.5 kB 2.2 MB/s eta 0:00:00
Installing collected packages: pinecone-client
Successfully installed pinecone-client-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json

# Parse the Code de la route export. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the
# lifetime of the kernel.
with open('code_de_la_route_json.json', encoding="utf8") as f:
    data = json.load(f)

In [2]:
import re

# Matches an HTML tag, a tab, or a newline. The tag branch uses [^>]* rather
# than .*? so that a tag broken across lines is still matched as a single
# unit ('.' does not match a newline, which left such tags in the text).
CLEANR = re.compile(r'<[^>]*>|\t|\n')

def cleanhtml(raw_html):
    """Replace every HTML tag, tab and newline in ``raw_html`` with a space.

    Args:
        raw_html: The markup string to clean.

    Returns:
        The input with each match of ``CLEANR`` replaced by a single space.
        Runs of spaces are not collapsed and the result is not stripped.
    """
    return CLEANR.sub(' ', raw_html)
    
# Flatten the nested export into one record per article. The walk goes four
# 'sections' levels deep and then into each innermost section's 'articles';
# every record keeps that innermost section's title, the article number and
# the article text with markup stripped by cleanhtml.
processed_data = [
    {
        'chapitreNom': chapter['title'],
        'articleNum': entry['num'],
        'content': cleanhtml(entry['content']),
    }
    for part in data['sections']
    for book in part['sections']
    for heading in book['sections']
    for chapter in heading['sections']
    for entry in chapter['articles']
]
                


In [3]:
import pandas as pd

# Tabulate the flattened records: one row per article, with the columns
# chapitreNom, articleNum and content taken from the record keys.
df = pd.DataFrame(processed_data)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,chapitreNom,articleNum,content
0,Chapitre 1er : Responsabilité pénale.,L121-1,Le conducteur d'un véhicule est responsab...
1,Chapitre 1er : Responsabilité pénale.,L121-2,Par dérogation aux dispositions de l'article...
2,Chapitre 1er : Responsabilité pénale.,L121-3,Par dérogation aux dispositions de l'article ...
3,Chapitre 1er : Responsabilité pénale.,L121-4,Sauf cas de versement immédiat d'une amende f...
4,Chapitre 1er : Responsabilité pénale.,L121-4-1,Lorsqu'un avis d'amende forfaitaire majorée c...
...,...,...,...
489,Chapitre 1er : Responsabilité pénale,A121-1,Les informations que le représentant légal ...
490,Chapitre 1er : Responsabilité pénale,A121-1-1,Les informations que la personne physique aya...
491,Chapitre 1er : Responsabilité pénale,A121-2,I.- Lorsque ces informations sont adressées...
492,Chapitre 1er : Responsabilité pénale,A121-3,I.- Lorsque ces informations sont adressées...


In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Sentence-embedding model used both to index the articles and to embed queries.
retriever = SentenceTransformer("dangvantuan/sentence-camembert-base", device=device)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Display the loaded model's module stack (transformer and pooling configuration).
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: CamembertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [8]:
from pinecone import Pinecone, ServerlessSpec
import os

# Never store the API key in the notebook: read it from the environment so
# the secret is not saved, shared or committed with the file.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=_pinecone_api_key)


In [79]:
import time

index_name = "traffic-question-answering-2"

# The index dimension must equal the size of the vectors produced by the
# sentence transformer (768 for the model loaded in this notebook).
# list_indexes() yields index descriptions rather than plain strings, so the
# membership test is made against .names(); testing the string against the
# descriptions themselves does not match, and create_index() is then
# attempted again for an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,
        metric='dotproduct',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Wait until the freshly created index reports ready before using it.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
index = pc.Index(index_name)

In [80]:

from tqdm.auto import tqdm

# Embed and upload the articles in batches, so that each request carries a
# reasonable number of vectors instead of a handful.
batch_size = 32

for i in tqdm(range(0, len(df), batch_size)):
    # Clamp the final batch to the end of the frame.
    i_end = min(i + batch_size, len(df))
    batch = df.iloc[i:i_end]
    # One embedding per article text.
    # NOTE(review): the model printout reports max_seq_length=128, so longer
    # articles are presumably truncated before embedding — confirm whether
    # the texts should be chunked first.
    emb = retriever.encode(batch["content"].tolist()).tolist()
    # Every column of the row is stored as metadata so the article text can
    # be returned with the query matches.
    meta = batch.to_dict(orient="records")
    # Row positions, as strings, serve as the vector IDs.
    ids = [str(idx) for idx in range(i, i_end)]
    # Upsert (id, values, metadata) tuples; re-running overwrites the same IDs.
    index.upsert(vectors=list(zip(ids, emb, meta)))

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████| 16/16 [00:07<00:00,  2.04it/s]


{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 416}},
 'total_vector_count': 416}

In [12]:
from transformers import AutoTokenizer, CamembertForQuestionAnswering

# Load the tokenizer and the CamemBERT question-answering model of the
# QAmembert checkpoint from Hugging Face.
qa_checkpoint = 'CATIE-AQ/QAmembert'
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = CamembertForQuestionAnswering.from_pretrained(qa_checkpoint)

In [89]:
def query_pinecone(query, top_k):
    """Retrieve the ``top_k`` indexed passages closest to ``query``.

    Args:
        query: Question text, embedded with the notebook's ``retriever``.
        top_k: Number of matches to request from the index.

    Returns:
        The Pinecone query response, with metadata included for each match.
    """
    # Encoding a single string yields one flat vector, which is the shape the
    # query call takes; encoding [query] produced a nested [[...]] payload.
    xq = retriever.encode(query).tolist()
    return index.query(vector=xq, top_k=top_k, include_metadata=True)

def format_query(query, context):
    """Merge the text of the retrieved matches into one context string.

    Args:
        query: The question; returned unchanged.
        context: Iterable of matches, each exposing its passage text under
            ``match['metadata']['content']``.

    Returns:
        A ``(query, context)`` tuple where ``context`` is every passage
        prefixed with a space and joined with single spaces.
    """
    passages = []
    for match in context:
        # Each passage keeps a leading space before the pieces are joined.
        passages.append(" {}".format(match['metadata']['content']))
    return query, " ".join(passages)

In [90]:
from transformers import pipeline

# Question-answering pipeline backed by the pretrained QAmembert (CamemBERT)
# checkpoint, used for both the model weights and the tokenizer.
qa_model_id = "CATIE-AQ/QAmembert"
qa_engine = pipeline("question-answering", model=qa_model_id, tokenizer=qa_model_id)

def generate_answer(query, context):
    """Run the question-answering pipeline on a question and its context.

    Args:
        query: The question to answer.
        context: The text in which the answer is searched.

    Returns:
        Whatever ``qa_engine`` returns for this question/context pair; the
        notebook reads its ``'answer'`` entry.
    """
    return qa_engine(question=query, context=context)

In [91]:
# Example question; the three closest passages are requested from the index.
query = "Qui peut suivre un apprentissage en conduite supervisée des véhicules légers ?"
result = query_pinecone(query, top_k=3)

# Build the context string from the matches returned by Pinecone.
query, context = format_query(query, result["matches"])

In [93]:
# Show only the 'answer' entry of the question-answering result.
generate_answer(query, context)['answer']

"Toute personne âgée d'au moins dix-huit ans"