In [None]:
# Copy the agreements dataset (parquet) from the `s3` alias, under $VAULT_TOP_DIR,
# into the current working directory. `mc` is presumably the MinIO client -- confirm.
!mc cp s3/$VAULT_TOP_DIR/Accords/Construction_dataset_public/Dataset_public_accords_teletravail_Dares.parquet .

In [None]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings, OllamaEmbeddings
from langchain.document_loaders import TextLoader
from langchain.schema import Document, Generation, LLMResult
from langchain.llms import Ollama, BaseLLM
from langchain.chains import StuffDocumentsChain, RetrievalQA, LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import OpenAI
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from pathlib import Path
import json
import requests
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from tqdm import tqdm

# Chunking policy: split on blank lines into pieces of at most 3000 units with a
# 200-unit overlap; units are characters because length_function is `len`.
text_splitter = CharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200,
    separator="\n\n",
    is_separator_regex=False,
    length_function=len,
)

# Embedding model: BAAI/bge-m3, loaded on the CUDA device.
model_kwargs = dict(device="cuda")
embedder = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs=model_kwargs,
    show_progress=False,
)

In [None]:
# Load the agreements dataset from the parquet file in the working directory.
file = "Dataset_public_accords_teletravail_Dares.parquet"
df = pd.read_parquet(file)

In [None]:
# French spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("fr_core_news_lg")

# Stream over every agreement text with 5 worker processes; the listed
# components are skipped during processing.
pipe = nlp.pipe(
    df.texte_complet_accord,
    disable=["tagger", "parser", "attribute_ruler", "ner"],
    n_process=5,
)

# Words that are dropped in addition to stop words (compared against the
# lower-cased surface form of each token).
MOTS_COURANT = {
    "chapitre",
    "général",
    "accord",
    "article",
    "entreprise",
    "relatif",
    "signataire",
    "avenir",
    "soussigné",
    "code",
    "travail",
    "avenant",
}


def preprocess_token(token):
    """Return the lemma of ``token`` if it is kept, otherwise an empty string.

    A token is discarded when its lower-cased text is in ``MOTS_COURANT``, when
    it is a stop word or punctuation, when it is not purely alphabetic, or when
    it is shorter than 3 characters.

    Args:
        token: An object exposing ``is_stop``, ``is_punct``, ``is_alpha``,
            ``lemma_``, ``__str__`` and ``__len__`` (e.g. a spaCy token).

    Returns:
        ``token.lemma_`` for a kept token, ``""`` for a discarded one.
    """
    if str(token).lower() in MOTS_COURANT:
        return ""
    if token.is_stop or token.is_punct:
        return ""
    if not token.is_alpha or len(token) < 3:
        return ""
    return token.lemma_

def preprocess_text(text_nlp):
    """Turn an iterable of tokens into one space-separated string of lemmas.

    Each truthy token is passed through ``preprocess_token`` and the results
    are joined with single spaces. Discarded tokens contribute an empty string,
    so the output may contain runs of consecutive spaces.

    Args:
        text_nlp: Iterable of tokens (e.g. a processed spaCy document).

    Returns:
        The joined, preprocessed text.
    """
    return " ".join(preprocess_token(tok) for tok in text_nlp if tok)

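# Disabled: preprocess every agreement text (consumes `pipe`) and store the result in df["data_pretraites"].
#df["data_pretraites"] = [preprocess_text(texte) for texte in pipe]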

In [None]:
def get_count_entity(text, top_n=10):
    """Print, for each chunk of ``text``, its most frequent preprocessed words.

    The text is split with the module-level ``text_splitter``. Every chunk is
    processed by ``nlp``, cleaned with ``preprocess_text`` and its unigrams are
    counted with ``CountVectorizer``. One table with the columns ``mot`` and
    ``frequence`` is printed per chunk, sorted by decreasing frequency (ties
    keep the vectorizer's vocabulary order).

    Args:
        text: Raw text of one document.
        top_n: Number of rows printed per chunk (defaults to 10).

    Returns:
        None. The tables are only printed.
    """
    chunks = text_splitter.create_documents([text])

    # All chunks go through one single-process pipe call. Starting a pool of
    # worker processes for every one-document batch is pure overhead.
    docs = nlp.pipe(
        [chunk.page_content for chunk in chunks],
        disable=["tagger", "parser", "attribute_ruler", "ner"],
    )

    for doc in docs:
        cleaned = [preprocess_text(doc)]
        vectorizer = CountVectorizer(ngram_range=(1, 1))
        try:
            counts = vectorizer.fit_transform(cleaned)
        except ValueError:
            # CountVectorizer raises ValueError (empty vocabulary) when no
            # token survives preprocessing; report an empty table and go on
            # with the remaining chunks instead of aborting.
            print(pd.DataFrame(columns=["mot", "frequence"]))
            continue

        vocab = vectorizer.get_feature_names_out()
        # Column sums of the document-term matrix, flattened to one dimension.
        totals = np.asarray(counts.sum(axis=0)).ravel()
        ranked = sorted(zip(vocab, totals), key=lambda pair: pair[1], reverse=True)
        comptages_voc = pd.DataFrame(ranked, columns=["mot", "frequence"])

        print(comptages_voc.head(top_n))

In [None]:
# Build (or update) the persistent Chroma store: every agreement is split into
# chunks, each chunk is tagged with its origin, then embedded and stored.
vector_store = Chroma(embedding_function=embedder, persist_directory="./chroma_db")
for index, text in tqdm(df.texte_complet_accord.items(), total=len(df)):
    # Missing or blank texts produce nothing to embed.
    if not isinstance(text, str) or not text.strip():
        continue
    chunks = text_splitter.create_documents([text])
    if not chunks:
        continue

    chunk_ids = []
    for i, chunk in enumerate(chunks):
        chunk_id = f"{index}_{i}"
        chunk.metadata["id"] = chunk_id
        chunk.metadata["index"] = f"{index}"
        chunk_ids.append(chunk_id)

    # One call per agreement so its chunks are embedded as a batch. Passing
    # explicit ids means re-running this cell overwrites existing entries
    # instead of adding duplicates under fresh random ids. The ids are derived
    # from the DataFrame index, so that index must be unique.
    vector_store.add_documents(chunks, ids=chunk_ids)

In [None]:
# Recursively copy the local chroma_db directory to the Accords/ prefix under
# $VAULT_TOP_DIR on the `s3` alias.
!mc cp -r chroma_db s3/$VAULT_TOP_DIR/Accords/