# Extracting data from web pages 
✅ In this notebook we extract data from the web pages relevant to our needs, loading each link with LangChain's `WebBaseLoader`, so that the resulting list of documents can be fed to the RAG pipeline once it has been converted into vector embeddings.
 Then we preprocess the documents by splitting them into chunks, embedding the chunks, and saving them to a vector database; the database we use is `ChromaDB`.

 
 ![image.png](attachment:43b9fe21-fade-4e45-b738-9586a7aa0bfa.png)

 The PDFs will be processed in another notebook 😊

In [2]:
# %pip install -qU langchain-community beautifulsoup4

In [3]:
# %pip install langchain-unstructured

In [4]:
# pip cache purge


In [5]:
# pip install langchain sentence-transformers chromadb


In [6]:
# %pip install ollama

In [7]:
# pip install -U sentence-transformers


In [8]:
# pip install langchain_openai

In [9]:
# # pip install -U langchain-huggingface
# %pip install -U langchain-ollama
# %pip install chromadb
# %pip install chromadb
# ! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

# Importing the libraries that we are going to use

In [None]:
from typing import List
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_unstructured import UnstructuredLoader
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import os
import bs4
from sentence_transformers import SentenceTransformer
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
import ollama
from langchain.embeddings import OllamaEmbeddings, HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import chromadb
import uuid
import pandas as pd 
import os
from sklearn.manifold import TSNE
from langchain.embeddings import HuggingFaceEmbeddings




In [None]:
# embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

## Loading the model we are going to use for transforming text into embeddings
The model is open source and is a good choice for producing the embeddings that will be used later for retrieval.

In [None]:
# Load the embedding model directly with sentence-transformers.
# The embedding cells below call `embeddings_model.encode(...)` with
# `normalize_embeddings`, `batch_size` and `show_progress_bar`; that is the
# `SentenceTransformer` API. LangChain's `HuggingFaceEmbeddings` wrapper only
# exposes `embed_documents` / `embed_query`, so using it here makes every
# `.encode(...)` call fail with an AttributeError.
embeddings_model = SentenceTransformer("HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")


In [None]:
# data.delete(ids=data.get()['ids'])


# Configure `ChromaDB` for our work 

In [None]:
# chroma_client.delete_collection(name="my_dataaaa")  # Deletes "my_dataaaa"


In [None]:
# Folder (relative to the working directory) in which Chroma keeps its files.
chroma_db_path = "./chroma_db"
# Client that persists its collections under `chroma_db_path`.
chroma_client = chromadb.PersistentClient(
    path=chroma_db_path,
)

In [None]:
# Open the "my_dataaaa" collection, creating it first if it does not exist yet.
data = chroma_client.get_or_create_collection(
    name="my_dataaaa",
)

#  <p style="color: orange;">Document 0 Masteres-Procedure-de-Depot</p>

In [None]:
# Document 0: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/152/Masteres-Procedure-de-Depot",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Masteres_Procedure_de_Depot = loader.load()

In [None]:
Masteres_Procedure_de_Depot

## Splitting doc 0 into chunks

In [None]:
# Cut the loaded page into chunks of at most 50 units with an overlap of 10
# (units are whatever the splitter's default length function counts).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)
splits1 = text_splitter.split_documents(Masteres_Procedure_de_Depot)

In [None]:
splits1

## Saving to ChromaDB in the `data` collection

In [None]:
# Pull the text and the metadata out of every chunk; both lists keep the
# order of `splits1`, so index i in one matches index i in the other.
contents1 = [piece.page_content for piece in splits1]
metadata1 = [piece.metadata for piece in splits1]

In [None]:
# Embed the text of every chunk of document 0 in batches of 256, asking for
# normalised vectors and a progress bar.
# NOTE(review): `encode` with these keyword arguments is the SentenceTransformer
# API - confirm `embeddings_model` is a SentenceTransformer instance.
embeddings1 = embeddings_model.encode(
    [piece.page_content for piece in splits1],
    batch_size=256,
    normalize_embeddings=True,
    show_progress_bar=True,
)
print(embeddings1)

In [None]:
# One random UUID string per chunk of document 0.
# NOTE(review): the ids are random, so re-running the notebook against the
# persistent collection presumably stores the same chunks again under new ids.
ids = [str(uuid.uuid4()) for _ in contents1]

In [None]:
# Write document 0 to the collection: ids, chunk texts, their metadata and
# the precomputed embeddings, all aligned by position.
data.add(
    ids=ids,
    documents=contents1,
    metadatas=metadata1,
    embeddings=embeddings1,
)

In [None]:
# Visualising what was stored, as a DataFrame.
# The lists are copied on purpose: `append_data` extends the lists held in
# `data_dict` in place, and sharing them would silently grow `ids`,
# `contents1` and `metadata1` every time another document is appended.
data_dict = {
    "ID": list(ids),
    "Document": list(contents1),
    "Metadata": list(metadata1),
    # Only the shape of each vector is kept, to keep the table readable.
    "Embedding Shape": [np.array(embed).shape for embed in embeddings1],
}

df = pd.DataFrame(data_dict)
df.tail()

In [None]:
def append_data(contents, metadata, embeddings, ids=None):
    """Append one batch of chunks to ``data_dict`` and rebuild the preview ``df``.

    The module-level ``data_dict`` is extended in place and the module-level
    ``df`` is replaced by a new DataFrame built from it, so the table shows
    every chunk appended so far.

    Args:
        contents: Chunk texts of the batch.
        metadata: One metadata entry per chunk, in the same order.
        embeddings: One embedding per chunk; only ``np.array(e).shape`` is stored.
        ids: Optional identifiers to show in the ``ID`` column, e.g. the ids
            passed to the Chroma collection. When omitted, running 1-based
            integers continuing after the current last row of ``df`` are used
            (the historical behaviour).

    Raises:
        ValueError: If the arguments do not all have the same length. The
            check runs before anything is mutated, so ``data_dict`` is never
            left with columns of different lengths.
    """
    global df
    batch_size = len(contents)

    if ids is None:
        first_id = len(df) + 1
        ids = range(first_id, first_id + batch_size)
    ids = list(ids)

    lengths = {
        "ids": len(ids),
        "metadata": len(metadata),
        "embeddings": len(embeddings),
    }
    mismatched = {name: size for name, size in lengths.items() if size != batch_size}
    if mismatched:
        raise ValueError(
            f"append_data: expected {batch_size} items per argument, got {mismatched}"
        )

    # Computed before mutating so a malformed embedding cannot leave a partial update.
    shapes = [np.array(embed).shape for embed in embeddings]

    data_dict["ID"].extend(ids)
    data_dict["Document"].extend(contents)
    data_dict["Metadata"].extend(metadata)
    data_dict["Embedding Shape"].extend(shapes)

    df = pd.DataFrame(data_dict)


#  <p style="color: orange;">Document 1 Theses-Inscriptions-etProcedure-de-Depot</p>

In [None]:
# Document 1: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/147/Theses-Inscriptions-etProcedure-de-Depot",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Theses_Inscriptions_etProcedure_de_Depot = loader.load()

In [None]:
Theses_Inscriptions_etProcedure_de_Depot

## splitting into chunks the doc1

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits2 = text_splitter.split_documents( Theses_Inscriptions_etProcedure_de_Depot)

In [None]:
splits2

In [None]:
contents2= [doc.page_content for doc in splits2]
metadata2 = [doc.metadata for doc in splits2]

In [None]:
embeddings2 = embeddings_model.encode(
    [doc.page_content for doc in splits2], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings2)

In [None]:
ids2= [str(uuid.uuid4()) for _ in range(len(contents2))]

In [None]:
data.add(
    documents=contents2,
    embeddings=embeddings2,
    metadatas=metadata2,
    ids=ids2
)

In [None]:
append_data(contents2, metadata2, embeddings2)


In [None]:
df

#  <p style="color: orange;"> Document 2  ورشة_بعنوان_أهمية_الصحة_النفسية</p>

In [None]:
# Document 2: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4798/%D9%88%D8%B1%D8%B4%D8%A9-%D8%A8%D8%B9%D9%86%D9%88%D8%A7%D9%86-%D8%A3%D9%87%D9%85%D9%8A%D8%A9-%D8%A7%D9%84%D8%B5%D8%AD%D8%A9-%D8%A7%D9%84%D9%86%D9%81%D8%B3%D9%8A%D8%A9",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
warcha_mental_health = loader.load()

In [None]:
warcha_mental_health

## splitting doc 2 into chunks

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits3 = text_splitter.split_documents( warcha_mental_health)

In [None]:
splits3

In [None]:
contents3= [doc.page_content for doc in splits3]
metadata3 = [doc.metadata for doc in splits3]

In [None]:
embeddings3 = embeddings_model.encode(
    [doc.page_content for doc in splits3], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings3)

In [None]:
ids3 = [str(uuid.uuid4()) for _ in range(len(contents3))]

In [None]:
data.add(
    documents=contents3,
    embeddings=embeddings3,
    metadatas=metadata3,
    ids=ids3
)

In [None]:
append_data(contents3, metadata3, embeddings3)


In [None]:
df.tail()

# <p style="color: orange;"> Document 3 festival-de-la-creativite-estudiantine</p>

In [None]:
# Document 3: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4795/festival-de-la-creativite-estudiantine",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
festival_de_la_creativite_estudiantinet = loader.load()

In [None]:
festival_de_la_creativite_estudiantinet

## splitting the Doc3  into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits4 = text_splitter.split_documents( festival_de_la_creativite_estudiantinet)

In [None]:
print(splits4[0].page_content)  # First chunk's content
print(splits4[0].metadata) 

In [None]:
contents4= [doc.page_content for doc in splits4]
metadata4 = [doc.metadata for doc in splits4]

In [None]:
embeddings4 = embeddings_model.encode(
    [doc.page_content for doc in splits4], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings4)

In [None]:
ids4 = [str(uuid.uuid4()) for _ in range(len(contents4))]

In [None]:
data.add(
    documents=contents4,
    embeddings=embeddings4,
    metadatas=metadata4,
    ids=ids4
)

In [None]:
append_data(contents4, metadata4, embeddings4)


In [None]:
df

# <p style="color: orange;"> Document 4 bourses-d-alternance-2025</p>

In [None]:
# Document 4: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4813/bourses-d-alternance-2025",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Bourse_alternance = loader.load()

In [None]:
Bourse_alternance

## splitting doc 4 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits5 = text_splitter.split_documents( Bourse_alternance)

In [None]:
print(splits5[2].page_content)  
print(splits5[2].metadata) 

In [None]:
contents5= [doc.page_content for doc in splits5]
metadata5 = [doc.metadata for doc in splits5]

In [None]:
embeddings5 = embeddings_model.encode(
    [doc.page_content for doc in splits5], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings5)

In [None]:
ids5 = [str(uuid.uuid4()) for _ in range(len(contents5))]

In [None]:
data.add(
    documents=contents5,
    embeddings=embeddings5,
    metadatas=metadata5,
    ids=ids5
)

In [None]:
append_data(contents5, metadata5, embeddings5)


In [None]:
df

# <p style="color: orange;"> Document 5 the-indian-council-for-cultural-relations--iccr</p>

In [None]:
# Document 5: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4807/the-indian-council-for-cultural-relations--iccr-",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
the_indian_council_for_cultural_relations = loader.load()

In [None]:
the_indian_council_for_cultural_relations

## splitting doc 5 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits6 = text_splitter.split_documents( the_indian_council_for_cultural_relations)

In [None]:
splits6

In [None]:
contents6= [doc.page_content for doc in splits6]
metadata6 = [doc.metadata for doc in splits6]

In [None]:
embeddings6 = embeddings_model.encode(
    [doc.page_content for doc in splits6], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings6)

In [None]:
ids6 = [str(uuid.uuid4()) for _ in range(len(contents6))]

In [None]:
data.add(
    documents=contents6,
    embeddings=embeddings6,
    metadatas=metadata6,
    ids=ids6
)

In [None]:
append_data(contents6, metadata6, embeddings6)


In [None]:
df

In [None]:
# page_url = "https://fsm.rnu.tn/useruploads/files/au2425/NV%20ICCR.pdf"
# loader = PyPDFLoader(page_url)

# applications_guidelines_indian = []
# async for doc in loader.alazy_load():
#     applications_guidelines_indian.append(doc)

In [None]:
# applications_guidelines_indian

In [None]:
# documents6

In [None]:
# pip install "unstructured[pdf]"

#  <p style="color: orange;"> Document 6 Règlement intérieur des examens</p>

In [None]:
# Document 6: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/346/R%C3%A8glement-int%C3%A9rieur-des-examens",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Règlement_intérieur_des_examens = loader.load()

In [None]:
Règlement_intérieur_des_examens

## splitting doc 6 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits7 = text_splitter.split_documents( Règlement_intérieur_des_examens)

In [None]:
splits7

In [None]:
contents7= [doc.page_content for doc in splits7]
metadata7 = [doc.metadata for doc in splits7]

In [None]:
embeddings7 = embeddings_model.encode(
    [doc.page_content for doc in splits7], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings7)

In [None]:
ids7 = [str(uuid.uuid4()) for _ in range(len(contents7))]

In [None]:
data.add(
    documents=contents7,
    embeddings=embeddings7,
    metadatas=metadata7,
    ids=ids7
)

In [None]:
append_data(contents7, metadata7, embeddings7)


In [None]:
df

#  <p style="color: orange;">Document 7 Gestion des Stages & PFE (CPE-BR-01-00)</p>

In [None]:
# Document 7: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/73/Stages-&-PFE",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Stages_PFE = loader.load()

In [None]:
Stages_PFE

## splitting doc 7 into chunks

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits8 = text_splitter.split_documents( Stages_PFE)

In [None]:
splits8

In [None]:
contents8= [doc.page_content for doc in splits8]
metadata8 = [doc.metadata for doc in splits8]

In [None]:
embeddings8= embeddings_model.encode(
    [doc.page_content for doc in splits8], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings8)

In [None]:
ids8 = [str(uuid.uuid4()) for _ in range(len(contents8))]

In [None]:
data.add(
    documents=contents8,
    embeddings=embeddings8,
    metadatas=metadata8,
    ids=ids8
)

In [None]:
append_data(contents8, metadata8, embeddings8)


In [None]:
df

#  <p style="color: orange;">Document 8 Procédure de déroulement des stages facultatifs (CPE-IN-01-00)</p>

In [None]:
# Document 8: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/437/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages-facultatif",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Procédure_de_déroulement_des_stages_facultatifs = loader.load()

In [None]:
Procédure_de_déroulement_des_stages_facultatifs

## splitting doc 8 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits9 = text_splitter.split_documents( Procédure_de_déroulement_des_stages_facultatifs)

In [None]:
splits9

In [None]:
contents9= [doc.page_content for doc in splits9]
metadata9 = [doc.metadata for doc in splits9]

In [None]:
embeddings9 = embeddings_model.encode(
    [doc.page_content for doc in splits9], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings9)

In [None]:
ids9 = [str(uuid.uuid4()) for _ in range(len(contents9))]

In [None]:
data.add(
    documents=contents9,
    embeddings=embeddings9,
    metadatas=metadata9,
    ids=ids9
)

In [None]:
append_data(contents9, metadata9, embeddings9)


In [None]:
df

#  <p style="color: orange;"> Document 9 Procédure de déroulement des stages obligatoires (CPE-IN-02-00)</p>

In [None]:
# Document 9: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/75/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Procédure_de_déroulement_des_stages_obligatoires = loader.load()

In [None]:
Procédure_de_déroulement_des_stages_obligatoires

## splitting doc 9 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits10= text_splitter.split_documents(Procédure_de_déroulement_des_stages_obligatoires)

In [None]:
splits10

In [None]:
contents10= [doc.page_content for doc in splits10]
metadata10 = [doc.metadata for doc in splits10]

In [None]:
embeddings10 = embeddings_model.encode(
    [doc.page_content for doc in splits10], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings10)

In [None]:
ids10 = [str(uuid.uuid4()) for _ in range(len(contents10))]

In [None]:
data.add(
    documents=contents10,
    embeddings=embeddings10,
    metadatas=metadata10,
    ids=ids10
)

In [None]:
append_data(contents10, metadata10, embeddings10)


In [None]:
df

#  <p style="color: orange;"> Document 10 Partenariat international</p>

In [None]:
# Document 10: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/9/Partenariat-international",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Partenariat_international = loader.load()

In [None]:
Partenariat_international

## splitting doc 10 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits11 = text_splitter.split_documents(Partenariat_international)

In [None]:
splits11

In [None]:
contents11= [doc.page_content for doc in splits11]
metadata11 = [doc.metadata for doc in splits11]

In [None]:
embeddings11 = embeddings_model.encode(
    [doc.page_content for doc in splits11], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings11)

In [None]:
ids11 = [str(uuid.uuid4()) for _ in range(len(contents11))]

In [None]:
data.add(
    documents=contents11,
    embeddings=embeddings11,
    metadatas=metadata11,
    ids=ids11
)

In [None]:
append_data(contents11, metadata11, embeddings11)


In [None]:
df

#  <p style="color: orange;"> Document 11 Communication</p>

In [None]:
# Document 11: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/140/Communication",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Communication = loader.load()

In [None]:
Communication

## splitting doc 11 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits12 = text_splitter.split_documents(Communication)

In [None]:
splits12

In [None]:
contents12= [doc.page_content for doc in splits12]
metadata12 = [doc.metadata for doc in splits12]

In [None]:
embeddings12 = embeddings_model.encode(
    [doc.page_content for doc in splits12], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings12)

In [None]:
ids12 = [str(uuid.uuid4()) for _ in range(len(contents12))]

In [None]:
data.add(
    documents=contents12,
    embeddings=embeddings12,
    metadatas=metadata12,
    ids=ids12
)

In [None]:
append_data(contents12, metadata12, embeddings12)


In [None]:
df

#  <p style="color: orange;"> Document 12 Liens utiles</p>

In [None]:
# Document 12: fetch the links page and parse only the tags whose class is one
# of `links_container`, `link_item` or `link_tags`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/links",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("links_container", "link_item", "link_tags"),
        ),
    },
)
Liens_utiles = loader.load()

In [None]:
Liens_utiles

## splitting doc 12 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits13 = text_splitter.split_documents(Liens_utiles)

In [None]:
splits13

In [None]:
contents13= [doc.page_content for doc in splits13]
metadata13 = [doc.metadata for doc in splits13]

In [None]:
embeddings13 = embeddings_model.encode(
    [doc.page_content for doc in splits13], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings13)

In [None]:
ids13 = [str(uuid.uuid4()) for _ in range(len(contents13))]

In [None]:
data.add(
    documents=contents13,
    embeddings=embeddings13,
    metadatas=metadata13,
    ids=ids13
)

In [None]:
append_data(contents13, metadata13, embeddings13)


In [None]:
df

#  <p style="color: orange;"> Document 13 Departement Chimie </p>

In [None]:
# Document 13: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/departements/CH/4/chimie",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Chimie = loader.load()

In [None]:
Chimie

## splitting doc 13 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits14 = text_splitter.split_documents(Chimie)

In [None]:
splits14

In [None]:
contents14= [doc.page_content for doc in splits14]
metadata14 = [doc.metadata for doc in splits14]

In [None]:
embeddings14 = embeddings_model.encode(
    [doc.page_content for doc in splits14], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings14)

In [None]:
ids14 = [str(uuid.uuid4()) for _ in range(len(contents14))]

In [None]:
data.add(
    documents=contents14,
    embeddings=embeddings14,
    metadatas=metadata14,
    ids=ids14
)

In [None]:
append_data(contents14, metadata14, embeddings14)


In [None]:
df

#  <p style="color: orange;"> Document 14 Departement Mathematique  </p>

In [None]:
# Document 14: fetch the page and parse only the tags whose class is
# `selectEnsFilter`.
# NOTE(review): the result is bound to `math`, which would shadow the
# standard-library module of the same name if it is ever imported here.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/departements/M/1/mathematiques",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="selectEnsFilter")},
)
math = loader.load()

In [None]:
math

## splitting doc 14 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits15 = text_splitter.split_documents(math)

In [None]:
splits15

In [None]:
contents15= [doc.page_content for doc in splits15]
metadata15 = [doc.metadata for doc in splits15]

In [None]:
embeddings15 = embeddings_model.encode(
    [doc.page_content for doc in splits15], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings15)

In [None]:
ids15 = [str(uuid.uuid4()) for _ in range(len(contents15))]

In [None]:
data.add(
    documents=contents15,
    embeddings=embeddings15,
    metadatas=metadata15,
    ids=ids15
)

In [None]:
append_data(contents15, metadata15, embeddings15)


In [None]:
df

#  <p style="color: orange;"> Document 15 Departement informatique   </p>

In [None]:
# Document 15: fetch the page and parse only the tags whose class is
# `selectEnsFilter`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/departements/Info/2/informatique",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="selectEnsFilter")},
)
info = loader.load()

In [None]:
info

## splitting doc 15 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits16=text_splitter.split_documents(info)

In [None]:
splits16

In [None]:
contents16= [doc.page_content for doc in splits16]
metadata16 = [doc.metadata for doc in splits16]

In [None]:
embeddings16 = embeddings_model.encode(
    [doc.page_content for doc in splits16], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings16)

In [None]:
ids16 = [str(uuid.uuid4()) for _ in range(len(contents16))]

In [None]:
data.add(
    documents=contents16,
    embeddings=embeddings16,
    metadatas=metadata16,
    ids=ids16
)

In [None]:
append_data(contents16, metadata16, embeddings16)


In [None]:
df

#  <p style="color: orange;">Document 16 Departement Physique   </p>

# Document 16 Departement Physique

In [None]:
# Document 16: fetch the page and parse only the tags whose class is
# `selectEnsFilter`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/departements/PH/3/physique",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="selectEnsFilter")},
)
physique = loader.load()

In [None]:
physique

## splitting doc 16 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits17 = text_splitter.split_documents(physique)

In [None]:
splits17

In [None]:
contents17= [doc.page_content for doc in splits17]
metadata17 = [doc.metadata for doc in splits17]

In [None]:
embeddings17 = embeddings_model.encode(
    [doc.page_content for doc in splits17], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings17)

In [None]:
ids17 = [str(uuid.uuid4()) for _ in range(len(contents17))]

In [None]:
data.add(
    documents=contents17,
    embeddings=embeddings17,
    metadatas=metadata17,
    ids=ids17
)

In [None]:
append_data(contents17, metadata17, embeddings17)


In [None]:
df

#  <p style="color: orange;">Document  17  Enseignement Tronc Commun  </p>

In [None]:
# Document 17: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/departements/ET/5/enseignement-tronc-commun",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Enseignement_Tronc_Commun = loader.load()

In [None]:
Enseignement_Tronc_Commun

## splitting doc 17 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits18 = text_splitter.split_documents(Enseignement_Tronc_Commun)

In [None]:
splits18

In [None]:
contents18= [doc.page_content for doc in splits18]
metadata18 = [doc.metadata for doc in splits18]

In [None]:
embeddings18 = embeddings_model.encode(
    [doc.page_content for doc in splits18], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings18)

In [None]:
ids18 = [str(uuid.uuid4()) for _ in range(len(contents18))]

In [None]:
data.add(
    documents=contents18,
    embeddings=embeddings18,
    metadatas=metadata18,
    ids=ids18
)

In [None]:
append_data(contents18, metadata18, embeddings18)


In [None]:
df

#  <p style="color: orange;">Document 18 اخر بلاغ للتسجيل بالنسبة للسنة الجامعية  </p>


In [None]:
# Document 18: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4712/%D8%A7%D8%AE%D8%B1-%D8%A8%D9%84%D8%A7%D8%BA-%D9%84%D9%84%D8%AA%D8%B3%D8%AC%D9%8A%D9%84-%D8%A8%D8%A7%D9%84%D9%86%D8%B3%D8%A8%D8%A9-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
ekher_balegh = loader.load()

In [None]:
ekher_balegh

## splitting doc 18 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits19 = text_splitter.split_documents(ekher_balegh)

In [None]:
splits19

In [None]:
contents19= [doc.page_content for doc in splits19]
metadata19 = [doc.metadata for doc in splits19]

In [None]:
embeddings19 = embeddings_model.encode(
    [doc.page_content for doc in splits19], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings19)

In [None]:
ids19 = [str(uuid.uuid4()) for _ in range(len(contents19))]

In [None]:
data.add(
    documents=contents19,
    embeddings=embeddings19,
    metadatas=metadata19,
    ids=ids19
)

In [None]:
append_data(contents19, metadata19, embeddings19)


In [None]:
df

#  <p style="color: orange;">Document 19 Comptes extranet des étudiants 2024-2025 </p>


In [None]:
# Document 19: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4673/comptes-extranet-des-etudiants-2024-2025",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
comptes_extranet_des_etudiants = loader.load()

In [None]:
comptes_extranet_des_etudiants

## splitting doc 19 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits20 = text_splitter.split_documents(comptes_extranet_des_etudiants)

In [None]:
splits20

In [None]:
contents20= [doc.page_content for doc in splits20]
metadata20 = [doc.metadata for doc in splits20]

In [None]:
embeddings20 = embeddings_model.encode(
    [doc.page_content for doc in splits20], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings20)

In [None]:
ids20 = [str(uuid.uuid4()) for _ in range(len(contents20))]

In [None]:
data.add(
    documents=contents20,
    embeddings=embeddings20,
    metadatas=metadata20,
    ids=ids20
)

In [None]:
append_data(contents20, metadata20, embeddings20)


In [None]:
df

#  <p style="color: orange;"> Document 20  بلاغ الترسيم للسنة الجامعية </p>


In [None]:
# Document 20: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/4395/%D8%A8%D9%84%D8%A7%D8%BA-%D8%A7%D9%84%D8%AA%D8%B1%D8%B3%D9%8A%D9%85-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
balegh_tarsim = loader.load()


In [None]:
balegh_tarsim

## splitting doc 20 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits21 = text_splitter.split_documents(balegh_tarsim)

In [None]:
splits21

In [None]:
contents21= [doc.page_content for doc in splits21]
metadata21= [doc.metadata for doc in splits21]

In [None]:
embeddings21= embeddings_model.encode(
    [doc.page_content for doc in splits21], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings21)

In [None]:
ids21 = [str(uuid.uuid4()) for _ in range(len(contents21))]

In [None]:
data.add(
    documents=contents21,
    embeddings=embeddings21,
    metadatas=metadata21,
    ids=ids21
)

In [None]:
append_data(contents21, metadata21, embeddings21)


In [None]:
df

#  <p style="color: orange;">Document 21 Fiche de renseignements des diplômés </p>


In [None]:
# Document 21: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/138/Fiche-de-renseignements-des-dipl%C3%B4m%C3%A9s",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
Fiche_de_renseignements_des_diplome = loader.load()

In [None]:
Fiche_de_renseignements_des_diplome

## splitting doc 21 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits22 = text_splitter.split_documents(Fiche_de_renseignements_des_diplome)

In [None]:
splits22

In [None]:
contents22= [doc.page_content for doc in splits22]
metadata22 = [doc.metadata for doc in splits22]

In [None]:
embeddings22 = embeddings_model.encode(
    [doc.page_content for doc in splits22], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings22)

In [None]:
ids22 = [str(uuid.uuid4()) for _ in range(len(contents22))]

In [None]:
data.add(
    documents=contents22,
    embeddings=embeddings22,
    metadatas=metadata22,
    ids=ids22
)

In [None]:
append_data(contents22, metadata22, embeddings22)


In [None]:
df

#  <p style="color: orange;">Document 22 Loi de creation FSM  </p>


In [None]:
# Document 22: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/1/Loi-de-cr%C3%A9ation",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
loi_de_creation = loader.load()

In [None]:
loi_de_creation

## splitting doc 22 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits23 = text_splitter.split_documents(loi_de_creation)

In [None]:
splits23

In [None]:
contents23= [doc.page_content for doc in splits23]
metadata23 = [doc.metadata for doc in splits23]

In [None]:
embeddings23 = embeddings_model.encode(
    [doc.page_content for doc in splits23], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings23)

In [None]:
ids23 = [str(uuid.uuid4()) for _ in range(len(contents23))]

In [None]:
data.add(
    documents=contents23,
    embeddings=embeddings23,
    metadatas=metadata23,
    ids=ids23
)

In [None]:
append_data(contents23, metadata23, embeddings23)


In [None]:
df

#  <p style="color: orange;">Document 23 loi en chiffre  </p>


In [None]:
# Document 23: fetch the page and parse only the tags whose class is `content`.
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/3/En-chiffres",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="content")},
)
loi_en_chiffre = loader.load()

In [None]:
loi_en_chiffre

## splitting doc 23 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits24 = text_splitter.split_documents(loi_en_chiffre)

In [None]:
splits24

In [None]:
contents24= [doc.page_content for doc in splits24]
metadata24 = [doc.metadata for doc in splits24]

In [None]:
embeddings24 = embeddings_model.encode(
    [doc.page_content for doc in splits24], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings24)

In [None]:
ids24 = [str(uuid.uuid4()) for _ in range(len(contents24))]

In [None]:
data.add(
    documents=contents24,
    embeddings=embeddings24,
    metadatas=metadata24,
    ids=ids24
)

In [None]:
append_data(contents24, metadata24, embeddings24)


In [None]:
df

# LICENCE

#  <p style="color: orange;">Document 24 PARCOURS LMD Mathématiques Appliquées</p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=ABhRHFxzAmNUZVIoBj4ENQYgX2sBPA==&etab=VjJQYQk7",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_math_appli = loader.load()

In [None]:
parcours_math_appli

## splitting doc 24 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits25 = text_splitter.split_documents(parcours_math_appli)

In [None]:
splits25

In [None]:
contents25= [doc.page_content for doc in splits25]
metadata25 = [doc.metadata for doc in splits25]

In [None]:
embeddings25 = embeddings_model.encode(
    [doc.page_content for doc in splits25], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings25)

In [None]:
ids25 = [str(uuid.uuid4()) for _ in range(len(contents25))]

In [None]:
data.add(
    documents=contents25,
    embeddings=embeddings25,
    metadatas=metadata25,
    ids=ids25
)

In [None]:
append_data(contents25, metadata25, embeddings25)


In [None]:
df

#  <p style="color: orange;"> Document 25 parcours lmd Computer Science</p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=UkpTHlxzUzJXZlctDjJTYFZwDDI=&etab=VjJZaAg6",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_computer_science = loader.load()

In [None]:
parcours_computer_science

## splitting doc 25 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits26 = text_splitter.split_documents(parcours_computer_science)

In [None]:
splits26

In [None]:
contents26= [doc.page_content for doc in splits26]
metadata26= [doc.metadata for doc in splits26]

In [None]:
embeddings26 = embeddings_model.encode(
    [doc.page_content for doc in splits26], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings26)

In [None]:
ids26 = [str(uuid.uuid4()) for _ in range(len(contents26))]

In [None]:
data.add(
    documents=contents26,
    embeddings=embeddings26,
    metadatas=metadata26,
    ids=ids26
)

In [None]:
append_data(contents26, metadata26, embeddings26)


In [None]:
df

#  <p style="color: orange;"> Document 26 Parcours LMD Mesures et Instrumentation</p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NXGlp1UjNWZwN5BzkHMVN1DzsBPA==&etab=BGBYaQw+",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_Mesures = loader.load()

In [None]:
parcours_Mesures

## splitting doc 26 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits27 = text_splitter.split_documents(parcours_Mesures)

In [None]:
splits27

In [None]:
contents27= [doc.page_content for doc in splits27]
metadata27= [doc.metadata for doc in splits27]

In [None]:
embeddings27 = embeddings_model.encode(
    [doc.page_content for doc in splits27], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings27)

In [None]:
ids27 = [str(uuid.uuid4()) for _ in range(len(contents27))]

In [None]:
data.add(
    documents=contents27,
    embeddings=embeddings27,
    metadatas=metadata27,
    ids=ids27
)

In [None]:
append_data(contents27, metadata27, embeddings27)


In [None]:
df

#  <p style="color: orange;">Document 27 Parcours LMD Physique </p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NZFFp1UjNcbVshDjAENlJ0X2tTbg==&etab=AWUDMl9t",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_physique = loader.load()

In [None]:
parcours_physique

## splitting doc 27 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits28 = text_splitter.split_documents(parcours_physique)

In [None]:
splits28

In [None]:
contents28= [doc.page_content for doc in splits28]
metadata28= [doc.metadata for doc in splits28]

In [None]:
embeddings28 = embeddings_model.encode(
    [doc.page_content for doc in splits28], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings28)

In [None]:
ids28 = [str(uuid.uuid4()) for _ in range(len(contents28))]

In [None]:
data.add(
    documents=contents28,
    embeddings=embeddings28,
    metadatas=metadata28,
    ids=ids28
)

In [None]:
append_data(contents28, metadata28, embeddings28)


In [None]:
df

#  <p style="color: orange;">Document 28 Parcours LMD chimie  </p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NYFV9wVDVcbQF7BzkKPQQiCz8HOg==&etab=B2NUZQAy",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_chimie = loader.load()

In [None]:
parcours_chimie

## splitting doc 28 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits29= text_splitter.split_documents(parcours_chimie)

In [None]:
splits29

In [None]:
contents29= [doc.page_content for doc in splits29]
metadata29= [doc.metadata for doc in splits29]

In [None]:
embeddings29 = embeddings_model.encode(
    [doc.page_content for doc in splits29], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings29)

In [None]:
ids29 = [str(uuid.uuid4()) for _ in range(len(contents29))]

In [None]:
data.add(
    documents=contents29,
    embeddings=embeddings29,
    metadatas=metadata29,
    ids=ids29
)

In [None]:
append_data(contents29, metadata29, embeddings29)


In [None]:
df

#  <p style="color: orange;"> Document 29 Parcours LMD Physique-Chimie</p>


In [None]:
loader = WebBaseLoader(
    web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=Bh4HSlh3VTQGN1ctVWsAMVJ0DjA=&etab=VjJZaA0/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("center")
        )
    ),
)
parcours_physique_chimie = loader.load()

In [None]:
parcours_physique_chimie

## splitting doc 29 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits30= text_splitter.split_documents(parcours_physique_chimie)

In [None]:
splits30

In [None]:
contents30= [doc.page_content for doc in splits30]
metadata30= [doc.metadata for doc in splits30]

In [None]:
embeddings30 = embeddings_model.encode(
    [doc.page_content for doc in splits30], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings30)

In [None]:
ids30 = [str(uuid.uuid4()) for _ in range(len(contents30))]

In [None]:
data.add(
    documents=contents30,
    embeddings=embeddings30,
    metadatas=metadata30,
    ids=ids30
)

In [None]:
append_data(contents30, metadata30, embeddings30)


In [None]:
df

#  <p style="color: orange;">Document 30 Document de demande de diplome </p>


In [None]:
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/articles/1249/demande-de-diplomes",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("content")
        )
    ),
)
doc_demande_de_diplome = loader.load()

In [None]:
doc_demande_de_diplome

## splitting doc 30 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits31 = text_splitter.split_documents(doc_demande_de_diplome)

In [None]:
splits31

In [None]:
contents31= [doc.page_content for doc in splits31]
metadata31= [doc.metadata for doc in splits31]

In [None]:
embeddings31 = embeddings_model.encode(
    [doc.page_content for doc in splits31], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings31)

In [None]:
ids31 = [str(uuid.uuid4()) for _ in range(len(contents31))]

In [None]:
data.add(
    documents=contents31,
    embeddings=embeddings31,
    metadatas=metadata31,
    ids=ids31
)

In [None]:
append_data(contents31, metadata31, embeddings31)


In [None]:
df

#  <p style="color: orange;">Document 31 Information sur le mastère de recherche en mathématiques  </p>


In [None]:
loader = WebBaseLoader(
    web_paths=("https://um.rnu.tn/fr/formations/formation-lmd/master/mat%C3%A8re-de-recherche-en-math%C3%A9matiques-fsm/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("single-post-content single-content")
        )
    ),
)
info_supp_mastere_math = loader.load()

In [None]:
info_supp_mastere_math

## splitting doc 31 into chunks 

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
splits32 = text_splitter.split_documents(info_supp_mastere_math)

In [None]:
splits32

In [None]:
contents32= [doc.page_content for doc in splits32]
metadata32 = [doc.metadata for doc in splits32]

In [None]:
embeddings32 = embeddings_model.encode(
    [doc.page_content for doc in splits32], 
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings32)

In [None]:
ids32 = [str(uuid.uuid4()) for _ in range(len(contents32))]

In [None]:
data.add(
    documents=contents32,
    embeddings=embeddings32,
    metadatas=metadata32,
    ids=ids32
)

In [None]:
append_data(contents32, metadata32, embeddings32)


In [None]:
df

# putting all lists in one dictionary 

In [None]:
# data = {
#     "Masteres_Procedure_de_Depot": Masteres_Procedure_de_Depot,
#     "Theses_Inscriptions_etProcedure_de_Depot": Theses_Inscriptions_etProcedure_de_Depot,
#     "ورشة_بعنوان_أهمية_الصحة_النفسية": warcha_mental_health,
#     "festival_de_la_creativite_estudiantinet": festival_de_la_creativite_estudiantinet,
#     "Bourse_alternance": Bourse_alternance,
#     "the_indian_council_for_cultural_relations": the_indian_council_for_cultural_relations,
#     "Règlement_intérieur_des_examens": Règlement_intérieur_des_examens,
#     "Stages_PFE":Stages_PFE,
#     "Procédure_de_déroulement_des_stages_facultatifs": Procédure_de_déroulement_des_stages_facultatifs,
#     "Procédure_de_déroulement_des_stages_obligatoires": Procédure_de_déroulement_des_stages_obligatoires,
#     "Partenariat_international": Partenariat_international,
#     "Communication": Communication,
#     "Liens_utiles": Liens_utiles,
#     "Chimie": Chimie,
#     "math": math,
#     "info": info,
#     "physique": physique,
#     "Enseignement_Tronc_Commun": Enseignement_Tronc_Commun,
#     "ekher_balegh": ekher_balegh,
#     "comptes_extranet_des_etudiants": comptes_extranet_des_etudiants,
#     "elements": elements,
#     "Fiche_de_renseignements_des_diplome": Fiche_de_renseignements_des_diplome,
#     "loi_de_creation": loi_de_creation,
#     "loi_en_chiffre": loi_en_chiffre,
#     "parcours_math_appli": parcours_math_appli,
#     "parcours_computer_science":parcours_computer_science,
#     "parcours_Mesures": parcours_Mesures,
#     "parcours_physique": parcours_physique,
#     "parcours_chimie":parcours_chimie,
#     "parcours_physique_chimie": parcours_physique_chimie,
#     "doc_demande_de_diplome": doc_demande_de_diplome,
#     "info_supp_mastere_math": info_supp_mastere_math
# }


# Visualizing ChromaDB 

In [None]:
data = data.get(include=['embeddings'])
print(data)


In [None]:
if 'embeddings' in data:
    embeddings_array = np.array(data['embeddings'])
    print("Embeddings shape:", embeddings_array.shape)
else:
    print("No embeddings found in vectorstore.")


In [None]:
if embeddings_array.size > 0: 
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings_array)

    # Plot embeddings
    plt.figure(figsize=(8, 6))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title("2D Visualization of Embeddings")
    plt.show()
else:
    print("No embeddings available for PCA visualization.")


# Manually testing retrieval (first attempt, just checking) 👌

In [None]:
data = chroma_client.get_collection(name="my_dataaaa")

In [None]:
query_embedding =  embeddings_model.encode("Quelles sont les documents de stage obligatoire?")

results = data.query(
    query_embeddings=[query_embedding],  
    n_results=100
)

In [None]:
for doc in results['documents']:
    print(doc)

In [None]:
# Open the on-disk Chroma store and check that the expected collection exists.
chroma_client = chromadb.PersistentClient(path="chroma_db")
collections = chroma_client.list_collections()
print("Available collections:", collections)
# Depending on the chromadb release, list_collections() yields Collection
# objects or plain name strings; normalize to names before the membership test
# so the check does not silently fail on Collection objects.
collection_names = [getattr(entry, "name", entry) for entry in collections]
if "my_dataaaa" in collection_names:
    collection = chroma_client.get_collection(name="my_dataaaa")
    print(" Successfully loaded collection:", collection)
else:
    print("Collection 'my_dataaaa' does not exist. Available collections:", collections)
