# Imports et installation de bibliothèques necéssaires

In [21]:
%pip install langchain langchain_core langchain_community langchain_pinecone langchain_huggingface langchainhub langchain_pinecone langchain-google-genai pinecone-client jq transformers pydantic
%pip install llama-index-core llama-index datasets llama-index-vector-stores-pinecone llama-index-embeddings-huggingface
%pip install --upgrade mistralai

Collecting langchain-google-genai
  Downloading langchain_google_genai-1.0.8-py3-none-any.whl.metadata (3.8 kB)
Collecting pinecone-client
  Using cached pinecone_client-5.0.0-py3-none-any.whl.metadata (19 kB)
Downloading langchain_google_genai-1.0.8-py3-none-any.whl (38 kB)
Using cached pinecone_client-5.0.0-py3-none-any.whl (244 kB)
Installing collected packages: langchain-google-genai, pinecone-client
  Attempting uninstall: pinecone-client
    Found existing installation: pinecone-client 3.2.2
    Uninstalling pinecone-client-3.2.2:
      Successfully uninstalled pinecone-client-3.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-vector-stores-pinecone 0.1.8 requires pinecone-client<4.0.0,>=3.0.2, but you have pinecone-client 5.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed langchain-google-genai-1.0.8 pinecone-client-5.0.

In [22]:
import os
import glob
import random
from google.colab import drive
from tqdm.notebook import trange, tqdm

# Mistral
import requests
from langchain.chains.llm import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory
from mistralai.models.chat_completion import ChatMessage
from langchain.chains import ConversationalRetrievalChain
from langchain_core.output_parsers import StrOutputParser
from langchain_mistralai.chat_models import ChatMistralAI

# Pinecone
from pinecone import Pinecone, ServerlessSpec

# Langchain
from langchain import hub
from langchain import HuggingFaceHub
from langchain.chains import ConversationChain
from langchain_pinecone import PineconeVectorStore  ##
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# # Llama index
# from llama_index.core.extractors import TitleExtractor
# from llama_index.core.ingestion import IngestionPipeline
# from llama_index.core.node_parser import SentenceSplitter
# from llama_index.core import StorageContext, VectorStoreIndex
# from llama_index.core.storage.docstore import SimpleDocumentStore
# from llama_index.vector_stores.pinecone import PineconeVectorStore ##
# # from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Définition du modèle d'embeddings

In [3]:
model_name = "mixedbread-ai/mxbai-embed-large-v1"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                   model_kwargs=model_kwargs,
                                   encode_kwargs=encode_kwargs)

In [4]:
embeddedtext = embeddings.embed_documents("This is my first text")
print(len(embeddedtext[0]))

1024


# Création du Vector Store

In [5]:
os.environ['PINECONE_API_KEY'] = "a1ae148e-e273-4cc8-895e-b1135d91b65f"

pc = Pinecone()

if "poem-gen-rag" in pc.list_indexes().names():
    pc.delete_index("poem-gen-rag")

index = pc.create_index(name="poem-gen-rag",
                        dimension=1024,
                        metric="cosine",
                        spec=ServerlessSpec(cloud='aws',
                                            region='us-east-1'))

# Récupération des textes

In [6]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
files = []
for f in glob.glob(folder_path+"/*/*/*"):
  files.append(f)
print("File names :",files[:10])
print(len(files))

File names : ['/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsAParanaeticallOrAdvisiveVersetoHisFriendMrJohnWicksPoembyRobertHerrick.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsRememberingAFriendPoembyPdishere.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsFriendYourWhiteBeardSweepsTheGroundPoembyStephenCrane.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsADepartedFriendPoembyJuliaAnnMoore.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsAFriendPoembyElizabethAnne.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsFriend1ShireenIsalMyDearFriendPoembyDrGeetaRadhakrishnaMenon.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsToAFriendWhoseWorkHasComeToNothingPoembyWilliamButlerYeats.txt', '/content/drive/MyDrive/projet/poems_dataset/topics/friend/FriendPoemsMyBestFriendPoembyGordonWhittaker.txt', '/co

In [9]:
## Création d'un dataframe contenant les données pour la création de l'index
dict_data = {"id" : [], "values" : [], "metadata": []}
random_files = random.sample(files, 200)

for i in range(len(random_files)):
  embed = embeddings.embed_documents(random_files[i])
  for j in range(len(embed)):
    id = str(i)+"-"+str(j)
    value = embed[j]
    meta = {"chunk" : str(j), "file_path" : random_files[i]}

    dict_data["id"].append(id)
    dict_data["values"].append(value)
    dict_data["metadata"].append(meta)

In [14]:
print(len(dict_data["values"][0]))
print(dict_data["values"][0])

1024
[-0.9735565781593323, -0.9750276803970337, -0.32144665718078613, 0.7462085485458374, 0.2115674465894699, -0.255120187997818, -0.25930020213127136, 0.5649267435073853, 1.0722979307174683, 1.1430524587631226, 1.0940605401992798, -0.33823561668395996, -0.9594467282295227, -0.5162293910980225, -0.5997484922409058, -0.1477028727531433, -0.002717037219554186, -0.9332543015480042, -0.39795592427253723, -0.47719377279281616, 0.00675997044891119, -0.2835976183414459, -1.107649326324463, -0.5968685746192932, -0.46974635124206543, 1.2594314813613892, 0.5211606025695801, 0.01742640882730484, 0.5582972764968872, 0.9824334979057312, -0.3107556402683258, -0.5873059034347534, -0.32187142968177795, 0.5055993795394897, 0.4566079080104828, -0.09137796610593796, 0.6129013299942017, -0.21630634367465973, -0.5318903923034668, -0.7552381753921509, 0.22839681804180145, -0.5012357234954834, 1.2031043767929077, -0.7302435636520386, -0.4445190727710724, 0.2854127287864685, -1.1035447120666504, -0.5274354815

In [15]:
import pandas as pd
df = pd.DataFrame.from_dict(dict_data)

# Ajout des textes au vector store

In [16]:
index = pc.Index("poem-gen-rag")
index.upsert_from_dataframe(df, batch_size=50)

vector_store = PineconeVectorStore(index, embeddings)

sending upsert requests:   0%|          | 0/22306 [00:00<?, ?it/s]

# Test du RAG avec génération d'un poème

In [19]:
vector_store.similarity_search("Poem about summer")



[]

### Création de chaînes avec un modèle HuggingFace

In [39]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"

model_name = "google/flan-t5-small"
llm = HuggingFaceHub(repo_id=model_name,
                     model_kwargs={"temperature":0.01})

In [40]:
print(llm.invoke("Can you write a small poem about summer ?"))

i love summer i love summer i love summer i love summer i love


In [41]:
query = "Can you write a small poem about summer  ?"

prompt = f""" You are an assistant agent expert in poems, you can answer to any request

{query}

### Response:
"""

response = llm.predict(prompt)
print(response)

i love summer


In [42]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

prompt_template="""Use the following poems as insiration to answer the human question at the end.
If you don't know the answer, respond "I can't write a poem like this", don't try to make up an answer.

the context is : {context}
Question: {question}

Answer:"""

qa_prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

qa = RetrievalQA.from_chain_type(llm=llm,
                                 retriever=vector_store.as_retriever(),
                                 chain_type="stuff",
                                 chain_type_kwargs={"prompt": qa_prompt},
                                 return_source_documents=True)

query = "Can you write a small poem about summer  ?"
result = qa.invoke({"query":query})
print(result["result"])



I can't write a poem like this.


### Création de chaînes avec un modèle Gemini

In [43]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_google_genai import GoogleGenerativeAI

os.environ["GOOGLE_CSE_ID"] = "c118258383fab4969"
os.environ["GOOGLE_API_KEY"] = "AIzaSyDI4gpwnwFsta6WkVsnRrcJxzZzgHHSunE"

llm_genai = GoogleGenerativeAI(model="gemini-pro")

In [44]:
print(llm_genai.invoke("Can you write a poem about summer ?"))

**Ode to Summer's Embrace**

Oh, summer, season of golden grace,
When nature awakens, with vibrant pace.
The sunbeams dance upon the verdant scene,
A symphony of warmth, a radiant sheen.

The flowers bloom in hues of rainbow's might,
A painted canvas, a breathtaking sight.
Their petals unfurl, soft and delicate,
A fragrant perfume, an enchanting fate.

The trees reach skyward, their branches sway,
A gentle rustling, a soothing lay.
Their emerald leaves shimmer in the breeze,
A leafy haven, a sanctuary of ease.

Birdsong fills the air, a joyful choir,
Their melodies soaring, hearts set afire.
From dawn's awakening to twilight's fall,
Their voices echo through the woodland's thrall.

The rivers flow, a silver serpent's gleam,
Reflecting sunlight, like a waking dream.
Their waters ripple, a gentle caress,
Inviting us to cool our summer stress.

The fields are painted with a golden hue,
As crops ripen, a bountiful view.
Wheat stalks sway, heavy with nature's yield,
A promise of plenty, a f

In [50]:
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

chain = ConversationalRetrievalChain.from_llm(llm=llm_genai,
                                              retriever=vector_store.as_retriever(),
                                              memory=memory)

query = "Can you write a haiku about summer ?"
response = chain.invoke({"question": query})

