In [1]:
!pip install -q langchain

In [2]:
!pip install -q langchain-google-genai google-generativeai pypdf

In [3]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv, find_dotenv
# from google.colab import userdata
import os
import getpass
# Resolve GOOGLE_API_KEY in three steps: keep an existing environment value,
# otherwise try a .env file, otherwise prompt interactively.
if "GOOGLE_API_KEY" not in os.environ:
  # os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
  try:
    load_dotenv(find_dotenv(), override=True)
  except Exception as e:
    print(e)

# load_dotenv() does not raise when no .env file (or no such variable) is
# found, so the prompt must be driven by the final state of the environment
# rather than by an exception handler.
if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("GOOGLE_API_KEY")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity check that a key is loaded: show only its first three characters.
# `or ""` keeps the cell from raising TypeError when the variable is unset.
print((os.environ.get("GOOGLE_API_KEY") or "")[:3])

AIz


In [5]:
from typing import List
def load_docs_locally(files:List[str]=[]):
    global BASE_DIR
    from pprint import pprint
    import os
    os.chdir(os.path.join(BASE_DIR,"files/"))
    print(f"current directory: {os.getcwd()}")
    files = [file for file in os.listdir()] if not files else files
    pprint(files)

    data = []

    for file in files:
        _, extension = os.path.splitext(file)
        if not file.startswith("."):
          match extension:
              case ".pdf":
                  from langchain.document_loaders import PyPDFLoader

                  loader = PyPDFLoader(file)
                  print(f"loading pdf {file} ....")
              case ".txt":
                  from langchain.document_loaders import TextLoader
                  loader = TextLoader(file, encoding="utf-8")
                  print(f"loading text {file} ....")
              case ".docx":
                  from langchain.document_loaders import Docx2textLoader
                  loader = Docx2textLoader(file)
                  print(f"loading docx {file} ....")
              case _:
                  print(f"no such available format such as {extension}")


        data += loader.load()
    os.chdir("../")
    pprint(data)
    return data

In [6]:
# Create the "files/" directory used by the download/scrape helpers;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("files/", exist_ok=True)

In [7]:
def download_file(url: str, filename: str, timeout: float = 30):
    """Download ``url`` and store it as ``files/<filename><extension>``.

    Args:
        url: Address of the resource to fetch.
        filename: Target file name without extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The relative path of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as if it were the document.
    """
    import os
    from urllib.parse import urlparse

    import requests

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Take the extension from the URL path only; splitting the raw URL would
    # drag query strings / fragments (or a bare domain suffix) into the name.
    _, extension = os.path.splitext(urlparse(url).path)

    os.makedirs("files", exist_ok=True)
    destination = f"files/{filename}{extension}"
    with open(destination, "wb") as f:
        f.write(response.content)

    print(f"done downloading {filename}{extension}")
    return destination

In [8]:
def load_docs(docs_urls=["https://pypi.org/"]):
    """Fetch the given URLs with LangChain's AsyncHtmlLoader.

    Args:
        docs_urls: URLs to fetch; passed unchanged to the loader.

    Returns:
        Whatever ``AsyncHtmlLoader(docs_urls).load()`` returns.
    """
    from langchain.document_loaders.async_html import AsyncHtmlLoader

    print("loading started....")
    return AsyncHtmlLoader(docs_urls).load()

In [9]:
def clean_html(html_page: str, title: str):
    """Write the text content of an HTML page to ``files/<title>.txt``.

    Every text node found by BeautifulSoup is stripped and written on its
    own line; nodes that are empty after stripping are skipped so the output
    contains no blank lines.

    Args:
        html_page: Raw HTML markup.
        title: Output file name without the ``.txt`` extension.
    """
    import os

    from bs4 import BeautifulSoup

    parser = BeautifulSoup(html_page, "html.parser")
    os.makedirs("files", exist_ok=True)
    with open(f"files/{title}.txt", "w", encoding="utf-8") as f:
        for string in parser.strings:
            text = string.strip()
            # Comparing against "\n" alone let nodes such as "\n   " through,
            # which produced empty lines in the output file.
            if text:
                f.write(text)
                f.write("\n")

In [10]:
from typing import List


def mass_download(urls: List[str]):
    """Fetch each URL, save its text under ``files/`` and return the titles.

    The file title is derived from the URL by applying the substitutions
    below in order and then trimming leading/trailing underscores.

    Args:
        urls: URLs to fetch; must be indexable because titles are looked up
            by the position of each fetched page.

    Returns:
        The list of generated file titles (without extension), in page order.
    """
    # Order matters: separators are turned into "_" first, then the scheme
    # and a few domain fragments are removed wherever they occur.
    substitutions = (
        ("/", "_"),
        (".", "_"),
        ("-", "_"),
        ("https:", ""),
        ("dz", ""),
        ("net", ""),
        ("com", ""),
        ("org", ""),
        ("edu", ""),
    )

    saved_titles = []
    for position, page in enumerate(load_docs(urls)):
        slug = urls[position]
        for old, new in substitutions:
            slug = slug.replace(old, new)
        slug = slug.strip("_")

        clean_html(page.page_content, slug)
        saved_titles.append(slug)
    return saved_titles

In [11]:
# Pages to scrape into files/ as plain text; the cell displays the list of
# generated file titles.
urls = ["https://fsciences.univ-setif.dz/main_page/english"]
mass_download(urls)

loading started....


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.32it/s]


['fsciences_univ_setif__main_page_english']

In [12]:
# NOTE(review): changing to the directory we are already in has no effect;
# this cell looks like a leftover and can probably be removed — confirm.
os.chdir(os.getcwd())

In [13]:
# Display the kernel's current working directory.
os.getcwd()

'c:\\Users\\Yahia\\Desktop\\langchain_bot'

In [15]:
from pprint import pprint

# load_docs_locally() reads the BASE_DIR global, so it has to be set before
# the call; here it is the directory the kernel is currently running in.
BASE_DIR = os.getcwd()

docs = load_docs_locally()
pprint(docs)

current directory: c:\Users\Yahia\Desktop\langchain_bot\files
['Banque_FR.pdf', 'fsciences_univ_setif__main_page_english.txt']
loading pdf Banque_FR.pdf ....
loading text fsciences_univ_setif__main_page_english.txt ....
[Document(page_content="Q 1 : Existe -t-il une spécialisation en génie logiciel dans notre collège  ? \nR 1 : Oui, c'était disponible mais plus maintenant faute d'étudiants  \n \nQ 2 : Y a -t-il des possibilités de formation ou de mise en pratique pendant la période \nd'études ?  \nR 2 : Oui, il y en a. Les universités proposent souvent des programmes de formation avec \ndes entreprises ou des institutions locales pour permettre aux étudiants d'acquérir une \nexpérience pratique dans différents domaines de l'informatique.  \n \nQ3 : Quelles sont les mat ières principales du programme  ? \nR4 : les matières de base comprennent, entre autres, la programmation, les bases de \ndonnées, les réseaux, la sécurité de l'information, l'intelligence artificielle, le \ndéveloppemen

In [16]:
def chunk_data(docs, chunk_size=250, chunk_overlap=0):
  """Join the documents' text and split it into chunks.

  Args:
    docs: Iterable of objects exposing a ``page_content`` string.
    chunk_size: Forwarded to RecursiveCharacterTextSplitter (default 250,
      the value previously hard-coded).
    chunk_overlap: Forwarded to RecursiveCharacterTextSplitter (default 0,
      the value previously hard-coded).

  Returns:
    The list of text chunks produced by the splitter's ``split_text``.
  """
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size, chunk_overlap=chunk_overlap
  )
  # All documents are merged into one newline-separated text before
  # splitting, so the result is plain strings rather than documents.
  text = "\n".join(doc.page_content for doc in docs)
  return text_splitter.split_text(text)

In [17]:
# Split the loaded documents and report how many chunks were produced.
chunks = chunk_data(docs)
chunk_count = len(chunks)
print(f"{chunk_count} chunk")

136 chunk


In [18]:
def insert_or_create_index(index_name, chunks):
    """Return a LangChain Pinecone vector store for ``index_name``.

    If the index is not listed by the Pinecone client it is created and
    filled with ``chunks``; if it already exists it is only opened and
    ``chunks`` is not used.

    Args:
        index_name: Name of the Pinecone index.
        chunks: Texts to embed and insert when the index has to be created.

    Returns:
        The vector store built by ``Pinecone.from_texts`` or
        ``Pinecone.from_existing_index``.
    """
    import pinecone
    from pinecone import PodSpec
    from langchain_community.vectorstores.pinecone import Pinecone
    from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

    client = pinecone.Pinecone()
    embedder = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    if index_name not in client.list_indexes().names():
        print(f"start creating from {index_name}!")
        # NOTE(review): dimension=768 presumably matches the embedding
        # model's vector size — confirm before switching models.
        client.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=PodSpec(environment="gcp-starter"),
        )
        store = Pinecone.from_texts(chunks, embedder, index_name=index_name)
        print(f"done creation of {index_name}!")
        return store

    print(f"start fetching from {index_name}!")
    store = Pinecone.from_existing_index(index_name, embedder)
    print(f"done fetching from {index_name}!")
    return store

In [19]:
def delete_index(index_name="all"):
    """Delete one Pinecone index, or every index when given ``"all"``.

    Args:
        index_name: Name of the index to delete. The default ``"all"``
            deletes every index listed by the Pinecone client.
    """
    from pinecone import Pinecone

    client = Pinecone()
    targets = client.list_indexes().names() if index_name == "all" else [index_name]
    for target in targets:
        client.delete_index(target)
    print(f"deleted {index_name}")

In [20]:
def ask_question(query, vector_store):
    """Answer ``query`` with a RetrievalQA chain over ``vector_store``.

    The five most similar entries are retrieved and passed, together with
    the question, to the prompt below. The response is pretty-printed and
    also returned.

    Args:
        query: The user's question.
        vector_store: Object exposing ``as_retriever``.

    Returns:
        The chain's response; source documents are requested via
        ``return_source_documents=True``.
    """
    from pprint import pprint

    from langchain.prompts import PromptTemplate
    from langchain_google_genai import ChatGoogleGenerativeAI
    from langchain.chains import RetrievalQA

    template = """
  use the following pieces of context to answer the question at the end, translate the answer to arabic. if you don't the answer just say that you don't know the answer, don't try to make up an answer, keep the answer as concise as possible
  {context}
  Question:{question}
  """
    QA_CHAIN_TEMPLATE = PromptTemplate.from_template(template)
    pinecone_chain = RetrievalQA.from_chain_type(
        llm=ChatGoogleGenerativeAI(model="gemini-pro", temperature=1),
        retriever=vector_store.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_TEMPLATE},
        verbose=True,
    )

    # Use invoke() like the conversational chain elsewhere in this notebook;
    # calling the chain object directly is the deprecated entry point.
    response = pinecone_chain.invoke({"query": query})
    pprint(response)
    return response

In [21]:
# Start from a clean slate: "all" removes every index the Pinecone client
# lists, then "test-index" is (re)created from the current chunks.
delete_index("all")
vector_store = insert_or_create_index("test-index", chunks)

pprint(vector_store)

deleted all
start creating from test-index!
done creation of test-index!
<langchain_community.vectorstores.pinecone.Pinecone object at 0x00000134E34EB110>


In [21]:
# Read one question from the user and answer it against the vector store.
user_query = input("give your query")
ask_question(user_query, vector_store)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
{'query': 'what did Zelenskyy said in his speech',
 'result': 'According to the context, Zelenskyy said in his speech to the '
           'European Parliament "Light will win over darkness".\n'
           'نور سينتصر على الظلام',
 'source_documents': [Document(page_content='From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'),
                      Document(page_content='And a proud Ukrainian people, who have known 30 years  of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards.'),
                      Document(page_content='In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight.'),
                      Document(page_content='And we remain clear-eyed. The Ukra

In [22]:
def searching_with_custom_prompt(query, vector_store, search_type="llm"):
    """Run one turn of a conversational retrieval chain with a custom prompt.

    A new chain is built on every call; the conversation history is kept in
    the file ``chat_history.json`` through ``FileChatMessageHistory``.

    Args:
        query: The user's question.
        vector_store: Object exposing ``as_retriever``.
        search_type: Accepted for callers but not used by the body; the
            retriever is always created with ``search_type="similarity"``.

    Returns:
        The value returned by ``chain.invoke`` for this question.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain_google_genai import GoogleGenerativeAI
    from langchain.memory import ConversationBufferMemory, FileChatMessageHistory
    from langchain.prompts import (
        ChatPromptTemplate,
        HumanMessagePromptTemplate,
        SystemMessagePromptTemplate,
    )

    # File-backed history, created first as in the original statement order.
    history = FileChatMessageHistory("chat_history.json")
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        chat_memory=history,
        input_key="question",
        output_key="answer",
    )

    system_message_prompt = """
  use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
  Context: ```{context}```
  """

    user_message_prompt = """
  Question: ```{question}```
  Chat History: ```{chat_history}```
  """

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(system_message_prompt),
            HumanMessagePromptTemplate.from_template(user_message_prompt),
        ]
    )

    llm = GoogleGenerativeAI(model="gemini-pro")
    retriever = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": 5}
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
        verbose=True,
    )
    return chain.invoke({"question": query})

In [23]:
import time

# Interactive chat: keep answering until the user types one of the exit
# commands (case-insensitive).
while (query := input("Enter your query: ")).lower() not in ["exit", "quit", "/q", "\\q"]:
    result = searching_with_custom_prompt(query, vector_store, search_type="llm")
    # Short pause before showing the answer, as in the original loop.
    time.sleep(2)
    print(result["answer"])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
  use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
  Context: ```GENERAL SECRETARY OF THE FACULTY
PHONE LINES
Read more
2015-2024 Faculté des Sciences, Tous droits réservés.
×

Q38 : Combien de matières sont enseignées en troisième année d'informatique ?  
R : 4 matières.  
 
Q39 : Comment calculer la moyenne ?  
R : (0,4 * travaux pratiques + 0,6 * note de l'examen) * coefficient de la matière.

Faculté des Sciences
Toggle navigation
UNIVERSITY SÉTIF 1
FERHAT ABBAS
FACULTY OF
SCIENCES

R : En accédant à votre compte sur l'application Moodle et en vous rendant à la page de 
choix pour sélectionner la spécialisation désirée parmi celles disponibles.

supplémentaire ou la recherche, tels que des ateliers, des projets de recherche, des