# Chatbot

### Installations

In [None]:
!pip install langchain
!pip install pypdf
!pip install rapidocr-onnxruntime
!pip install langchain pypdf openai chromadb tiktoken docx2txt

!pip install sentence-transformers
!pip install accelerate
! pip install langchain
! pip install chromadb

In [1]:
from huggingface_hub import login
# Call login() without a token argument so it prompts for the token interactively;
# this keeps the credential out of the saved notebook.
login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
import os

In [3]:
# LangChain's Hugging Face integrations read the upper-case HUGGINGFACEHUB_API_TOKEN
# variable, and environment variable names are case-sensitive on Linux, so the mixed-case
# spelling alone is not picked up. Prompt for the token instead of pasting it into the
# notebook, and skip the prompt when the variable is already set.
from getpass import getpass

if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass('Hugging Face API token: ')

# Keep the original mixed-case name populated for anything that still reads it.
os.environ['HuggingFaceHub_API_Token'] = os.environ['HUGGINGFACEHUB_API_TOKEN']

In [4]:
# Allocator setting for PyTorch; assigned here, before torch is imported in the next cell.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:1024"})

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer,pipeline
import transformers
import torch
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings


In [6]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',model_kwargs={'device': 'cuda'})

In [7]:
# Hugging Face model id of the chat LLM that answers the questions.
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

# Build the generation pipeline through `transformers.pipeline` rather than the bare
# `pipeline` name. The result is stored in a variable that is also called `pipeline`
# (a later cell passes it to HuggingFacePipeline), so after the first run the bare name
# refers to the pipeline object, and re-running this cell would call that object
# instead of the factory function.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # NOTE(review): permits repo-supplied code to run; confirm it is required
    device_map="auto",
    max_length=1024,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Wrap the transformers pipeline so LangChain chains can drive it as an LLM.
# NOTE(review): the pipeline was built with do_sample=True; confirm that the temperature
# given through model_kwargs here actually reaches generation.
llm = HuggingFacePipeline(
    pipeline=pipeline,
    model_kwargs={'temperature': 0},
)

In [10]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the source PDF is read from it below.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
from langchain.document_loaders import PyPDFLoader

# Load the Arabic source PDF from Drive, then cut it into overlapping chunks that are
# later translated and indexed. `arb_doc` ends up holding the chunks.
loader = PyPDFLoader("/content/drive/MyDrive/Datasets/dataforchatbot_arb.pdf")
arb_pages = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
arb_doc = text_splitter.split_documents(arb_pages)


### Translation

In [18]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Checkpoint used for Arabic <-> English translation; the model and its tokenizer are
# both loaded from this id.
# NOTE: this rebinds `model` and `tokenizer`, names an earlier cell used for the LLaMA
# checkpoint and its tokenizer.
model = "facebook/mbart-large-50-many-to-many-mmt"

model_trans = MBartForConditionalGeneration.from_pretrained(model)
tokenizer = MBart50TokenizerFast.from_pretrained(model)

# Start with Arabic as the source language; the translation helpers set it per call.
tokenizer.src_lang = "ar_AR"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [19]:
import copy

# Translate every Arabic chunk to English, keeping the Arabic originals intact.
# list.copy() is shallow: both lists would share the same Document objects, so writing
# page_content through eng_doc would also overwrite the text held by arb_doc (and a
# re-run would then feed English text to the Arabic->English step). Deep-copy instead.
eng_doc = copy.deepcopy(arb_doc)

# The tokenizer's source language is shared state that translate_from_english() changes,
# so set it explicitly rather than relying on whatever ran last.
tokenizer.src_lang = "ar_AR"
english_bos_id = tokenizer.lang_code_to_id["en_XX"]

# NOTE(review): generate() runs with the checkpoint's default length settings; confirm
# that long chunks are not cut short, and pass an explicit length limit if they are.
for source_chunk, translated_chunk in zip(arb_doc, eng_doc):
    encoded_ar = tokenizer(source_chunk.page_content, return_tensors="pt")

    generated_tokens = model_trans.generate(
        **encoded_ar,
        forced_bos_token_id=english_bos_id,
    )

    output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    translated_chunk.page_content = ' '.join(output)



In [20]:
def _translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` (mBART-50 language codes).

    Uses the notebook-level ``tokenizer`` and ``model_trans`` objects and sets
    ``tokenizer.src_lang`` as a side effect.

    Args:
        text: Text to translate.
        src_lang: Language code of ``text``, e.g. ``"ar_AR"``.
        tgt_lang: Language code to translate into, e.g. ``"en_XX"``.

    Returns:
        The decoded sequences joined with single spaces, or ``''`` when ``text`` is
        empty or whitespace-only (the model is not invoked in that case).
    """
    if not text or not text.strip():
        return ''

    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = model_trans.generate(
        **encoded,
        # Force the first generated token to be the target-language code.
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
    )
    decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return ' '.join(decoded)


def translate_from_arbic(text: str) -> str:
    """Translate the user's Arabic prompt into English."""
    return _translate(text, src_lang="ar_AR", tgt_lang="en_XX")


def translate_from_english(text: str) -> str:
    """Translate the model's English output into Arabic for the user."""
    return _translate(text, src_lang="en_XX", tgt_lang="ar_AR")



In [21]:
# Preview only the first translated chunks instead of dumping the whole list.
eng_doc[:2]

[Document(page_content='The importance of sport is considered to be one of the most important ways of maintaining physical health, and the following are the main benefits of sport to the human body1, and to avoid obesity, and to achieve this, attention should also be paid to dietary supplements, by equalizing or comparing saturated calories with or without the calories that the individual consumes. Prevention of heart disease: Sport helps to maintain heart health, by regularly boosting the circulation, raising oxygen levels in the bloodstream; helping to reduce the risk of heart disease, such as cholesterol rise, coronary artery disease, and heart attacks, as well as contributing to lowering blood pressure and three-fat levels. Regulation of blood sugar: Sport regulates insulin levels and improves its performance within the body, helping to control blood sugar levels, so that regular exercise reduces the incidence of certain diseases; such as type 2 diabetes, metabolic', metadata={'sou

In [22]:
from langchain.vectorstores import Chroma
# Embed the translated chunks and store them in a Chroma collection persisted on disk.
# NOTE(review): re-running against an already populated directory may insert the same
# chunks again — confirm, and clear the directory first if duplicates show up.
chroma_dir = './data/arb2'
vectordb = Chroma.from_documents(
    eng_doc,
    embedding=embeddings,
    persist_directory=chroma_dir,
)
vectordb.persist()

### Model without prompt

In [23]:
# Retrieval QA chain using the chain's default prompt.
# k is the number of chunks fetched from the vector store for each query.
retriever_top2 = vectordb.as_retriever(search_kwargs={'k': 2})
qa_chain_llama2 = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever_top2,
    return_source_documents=True,
)


In [26]:

# Interactive Arabic chat loop: read a question, translate it to English, run the
# retrieval QA chain, and print the answer translated back to Arabic.
EXIT_COMMANDS = ("خروج", "اخرج")

chat_history = []
while True:
    query = input('السؤال: ').strip()

    # Check the exit words before translating so that leaving the chat does not
    # trigger a translation-model call.
    if query in EXIT_COMMANDS:
        print('Exiting')
        break

    # Nothing to answer on a blank line; prompt again.
    if not query:
        continue

    query_en = translate_from_arbic(query)

    # NOTE(review): chat_history is passed along and recorded, but it is unclear that this
    # chain uses it when answering — confirm; a follow-up question in the saved output
    # was answered without the earlier context.
    result = qa_chain_llama2({'query': query_en, 'chat_history': chat_history})
    arb_res = translate_from_english(result['result'])

    print('الاجابة: ' + arb_res + "\n\n")

    chat_history.append((query_en, result['result']))

السؤال: مرحبا
الاجابة: يمكن للتدريب أن يساعد الناس على أن يصبحوا أكثر ثقة، وبناء صداقات أقوى، وتطوير مهاراتهم الإدراكية، بما في ذلك الذاكرة. الإجابة غير مفيدة: يمكن للتدريب أن يجعل الناس أكثر ثقة، وبناء صداقات أقوى، وتطوير مهاراتهم الإدراكية، بما في ذلك الذاكرة. رجاءً أجيبوا على السؤال بناء على المعلومات المتوفرة في النص.


السؤال: ما موضوع  هذا المقال
الاجابة: النقطة في هذه المقالة تبدو أن تكون لشرح فوائد الرياضة للحفاظ على الصحة البدنية، منع السمنة، وخفض مخاطر أمراض معينة مثل أمراض القلب، السكري من نوع 2، واضطرابات الأيض. يسلط المقال الضوء على أهمية التمرين العادي ومضادات غذائية في تحقيق هذه الفوائد. الإجابة غير مفيدة: النقطة في هذه المقالة هي إقناع الناس بلبدء في التمرين أكثر، بحيث يمكنهم أن يكونوا أكثر صحة و يعيشوا أطول.


السؤال: اذكر اثنين من هذه الفوائد
الاجابة: بالتأكيد! رجاءً تقدموا الميزات الاثنتين التي ترغبون أن أساعدكم بها.


السؤال: في الصحة البدنية




الاجابة: يساعد الرياضة على الحفاظ على الصحة البدنية بطرق عديدة، بما في ذلك: * تنظيم مستوى السكر في الدم * تخفيض ضغط الدم ومستوى الكوليسترول * تحسين صحة القلب عن طريق رفع مستوى الدورة الدموية ومستوى الأوكسجين * تخفيض خطر أمراض القلب، والسكتة الدماغية، والسكري من نوع 2 * مساعدة في إدارة الوزن وتجنب السمنة.


السؤال: خروج
Exiting
