# Convert the preprocessed output to vector embeddings using Hugging Face embeddings, store them in Chroma DB, and use RetrievalQA to validate the results.

Install all the necessary libraries

In [1]:
%pip install transformers langchain chromadb tiktoken pypdf sentence-transformers

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain
  Downloading langchain-0.1.12-py3-none-any.whl.metadata (13 kB)
Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Collecting pypdf
  Downloading pypdf-4.1.0-py3-none-any.whl.metadata (7.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-

Mount your Google Drive (Colab only) and locate the preprocessed data file

In [3]:
import os

# On Google Colab, mount Drive first instead:
#   from google.colab import drive
#   drive.mount('/content/drive')
# A notebook has no __file__, so paths are resolved from the kernel's working
# directory; the repository root is taken to be two levels above it.
curr_dir = os.getcwd()
repo_root = os.path.dirname(os.path.dirname(curr_dir))
data_path = os.path.join(repo_root, 'data')
clean_text_file = os.path.join(
    data_path,
    'preprocessed_data',
    'preprocessed_data_combined.txt',
)
print(clean_text_file)

/Users/laks007/Documents/coding/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/data/preprocessed_data/preprocessed_data_combined.txt


Load the combined preprocessed data directly from DagShub

In [None]:
# !wget https://dagshub.com/Omdena/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/raw/87afb46588c819d63d7d6444dc950101cf6b42fe/data/preprocessed_data/preprocessed_data_combined.txt

--2024-03-14 09:22:30--  https://dagshub.com/Omdena/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/raw/87afb46588c819d63d7d6444dc950101cf6b42fe/data/preprocessed_data/preprocessed_data_combined.txt
Resolving dagshub.com (dagshub.com)... 35.186.200.224
Connecting to dagshub.com (dagshub.com)|35.186.200.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘preprocessed_data_combined.txt’

preprocessed_data_c     [            <=>     ]  21.23M  8.57MB/s    in 2.5s    

2024-03-14 09:22:34 (8.57 MB/s) - ‘preprocessed_data_combined.txt’ saved [22256780]



Import the necessary libraries

In [4]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from chromadb.utils import embedding_functions
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


Load our data using Text Loader and split it into chunks using Text Splitter

In [5]:
# Chunking settings: chunk_size 500 with an overlap of 50 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Load the preprocessed corpus located by clean_text_file
# (on Colab the downloaded copy is /content/preprocessed_data_combined.txt).
loader = TextLoader(clean_text_file)
docs = loader.load()

# Split the loaded document(s) into the chunks that will be embedded.
text = text_splitter.split_documents(docs)

In [9]:
# Show one sample chunk followed by the total number of chunks produced.
print(text[10], len(text), sep="\n")


page_content='so the sun attorney be try to question winona ryder mental health with nonsense after she testify in support of \nI owe patriot of america a sincere apology I use to be a liberal of the bad kind of canadian liberal smug and hat \nschweiden can you believe it all start a few year ago with haha what if we write sad fic on saturday night a \nananavarro so sad \nremember this moment when mewgulf hug each other because this the last workshop look at gulf face so sad he look like' metadata={'source': '/Users/laks007/Documents/coding/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/data/preprocessed_data/preprocessed_data_combined.txt'}
44874


Convert the chunks to embeddings and store them in ChromaDB

In [10]:
curr_dir = os.getcwd()
# The vector store lives under <repo root>/src/vector_db/chroma_db, with the
# repository root taken to be two levels above the working directory.
db_path = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), 'src', 'vector_db', 'chroma_db')

# Colab alternative: path = "/content/drive/MyDrive/data/chroma_db"

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Chroma.from_documents adds to whatever collection already exists in
# persist_directory, so re-running this cell would re-embed every chunk and
# store duplicates. Reuse an existing store; delete db_path to force a rebuild.
if os.path.isdir(db_path) and os.listdir(db_path):
    vectordb = Chroma(persist_directory=db_path, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(documents=text, persist_directory=db_path, embedding=embeddings)

An alternative way to use Hugging Face embeddings (not used in this Colab notebook)

In [11]:
# Read the token from the environment so it never has to be pasted into (and
# committed with) the notebook. The placeholder is kept only as a fallback for
# when HUGGINGFACEHUB_API_TOKEN is not set.
access_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "your huggingface accesstoken")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", token=access_token)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", token=access_token)
persist_directory = db_path

# Chroma's own Hugging Face embedding function, authenticated with the same token.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=access_token,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


In [12]:
# Persist the database to disk, then drop the in-memory handle so that the
# next cell reloads the store from the persisted directory.
vectordb.persist()
vectordb = None

In [15]:
# Reopen the collection persisted under db_path; from here on it is used
# exactly like the freshly built store.
vectordb = Chroma(
    persist_directory=db_path,
    embedding_function=embeddings,
)

Create a retriever

In [16]:
# Default retriever over the vector store (no search options supplied).
retriever = vectordb.as_retriever()

In [None]:
def rag_retriever_vecordb(db_path=None,
                          model_name='sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
                          search_kwargs=None):
    """Open the persisted Chroma store and return a retriever over it.

    Args:
        db_path: Directory of the persisted Chroma database. When None, it is
            resolved as <two levels above the working directory>/src/vector_db/chroma_db.
        model_name: Hugging Face embedding model used to embed queries. It
            should be the same model the store was built with.
        search_kwargs: Optional dict forwarded to ``as_retriever`` (for example
            ``{"k": 2}``). When None, the retriever's defaults are used.

    Returns:
        The retriever produced by ``Chroma.as_retriever``.
    """
    if db_path is None:
        # Default location depends on the working directory, so callers that
        # run from elsewhere should pass db_path explicitly.
        curr_dir = os.getcwd()
        db_path = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), 'src', 'vector_db', 'chroma_db')
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = Chroma(persist_directory=db_path,
                      embedding_function=embeddings)
    if search_kwargs is None:
        return vectordb.as_retriever()
    return vectordb.as_retriever(search_kwargs=search_kwargs)

Try retrieving results for a sample query

In [17]:
# Sanity-check retrieval with a sample query.
# NOTE: this rebinds `docs`, which earlier held the loaded corpus documents.
docs = retriever.get_relevant_documents("I am feeling lonely today")

In [18]:
len(docs)  # the default retriever returns four documents

4

Make a retriever that returns only the top 2 most similar chunks

In [19]:
# Limit the search to the two closest chunks; k is passed through search_kwargs.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [20]:
# Display which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [21]:
# Re-run the sample query with the k=2 retriever and display what comes back.
docs = retriever.get_relevant_documents("I am feeling lonely today")
docs

[Document(page_content='so sad and yet so true   I be alone in my struggle', metadata={'source': '/Users/laks007/Documents/coding/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/data/preprocessed_data/preprocessed_data_combined.txt'}),
 Document(page_content='you be not alone this be a journey of life up and down you may not hear or even see the word with the response you will receive today and day after   I do not know you I be not sure of your struggle   I will not say I know how you feel because you be special I will say I have be where you be right now   I thank god everyday for life we survivor all be embrace who you be go for a walk and smell some flower                   the fact that you have write about your night mean you be way strong and', metadata={'source': '/Users/laks007/Documents/coding/HyderabadIndiaChapter_MentalHealthWellbeingFomoSocialMedia/data/preprocessed_data/preprocessed_data_combined.txt'})]

Use the RetrievalQA chain to get a relevant answer

In [22]:
from langchain.chains import RetrievalQA

Use the Mistral model

In [23]:
# Hugging Face Hub repository id of the model used as the LLM below.
hf_repo_id = 'mistralai/Mistral-7B-Instruct-v0.1'

In [24]:
from langchain.llms import HuggingFaceHub

# Wrap the Hub model selected in hf_repo_id as a LangChain LLM, authenticating
# with access_token and passing the generation settings via model_kwargs.
llm = HuggingFaceHub(
    repo_id=hf_repo_id,
    huggingfacehub_api_token=access_token,
    model_kwargs={"temperature": 0.2, "max_length": 32000},
)

  warn_deprecated(


Store the conversation in memory and build the QA chain

In [25]:
from langchain.memory import ConversationBufferMemory

# Conversation history is kept under the "chat_history" key as message objects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# The result count must be passed through search_kwargs. A bare `k=2` keyword is
# not a retriever setting and does not limit the results, so the chain would
# still be given the default number of chunks as context.
retrieval = vectordb.as_retriever(search_kwargs={"k": 2})

# "stuff" places all retrieved chunks into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retrieval, memory=memory)

In [26]:
def process_llm_response(llm_response):
    """Print only the final answer, i.e. the 'result' entry of a chain response."""
    answer = llm_response['result']
    print(answer)

In [27]:
# Full example: run the QA chain on a query and print only the final answer.
query = "I am feeling very sad"
# Calling the chain object directly (qa(query)) is deprecated in LangChain 0.1;
# invoke() accepts the same input and returns the same response dict.
llm_response = qa.invoke(query)
process_llm_response(llm_response)

  warn_deprecated(


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

sad 
I be actually sad in this pic  
I graduate   year ago   and even though I hate ceremony I be sad class of   will not even have the choice to attend one 
that pentagon video really hurt   so sad but they be so sweet 
nigga be flex with you all depression money 
gut feeling soft sculpture piece base on the way emotion manifest in physical sensation in the body specifically t 
I just b sad and lonely 
kuna mapenzi bila pesa

whenever I feel sad I do this 
its sad go to sleep every day when your heart be hurt 
petition to remove the   emoji my anxiety flare when I see it 
sweetsheil hug I be have lot of pain so I walk it off the stress be kill my fibro 
fake a smile be easy than explain why you be sad 
griffinpeneiope last one penelope nhave you ever be angry enough with clarke to the point you be hope josep 
one of the sim