# Artefact II - RAG using Two Collections

## 1. Initial Setup

In [1]:
# Python 
import os
import pandas as pd

# ChromaDB
import chromadb
from chromadb.config import Settings

# LangChain 
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.chat_models import ChatOllama

# Set OpenAi key as an environment variable
os.environ["sk-proj-Sm1whIiewmhrMZpq7rlBT3BlbkFJQ5PTl7vQLyybJYSzeUFC"] = ""

# Directory path
#PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

# Define the open-source embedding function ()
embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")


### 1.1 Importing Comments

In [2]:
# Import users file
#path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "comments_cleaned.csv"
file_path = os.path.join(path, file_name)
df_comments = pd.read_csv(file_path)

# Create a sample of comments to test
df_comments_to_test = df_comments.sample(250)
df_comments_to_test.to_csv(os.path.join(path,"comments_sample.csv"), index=False)

# Print a sample
df_comments.sample(3)

Unnamed: 0,stream_name,created_date,post_id,comment_id,comment_text,like_count,report_count,username,author_user_id,author_position,author_status
48902,Jobs,2023-05-14 14:36:36,6579917,5725077,Just let me know,0,0,Brad_Mentzer,033e3592-37f8-4462-9eb0-f3cb38129b2d,Advanced Crew Leader,active
40035,Jobs,2023-07-06 11:01:08,6787738,5934909,Lol,0,0,Christine_Dioso,070b93d5-955d-4ed7-b6ac-98f1445e8737,Crew Leader,active
13687,"OSC, We Can Help",2023-12-19 01:00:48,7353091,6566749,I wasn't sure since I'm still a crew member ma...,0,0,Joseph_Gill,d9089068-40c1-42b3-9661-084b73efdba8,Crew Member,active


### 1.2 Importing Posts

In [3]:
# Import users file
#path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "posts_cleaned.csv"
file_path = os.path.join(path, file_name)
df_posts = pd.read_csv(file_path)
# Delete column 'labels'
df_posts = df_posts.drop(columns=['labels'])

# Create a sample of 50,000 comments to test
df_post_to_test = df_posts.sample(200)
df_post_to_test.to_csv(os.path.join(path,"posts_sample.csv"), index=False)

# Print a sample
df_posts.sample(3)

Unnamed: 0,stream_name,stream_id,post_id,action,created,title,text,like_count,comment_countt,mentions,username,author_user_id,author_position,author_status
17940,Jobs,9254,6391160,POSTED,2023-03-25 10:44:09,Available,Mot,1,1,,Mark_Matthews,72b8fdfd-5278-4fdd-bb81-581ed46f65eb,Advanced Crew Leader,active
7795,"OSC, We Can Help",22708,7110012,POSTED,2023-10-09 13:16:56,S/O GOING OUT,SHOUT OUT TO @Linwood_DavisJr @maurice_jackson...,12,0,"Linwood_DavisJr,maurice_jackson2,Jordan_Feltcorn",Hubert_Radcliff,2c1e808e-da89-4984-8d06-2def06daa397,Advanced Crew Leader,active
24603,Jobs,9254,5991182,POSTED,2022-11-18 13:58:49,Gm,I'll be Available After I Go with my Wife to t...,1,0,,Linwood_DavisJr,99d73931-1583-485d-afa7-ee1e20c05868,Advanced Crew Leader,active


## 2. Prepare Data for Embeddings

### 2.1 Comments

In [4]:
# Import comment's sample file
file_name = "comments_sample.csv"
file_path = os.path.join(path, file_name)
df_comments_to_test = pd.read_csv(file_path)

# Function to prepare the commments data to be loaded to the database
def prepare_comments_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[4])
        metadatas.append({
            'stream_name': row.iloc[0],
            'created_date': row.iloc[1],
            'post_id': row.iloc[2],
            'comment_id': row.iloc[3],
            'like_count': row.iloc[5],
            'report_count': row.iloc[6],
            'username': row.iloc[7],
            'author_user_id': row.iloc[8],
            'author_position': row.iloc[9],
            'author_status': row.iloc[10]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_comments, metadatas_comments, ids_comments = prepare_comments_data(df_comments_to_test)

print(type(documents_comments))
documents_comments[:10]

<class 'list'>


['Nice going, @John_Bright. Stay safe out there in the winter weather!',
 'Let me give you the run down I’d hate to give out the trick but it sounds like you need help you have to set your alarm for 3:30 if you get a job then no job board if you don’t then set your alarm for 5:29 and refresh 10 times a second until jobs come up they will only be up for 5 seconds and if you don’t click one and accept it your done you mi as well just stay up all night partying',
 'Available..',
 'Me',
 'Congratulations 🎉',
 '@Charles_Yates',
 'Congratulations',
 'That they dropped us permanently.',
 "@Karen_Stroup honestly I watched this and started to think. This is the type of job we are on every single day. This is close to home. This could have honestly been anyone of us out there. I thank God her injuries were mild and she will recover. At the same time it's terrifying to know there are people out there that can hit a person and keep going..",
 'I’ll go how about me']

In [5]:
# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_comments = text_splitter.create_documents(documents_comments,
                                               metadatas=metadatas_comments
                                                )
docs_comments[:2]  

[Document(page_content='Nice going, @John_Bright. Stay safe out there in the winter weather!', metadata={'stream_name': 'Safety & Operations', 'created_date': '2024-01-17 14:41:47', 'post_id': 7442041, 'comment_id': 6673822, 'like_count': 0, 'report_count': 0, 'username': 'FlaggerForce', 'author_user_id': 'b701ab9f-563a-4425-a389-aff803a8da58', 'author_position': nan, 'author_status': 'active'}),
 Document(page_content='Let me give you the run down I’d hate to give out the trick but it sounds like you need help you have to set your alarm for 3:30 if you get a job then no job board if you don’t then set your alarm for 5:29 and refresh 10 times a second until jobs come up they will only be up for 5 seconds and if you don’t click one and accept it your done you mi as well just stay up all night partying', metadata={'stream_name': 'OSC, We Can Help', 'created_date': '2023-12-13 23:45:50', 'post_id': 7336415, 'comment_id': 6547238, 'like_count': 0, 'report_count': 0, 'username': 'Martin_Gie

### 2.2 Posts

In [6]:
# Import post's sample file
file_name = "posts_sample.csv"
file_path = os.path.join(path, file_name)
df_posts_to_test = pd.read_csv(file_path)
# print(df_posts_to_test.isnull().sum())

# Delete rows with NAN values in the column text
df_posts_to_test.dropna(inplace=True, subset=['text','username'])
# print()
# print(df_posts_to_test.isnull().sum())

In [7]:
# Function to prepare the posts data to be loaded to the database
def prepare_posts_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[6])
        metadatas.append({
            'stream_name': row.iloc[0],
            'stream_id': row.iloc[1],
            'post_id': row.iloc[2],
            'action': row.iloc[3],            
            'created': row.iloc[4],
            'title': row.iloc[5],            
            'like_count': row.iloc[7],
            'comment_countt': row.iloc[8],
            'mentions': row.iloc[9],
            #'labels': row.iloc[10],
            'username': row.iloc[10],
            'author_user_id': row.iloc[11],
            'author_position': row.iloc[12],
            'author_status': row.iloc[13]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_posts, metadatas_posts, ids_posts = prepare_posts_data(df_posts_to_test)

print(type(documents_posts))
documents_posts[:10]

<class 'list'>


['I am having a lot of trouble getting work. What do I do?Everytime I check the job bored it says nothing on there. I need work. I check the job bored all the time and I still get nothing. Even when I check it between 3:30 and 5:30 (I stay on the app) Help please',
 'VERA IS WORKING WITH 2 WONDERFUL COOWORKERS, WONDERFUL MENS. PLEASE BE CAREFUL OUT THERE IN THIS WEATHER STAY SAFE GOD BLESS YOU ALL.',
 'Started my first assignment today and it went great....so much so that i was literally just about to ask for more work on ITZ...when my cell chimed...and two more assignments to close the week out. And to think...i was worried about nothing.🤑🤑🤑🤑🤑🥰🥰🥰🥰😋',
 'Available @weekend_dispatch @weekend_dispatch @weekend_dispatch',
 'Is there a way you can get took off a order on a weekend without getting points ??',
 'I am still not getting anything for the week I am available too work day or night and if I need equipment just let me know where I need too get it \nAcl/seo',
 'Client didn’t show up 

In [8]:
""" # find the elements of the list documents_posts which are not of string type
for element in documents_posts:
    if not isinstance(element, str):
        print(documents_posts.index(element))
        
# Deleting the elements of the list with NaN values
documents_posts = [element for element in documents_posts if isinstance(element, str)] """

' # find the elements of the list documents_posts which are not of string type\nfor element in documents_posts:\n    if not isinstance(element, str):\n        print(documents_posts.index(element))\n        \n# Deleting the elements of the list with NaN values\ndocuments_posts = [element for element in documents_posts if isinstance(element, str)] '

In [9]:
# Text splitter

# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_posts = text_splitter.create_documents(documents_posts, 
                                            metadatas=metadatas_posts
                                            )
docs_posts[:2]  

[Document(page_content='I am having a lot of trouble getting work. What do I do?Everytime I check the job bored it says nothing on there. I need work. I check the job bored all the time and I still get nothing. Even when I check it between 3:30 and 5:30 (I stay on the app) Help please', metadata={'stream_name': 'Jobs', 'stream_id': 9254, 'post_id': 7251335, 'action': 'POSTED', 'created': '2023-11-17 16:12:44', 'title': nan, 'like_count': 1, 'comment_countt': 3, 'mentions': nan, 'username': 'Jeremie_Prater', 'author_user_id': '242e9a2c-fa00-46c8-9dad-a9889b3624aa', 'author_position': 'Crew Leader', 'author_status': 'active'}),
 Document(page_content='VERA IS WORKING WITH 2 WONDERFUL COOWORKERS, WONDERFUL MENS. PLEASE BE CAREFUL OUT THERE IN THIS WEATHER STAY SAFE GOD BLESS YOU ALL.', metadata={'stream_name': 'Water Break', 'stream_id': 6787, 'post_id': 7348758, 'action': 'POSTED', 'created': '2023-12-17 22:14:21', 'title': 'Stay safe, bad weather', 'like_count': 3, 'comment_countt': 0, 

## 4. Create and add data to database

In [22]:
""" import chromadb
from chromadb.config import Settings

client_settings = chromadb.config.Settings(        
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
    allow_reset=True
)

# save comments to disk
db_comments = Chroma.from_documents(docs_comments,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_comments.similarity_search(query)
print(docs[0].page_content) """

KeyboardInterrupt: 

In [None]:
""" # save posts to disk
db_posts = Chroma.from_documents(docs_posts,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_posts.similarity_search(query)
print(docs[0].page_content) """

Welcome to the best team u can have at flagger force


In [10]:
# Create db and comment's collection
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"

# Initialize the database
vectordb_2datasets = chromadb.PersistentClient(path=PERSIST_DIRECTORY, 
                                               settings=Settings(allow_reset=True,
                                                                 ))
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create a collection for the comments
collection_comments = vectordb_2datasets.get_or_create_collection(name="collection_comments",
                                                                  #embedding_function=embedding_func
                                                                  )
collection_comments.add(documents=documents_comments,
                        metadatas=metadatas_comments,
                        ids=ids_comments
                        )

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embedding ID: 30
Add of existing emb

In [13]:
# Create a collection for the posts
collection_posts = vectordb_2datasets.get_or_create_collection(name="collection_posts",
                                                               )
collection_posts.add(documents=documents_posts,
                     metadatas=metadatas_posts,
                     ids=ids_posts)

In [12]:
# Delete the client
#vectordb_2datasets.reset()

# Delete the collection
#vectordb_2datasets.delete_collection(name="collection_comments")
#vectordb_2datasets.delete_collection(name="collection_posts")

In [14]:
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Passing a Chroma Client into Langchain
langchain_chroma_comments = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_comments",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_comments._collection.count(), "in the collection of comments.")


# Passing a Chroma Client into Langchain
langchain_chroma_posts = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_posts",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_posts._collection.count(), "in the collection of comments.")


There are 250 in the collection of comments.
There are 198 in the collection of comments.


In [16]:
# Metadata description
metadata_field_info_comments = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Comments on social network posts"
llm = ChatOllama(model="mistral")

retriever_comments = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_comments,
    document_content_description,    
    metadata_field_info_comments,
)

In [17]:
# This example only specifies a query
results_a = retriever_comments.invoke("What are some comments about birthday congratulations")
results_a

In [None]:
# Posts Metadata description
metadata_field_info_post = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="stream_id",
        description="The id of the stream where the post was posted",
        type="int64",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post",
        type="int64",
    ),
    AttributeInfo(
        name="action",
        description="The id of the post where the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="created",
        description="The datetime when the post was create",
        type="object",
    ),
    AttributeInfo(
        name="title",
        description="The title of the post",
        type="object",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the post",
        type="int64",        
    ),
    AttributeInfo(
        name="comment_countt",
        description="The number of comments received by the post",
        type="int64",        
    ),   
    AttributeInfo(
        name="mentions",
        description="The usernames that were mention in the post",
        type="object",
    ),        
    # AttributeInfo(
    #     name="labels",
    #     description="The keywords found in the post",
    #     type="int64",
    # ),
    AttributeInfo(
        name="username",
        description="The username of the author of the post",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the post",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the post. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the post. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Posts in social network"
llm = ChatOllama(model="mistral")

retriever_posts = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_posts,
    document_content_description,    
    metadata_field_info_post,
)

In [34]:
# This example only specifies a query
result_b = retriever_posts.invoke("What are two posts about birthday congratulations")
result_b

OutputParserException: Parsing text
 ```json
{
    "query": "congratulations",
    "filter": "and(or(eq(\"labels\".\"0\", \"congratulations\"), eq(\"labels\".\"1\", \"congratulations\")))"
}
```
 raised following error:
No terminal matches '.' in the current parser context, at line 1 col 19

and(or(eq("labels"."0", "congratulations"), eq("labels"."1
                  ^
Expected one of: 
	* COMMA
	* RPAR
	* RSQB

Previous tokens: Token('ESCAPED_STRING', '"labels"')


In [99]:
# Passing a Chroma Client into Langchain
langchain_chroma = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_comments",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma._collection.count(), "in the collection of comments.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")

There are 200 in the collection of comments.


In [100]:
# Passing a Chroma Client into Langchain
langchain_chroma_post = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_posts",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma_post._collection.count(), "in the collection.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")
langchain_chroma_post._collection

There are 50 in the collection.


Collection(name=collection_posts)

In [101]:
# Test the langchain_chroma_post
query = "What it got said about Jonathan_Muha"
docs = langchain_chroma_post.similarity_search(query)
print(docs[0].page_content)

ValueError: You must provide embeddings or a function to compute them

In [35]:
# Create a database

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# save to disk
db = Chroma.from_documents(docs,
                           embedding_function,
                           #ids=ids,
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "What it got said about Jonathan_Muha"
docs = db.similarity_search(query)
print(docs[0].page_content)

Have a safe day, @Jonathan_Muha!


## 3. Creating our self-querying retriever

Now we can instantiate our retriever. To do this we’ll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents.

In [18]:
# Metadata description
metadata_field_info = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The position of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Comments on social network posts"
llm = ChatOllama(model="mistral")

retriever = SelfQueryRetriever.from_llm(    
    llm,
    db,
    document_content_description,    
    metadata_field_info,
)

### 3.1 Testing out the retriever

In [22]:
# This example only specifies a query
retriever.invoke("What are two comments about congratulations")

[Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'e62d292b-7a01-4aa9-ab5a-e6fbfaa6e326', 'comment_id': 3927754, 'created_date': '2022-01-26 00:17:59', 'like_count': 0, 'post_id': 4989194, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'Donnie_Ziegler'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Advanced Crew Leader', 'author_status': 'active', 'author_user_id': '5a3ca39e-5c41-4501-8eab-6a9ed0a62044', 'comment_id': 4686674, 'created_date': '2022-08-14 02:01:54', 'like_count': 0, 'post_id': 5643777, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'William_MobleyJr'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': '74f383df-5aa7-4efb-af99-5f96405dadc8', 'comment_id': 5163635, 'created_date': '2022-12-17 00:01:52', 'like_count': 0, 'post_id': 6082

In [23]:
df_comments_to_test['username'].value_counts()

username
FlaggerForce        8
weekend_dispatch    8
Karen_Stroup        8
Linwood_DavisJr     6
Jessica_Beers       5
                   ..
Deon_McDaniel       1
Adrienne_Long       1
Stephen_Michael     1
Towanda_Gordon      1
Kyndra_Edwards      1
Name: count, Length: 201, dtype: int64

In [28]:
# This example only specifies a filter
retriever.invoke("What is a highly liked (above 4) comment")

[Document(page_content='What did the buffalo say to his son when he dropped him off at school?\nBison!!!! 😂🤣😭 @Jeffrey_Williams @Seth_Lee 😂🤣😭', metadata={'author_position': 'Field Trainer 1', 'author_status': 'suspended', 'author_user_id': 'f1555029-1597-4d3e-9199-4ebc04ed9702', 'comment_id': 4695288, 'created_date': '2022-08-16 19:19:26', 'like_count': 6, 'post_id': 5653172, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Collette_Monaghan'}),
 Document(page_content='As @Julie_Snedeker mentioned the focus here is to stop tampering with Flagger Forced issued equipment. The cameras believe it or not are in the vehicles for YOUR SAFETY. If you pick up the cell phone it says to put it down... if your following to close it says to back off... it is easy to sit and nitpick a photo but it is difficult to follow company policy.', metadata={'author_position': 'Safety Manager', 'author_status': 'active', 'author_user_id': '2aac35e8-f970-4434-9b46-56274384cf89', 'comment_id': 66010

In [27]:
# group by to find the comments with highest number of likes
df_comments_to_test.groupby('comment_id')['like_count'].sum().sort_values(ascending=False)

comment_id
6601009    6
4695288    6
5959837    4
3993841    4
6468179    4
          ..
5031138    0
5049534    0
5067432    0
5068273    0
6865894    0
Name: like_count, Length: 300, dtype: int64

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
query = "Thank you Robert, I appreciate your knowledge"
docs = db.similarity_search(query)
print(docs[0].page_content)


In [None]:
import chromadb
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings


PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
# embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embeddings = OllamaEmbeddings(model="mistral") 

persistent_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
collection_comments = persistent_client.get_or_create_collection("collection_comments",
                                                        metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                        embedding_function=embeddings
                                                        )
 # Add data to collection
collection_comments.add(documents=documents,
                        metadatas=metadatas,
                        ids=ids
                        )

langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_comments",
    embedding_function=embeddings
)

print("There are", langchain_chroma._collection.count(), "in the collection")

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
import chromadb
#from langchain_chroma import Chroma
#from langchain_community.embeddings import OllamaEmbeddings
#import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

      
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")    
    

# Function to create and load data to the database
"""
The embedding function takes text as input, and performs tokenization and embedding. If no embedding function is supplied, Chroma will use sentence transformer as a default.
https://docs.trychroma.com/embeddings#sentence-transformers
By default, Chroma uses all-MiniLM-L6-v2
"""


def upload_to_chromadb(documents, metadatas, ids):
    try:            
        # create the open-source embedding function
        # embedding = OllamaEmbeddings(model="mistral")
        # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")   # Default
        embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")          # Best model from included in ChromaDB
        
        # Initialize ChromaDB client
        # load it into Chroma
        db = Chroma.from_documents(documents, embeddings,
                                    persist_directory=PERSIST_DIRECTORY)
               
        # Create a comment's collection
        collection_comments = chromadb_client.create_collection(name="comments_collection",
                                                              metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                              embedding_function=embeddings
                                                              )        
        
        # Add data to collection
        collection_comments.add(documents=documents,
                                metadatas=metadatas,
                                ids=ids
                                )
        print("Data uploaded successfully.")
    except Exception as e:
        print(f"Failed to upload data: {e}")

upload_to_chromadb(documents, metadatas, ids)