# Artefact II - RAG using Two Collections

## 1. Initial Setup

In [6]:
# Python 
import os
import pandas as pd

# ChromaDB
import chromadb
from chromadb.config import Settings

# LangChain 
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.chat_models import ChatOllama


# Directory path
#PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

# Define the open-source embedding function ()
embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")


  from .autonotebook import tqdm as notebook_tqdm


### 1.1 Importing Comments

In [9]:
# Import users file
path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
#path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"

file_name = "comments_cleaned.csv"
file_path = os.path.join(path, file_name)
df_comments = pd.read_csv(file_path)

# Create a sample of comments to test
df_comments_to_test = df_comments.sample(250, random_state=123)
df_comments_to_test.to_csv(os.path.join(path,"comments_sample.csv"), index=False)

# Print a sample
df_comments.sample(3)

Unnamed: 0,stream_name,created_date,post_id,comment_id,comment_text,like_count,report_count,username,author_user_id,author_position,author_status
11882,Jobs,2023-12-29 21:10:33,7386024,6606515,Available,0,0,Corbin_Kesner,8b12d0d2-9fb5-474a-a896-f1ce1c2197d9,Crew Member,active
61912,Safety & Operations,2023-03-06 21:28:14,6276963,5453365,Congratulations to the following:\n\n@Dominiqu...,4,0,FlaggerForce,b701ab9f-563a-4425-a389-aff803a8da58,,active
22170,Jobs,2023-10-20 17:03:45,7151430,6342159,Hey @Connard_Edlin I've put you on the availab...,1,0,client_services_transition,f206abd1-e46c-4d35-be75-a3b937edf885,,active


### 1.2 Importing Posts

In [10]:
# Import users file
path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
#path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "posts_cleaned.csv"
file_path = os.path.join(path, file_name)
df_posts = pd.read_csv(file_path)

# Delete column 'labels'
df_posts = df_posts.drop(columns=['labels'])
# Rename comment_count column
df_posts.rename({'comment_countt': 'comment_count'}, axis=1, inplace=True)
# Replace in column 'text' the value '\u200d♀️' by ''
df_posts['text'] = df_posts['text'].str.replace('\u200d♀️', '')

# Create a sample of 50,000 comments to test
# set a seed

df_post_to_test = df_posts.sample(200, random_state=123)
df_post_to_test.to_csv(os.path.join(path,"posts_sample.csv"), index=False)

# Print a sample
df_posts.sample(3)

Unnamed: 0,stream_name,stream_id,post_id,action,created,title,text,like_count,comment_count,mentions,username,author_user_id,author_position,author_status
37439,Test Stream,6943,5219691,POSTED,2022-04-11 12:51:06,"Basic Work Zone Training, Day 2, SPEED....",As part of an unscheduled visit to our King of...,0,0,"Queteya_SandersCole,Latif_Miller,Steven_Jones,...",luke_lazar,7cac4e9f-323b-449c-b78a-3e51945df4a5,VP of Risk & Safety,active
22391,"OSC, We Can Help",22708,6107804,POSTED,2022-12-24 19:38:22,I need help changing my direct deposit,I looked everywhere on the UKG app trying to f...,0,2,,Dylan_Dewitt,9e93b30b-157d-4762-a283-04ba11a3e712,Crew Member,active
37440,Water Break,6787,5219641,POSTED,2022-04-11 12:40:05,Fresh week fresh site,Its my first day and i already love it @somewh...,4,1,,Christopher_Schaffer,dbe4b413-342b-4103-970b-f6f398d01346,Crew Leader,suspended


## 2. Prepare Data for Embeddings

### 2.1 Comments

In [11]:
# Import comment's sample file
file_name = "comments_sample.csv"
file_path = os.path.join(path, file_name)
df_comments_to_test = pd.read_csv(file_path)

# Function to prepare the commments data to be loaded to the database
def prepare_comments_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[4])
        metadatas.append({
            'stream_name': row.iloc[0],
            'created_date': row.iloc[1],
            'post_id': row.iloc[2],
            'comment_id': row.iloc[3],
            'like_count': row.iloc[5],
            'report_count': row.iloc[6],
            'username': row.iloc[7],
            'author_user_id': row.iloc[8],
            'author_position': row.iloc[9],
            'author_status': row.iloc[10]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_comments, metadatas_comments, ids_comments = prepare_comments_data(df_comments_to_test)

print(type(documents_comments))
documents_comments[:10]

<class 'list'>


['Ok',
 '@Waylon_Curtis',
 'Wow.... you too',
 '**This order has been cancelled**',
 'We can not control what other companies do. It is not our place. \n    \n    I even commented that it was a good video to remind ourselves that WE can and will do better based upon our training.',
 "I'm available",
 "@Delano_Haines... It was great to see you today. I'm happy to hear that Seth and his crew is pleased with us. As for training  @Natisha_Nwankwo and @Jeffrey_Brodsky .. they make my job easy and fun. They are both very eager to learn everything. Btw, today I personally didn't set the cones. Jeff set the transition and tangent, and Natisha set the termination with a little bit of assistance. I'm very proud of both of them.  I'll keep on doing what I do best. \nI appreciate the kind words.",
 'What is the scope of this job',
 'Congratulations and welcome back!! \n🎉👷🏼\u200d♀️🛑🎉',
 '@Portia_Dougherty it’s for crew lead and up only']

In [12]:
# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_comments = text_splitter.create_documents(documents_comments,
                                               metadatas=metadatas_comments
                                                )
docs_comments[:2]  

[Document(page_content='Ok', metadata={'stream_name': 'Jobs', 'created_date': '2022-04-02 13:36:11', 'post_id': 5195364, 'comment_id': 4179344, 'like_count': 1, 'report_count': 0, 'username': 'Christopher_Young', 'author_user_id': '940f777e-ebc8-4bae-b1c8-aecb92a62438', 'author_position': 'Crew Leader', 'author_status': 'suspended'}),
 Document(page_content='@Waylon_Curtis', metadata={'stream_name': 'Jobs', 'created_date': '2022-11-13 21:47:13', 'post_id': 5974128, 'comment_id': 5033739, 'like_count': 0, 'report_count': 0, 'username': 'John_Ware', 'author_user_id': 'e93ab132-8a53-4a66-89a6-b256e82b7956', 'author_position': 'Advanced Crew Leader', 'author_status': 'suspended'})]

### 2.2 Posts

In [13]:
# Import post's sample file
file_name = "posts_sample.csv"
file_path = os.path.join(path, file_name)
df_posts_to_test = pd.read_csv(file_path)
# print(df_posts_to_test.isnull().sum())

# Delete rows with NAN values in the column text
df_posts_to_test.dropna(inplace=True, subset=['text','username'])
# print()
print(df_posts_to_test.isnull().sum())

stream_name          0
stream_id            0
post_id              0
action               0
created              0
title               31
text                 0
like_count           0
comment_count        0
mentions           166
username             0
author_user_id       0
author_position     39
author_status        0
dtype: int64


In [14]:
# Function to prepare the posts data to be loaded to the database
def prepare_posts_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[6])
        metadatas.append({
            'stream_name': row.iloc[0],
            'stream_id': row.iloc[1],
            'post_id': row.iloc[2],
            'action': row.iloc[3],            
            'created': row.iloc[4],
            'title': row.iloc[5],            
            'like_count': row.iloc[7],
            'comment_count': row.iloc[8],
            'mentions': row.iloc[9],
            #'labels': row.iloc[10],
            'username': row.iloc[10],
            'author_user_id': row.iloc[11],
            'author_position': row.iloc[12],
            'author_status': row.iloc[13]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_posts, metadatas_posts, ids_posts = prepare_posts_data(df_posts_to_test)

print(type(documents_posts))
documents_posts[:10]

<class 'list'>


['I’m up now I’m available @client_services_am @weekend_dispatch',
 'BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY',
 'Client: Verizon - MD\nDate: 1/19/2024\nTime: ASAP\nAddress:  Flagger Force\nOrder: 574952\nNotes:  Please comment if you are available, thank you!',
 'Client: NPL - Washington, PA\nDate: 12/13/22\nAddress: 69 Market St Brownsville Pa \nOrder# 536569\nStart Time:  ASAP\nNotes: Please comment below or call the OSC if you are available.',
 'It’s suppose to storm tonight so I’m available @client_services_transition @client_services_pm @client_services_transition @client_services_pm',
 'Order: 577546\nClient: BGE Front Street\nAddress: 1101 Russell Street, Baltimore\nDate: 2/16/24\nStart: 19:00\n\n\nOrder: 577549\nClient: BGE Piney Orchard\nAddress: 730 New Waugh Chapel Road, Ondeton\nDate: 2/16/24\nStart: 19:00',
 'Cl available in cpa but willing to travel',
 'Pick Up: 220 8th Avenue NW, Gle

In [15]:
""" # find the elements of the list documents_posts which are not of string type
for element in documents_posts:
    if not isinstance(element, str):
        print(documents_posts.index(element))
        
# Deleting the elements of the list with NaN values
documents_posts = [element for element in documents_posts if isinstance(element, str)] """

' # find the elements of the list documents_posts which are not of string type\nfor element in documents_posts:\n    if not isinstance(element, str):\n        print(documents_posts.index(element))\n        \n# Deleting the elements of the list with NaN values\ndocuments_posts = [element for element in documents_posts if isinstance(element, str)] '

In [16]:
# Text splitter

# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_posts = text_splitter.create_documents(documents_posts, 
                                            metadatas=metadatas_posts
                                            )
docs_posts[:2]  

[Document(page_content='I’m up now I’m available @client_services_am @weekend_dispatch', metadata={'stream_name': 'Jobs', 'stream_id': 9254, 'post_id': 6988496, 'action': 'POSTED', 'created': '2023-09-04 08:26:06', 'title': nan, 'like_count': 4, 'comment_count': 0, 'mentions': 'client_services_am,weekend_dispatch', 'username': 'Latasha_Clayton', 'author_user_id': '6e89af7c-c527-493a-93f7-a8c0205c331d', 'author_position': 'Advanced Crew Leader', 'author_status': 'suspended'}),
 Document(page_content='BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY', metadata={'stream_name': 'Water Break', 'stream_id': 6787, 'post_id': 5066545, 'action': 'POSTED', 'created': '2022-02-21 14:38:06', 'title': nan, 'like_count': 8, 'comment_count': 6, 'mentions': nan, 'username': 'Dontavius_Smith', 'author_user_id': '93f404c0-922b-49f9-b1cb-ecdbc0703e37', 'author_position': 'Area Supervisor', 'author_status': 'active'})]

## 4. Create and add data to database

### 4.1 First approach: Load data with LangChain

In [11]:
# First approach: Set the client with chromadb but load data with Chroma from LangChain

""" import chromadb
from chromadb.config import Settings

client_settings = chromadb.config.Settings(        
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
    allow_reset=True
)

# save comments to disk
db_comments = Chroma.from_documents(docs_comments,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_comments.similarity_search(query)
print(docs[0].page_content) """

Welcome to the best team u can have at flagger force


In [12]:
# This is here just as documentation

""" # save posts to disk
db_posts = Chroma.from_documents(docs_posts,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_posts.similarity_search(query)
print(docs[0].page_content) """

Anyways, Flagger force has opened up my eyes to a whole new future and I can’t be more excited for the journey I have ahead of me. Knowing that working in SC will be a few week project, I can’t help but feel honored to be selected as one of the few to go down and represent FF in hopes to obtain more work in the area. WE GOTTA SHINE AS BRIGHT AS OUR PPE. 😂😂. 

shout out to @Danny_Rice and his team granted I can be a pain and run my mouth occasionally, y’all’s hard work doesn’t go unnoticed. I really do appreciate  the hard work and effort y’all put in. I look forward to making y’all proud. 

well if you made it this far, stay safe Ff family! 

@Keionna_Lee @Ayeisha_Forbes @Matthew_Geis @Kyle_Schall


### 4.2 Second Approach: Load data with Chromadb

In [17]:
# Create db and comment's collection
#PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"
PERSIST_DIRECTORY = "/home/ubuntu/thesis_GenAI/data/vector_db"

# Initialize the database
vectordb_2datasets = chromadb.PersistentClient(path=PERSIST_DIRECTORY, 
                                               settings=Settings(allow_reset=True),
                                                                                                              
                                                                 )
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create a collection for the comments
collection_comments = vectordb_2datasets.get_or_create_collection(name="collection_comments",
                                                                  #embedding_function=embedding_func
                                                                  )
collection_comments.add(documents=documents_comments,
                        metadatas=metadatas_comments,
                        ids=ids_comments
                        )

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embedding ID: 30
Add of existing emb

In [18]:
# returns a list of the first 10 items in the collection
collection_comments.peek()['documents']

['Ok',
 '@Waylon_Curtis',
 'Wow.... you too',
 '**This order has been cancelled**',
 'We can not control what other companies do. It is not our place. \n    \n    I even commented that it was a good video to remind ourselves that WE can and will do better based upon our training.',
 "I'm available",
 "@Delano_Haines... It was great to see you today. I'm happy to hear that Seth and his crew is pleased with us. As for training  @Natisha_Nwankwo and @Jeffrey_Brodsky .. they make my job easy and fun. They are both very eager to learn everything. Btw, today I personally didn't set the cones. Jeff set the transition and tangent, and Natisha set the termination with a little bit of assistance. I'm very proud of both of them.  I'll keep on doing what I do best. \nI appreciate the kind words.",
 'What is the scope of this job',
 'Congratulations and welcome back!! \n🎉👷🏼\u200d♀️🛑🎉',
 '@Portia_Dougherty it’s for crew lead and up only']

In [19]:
# Create a collection for the posts
collection_posts = vectordb_2datasets.get_or_create_collection(name="collection_posts",
                                                               )
collection_posts.add(documents=documents_posts,
                     metadatas=metadatas_posts,
                     ids=ids_posts)

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embedding ID: 30
Add of existing emb

In [20]:
# returns a list of the first 10 items in the collection
collection_posts.peek()['documents']

['I’m up now I’m available @client_services_am @weekend_dispatch',
 'BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY',
 'Client: Verizon - MD\nDate: 1/19/2024\nTime: ASAP\nAddress:  Flagger Force\nOrder: 574952\nNotes:  Please comment if you are available, thank you!',
 'Client: NPL - Washington, PA\nDate: 12/13/22\nAddress: 69 Market St Brownsville Pa \nOrder# 536569\nStart Time:  ASAP\nNotes: Please comment below or call the OSC if you are available.',
 'It’s suppose to storm tonight so I’m available @client_services_transition @client_services_pm @client_services_transition @client_services_pm',
 'Order: 577546\nClient: BGE Front Street\nAddress: 1101 Russell Street, Baltimore\nDate: 2/16/24\nStart: 19:00\n\n\nOrder: 577549\nClient: BGE Piney Orchard\nAddress: 730 New Waugh Chapel Road, Ondeton\nDate: 2/16/24\nStart: 19:00',
 'Cl available in cpa but willing to travel',
 'Pick Up: 220 8th Avenue NW, Gle

In [14]:
# Delete the client
PERSIST_DIRECTORY = "/home/ubuntu/thesis_GenAI/data/vector_db"

# Initialize the database
# vectordb_2datasets = chromadb.PersistentClient(path=PERSIST_DIRECTORY, 
#                                                settings=Settings(allow_reset=True,
#                                                ))                                                                 
                                                                 
# vectordb_2datasets.reset()

# Delete the collection
# vectordb_2datasets.delete_collection(name="collection_comments")
# vectordb_2datasets.delete_collection(name="collection_posts")

### 4.3 Create LangChain Objects and Retrievers

In [21]:
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Passing a Chroma Client into Langchain
langchain_chroma_comments = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_comments",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_comments._collection.count(), "in the collection of comments.")


# Passing a Chroma Client into Langchain
langchain_chroma_posts = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_posts",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_posts._collection.count(), "in the collection of posts.")

There are 250 in the collection of comments.
There are 196 in the collection of posts.


In [22]:
# Metadata comments's description
metadata_field_info_comments = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]



In [25]:
# Creating our self-querying retriever for comments
document_content_description = "Comments on social network posts"
#llm = ChatOllama(model="mistral")

from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.chat_models.ollama import ChatOllama
llm = ChatOllama(model="mistral")

retriever_comments = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_comments,
    document_content_description,    
    metadata_field_info_comments,
)

results_a = retriever_comments.invoke("What are some comments about birthday congratulations")
results_a

ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/chat (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fa82a239b70>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [65]:
# Posts Metadata description
metadata_field_info_post = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="stream_id",
        description="The id of the stream where the post was posted",
        type="int64",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post",
        type="int64",
    ),
    AttributeInfo(
        name="action",
        description="The id of the post where the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="created",
        description="The datetime when the post was create",
        type="object",
    ),
    AttributeInfo(
        name="title",
        description="The title of the post",
        type="object",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the post",
        type="int64",        
    ),
    AttributeInfo(
        name="comment_count",
        description="The number of comments received by the post",
        type="int64",        
    ),   
    AttributeInfo(
        name="mentions",
        description="The usernames that were mention in the post",
        type="object",
    ),        
    # AttributeInfo(
    #     name="labels",
    #     description="The keywords found in the post",
    #     type="int64",
    # ),
    AttributeInfo(
        name="username",
        description="The username of the author of the post",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the post",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the post. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the post. One of ['active','suspended']",
        type="object",
    ),
]

In [66]:
# Creating our self-querying retriever
document_content_description = "Posts in social network"
llm = ChatOllama(model="mistral")

retriever_posts = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_posts,
    document_content_description,    
    metadata_field_info_post,
)

# This example only specifies a query
result_b = retriever_posts.invoke("What are some posts about birthday congratulations")
result_b

[Document(page_content='I need work on 7/4/23 and thank you', metadata={'action': 'POSTED', 'author_position': 'Crew Member', 'author_status': 'active', 'author_user_id': '2941f06f-db75-44cd-8422-668d8362125a', 'comment_count': 4, 'created': '2023-06-28 20:10:28', 'like_count': 1, 'post_id': 6762569, 'stream_id': 9254, 'stream_name': 'Jobs', 'title': 'Cm- Available', 'username': 'Cindy_Knight'}),
 Document(page_content='About 1 month ago, I had the chance to catch up with Crew Leader @Chaad_TerrySr to celebrate his 5 year anniversary.  Chaad is one of our key leaders supporting the PECO-Flagger Force partnership!  Thank you Chaad for 5 years of outstanding support to our clients, your focus on keeping our team members safe and providing a valuable service to our community!  Congratulations!', metadata={'action': 'POSTED', 'author_position': 'President & CEO', 'author_status': 'active', 'author_user_id': '314f3df5-80c3-432d-8446-666622b10fb6', 'comment_count': 19, 'created': '2022-03-28

In [67]:
# Define the ensemble retriever
from langchain.retrievers import EnsembleRetriever

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever_comments, retriever_posts], weights=[0.5, 0.5]
)

In [69]:
docs = ensemble_retriever.invoke("birthday congratulations")
docs

[Document(page_content='Happy Birthday', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': 'c40c2d53-7c19-441e-b925-f07a3eaa8c9a', 'comment_id': 5500647, 'created_date': '2023-03-18 15:07:17', 'like_count': 1, 'post_id': 6365853, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Russell_Spicer'}),
 Document(page_content='I need work on 7/4/23 and thank you', metadata={'action': 'POSTED', 'author_position': 'Crew Member', 'author_status': 'active', 'author_user_id': '2941f06f-db75-44cd-8422-668d8362125a', 'comment_count': 4, 'created': '2023-06-28 20:10:28', 'like_count': 1, 'post_id': 6762569, 'stream_id': 9254, 'stream_name': 'Jobs', 'title': 'Cm- Available', 'username': 'Cindy_Knight'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'c89cd870-1347-4a5d-acaa-d7abf1c7efa3', 'comment_id': 5671293, 'created_date': '2023-05-01 23:45:54', 'like_count

In [99]:
# Passing a Chroma Client into Langchain
langchain_chroma = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_comments",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma._collection.count(), "in the collection of comments.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")

There are 200 in the collection of comments.


In [100]:
# Passing a Chroma Client into Langchain
langchain_chroma_post = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_posts",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma_post._collection.count(), "in the collection.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")
langchain_chroma_post._collection

There are 50 in the collection.


Collection(name=collection_posts)

In [101]:
# Test the langchain_chroma_post
query = "What it got said about Jonathan_Muha"
docs = langchain_chroma_post.similarity_search(query)
print(docs[0].page_content)

ValueError: You must provide embeddings or a function to compute them

In [35]:
# Create a database

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# save to disk
db = Chroma.from_documents(docs,
                           embedding_function,
                           #ids=ids,
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "What it got said about Jonathan_Muha"
docs = db.similarity_search(query)
print(docs[0].page_content)

Have a safe day, @Jonathan_Muha!


## 3. Creating our self-querying retriever

Now we can instantiate our retriever. To do this we’ll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents.

In [18]:
# Metadata description
metadata_field_info = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The position of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Comments on social network posts"
llm = ChatOllama(model="mistral")

retriever = SelfQueryRetriever.from_llm(    
    llm,
    db,
    document_content_description,    
    metadata_field_info,
)

### 3.1 Testing out the retriever

In [22]:
# This example only specifies a query
retriever.invoke("What are two comments about congratulations")

[Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'e62d292b-7a01-4aa9-ab5a-e6fbfaa6e326', 'comment_id': 3927754, 'created_date': '2022-01-26 00:17:59', 'like_count': 0, 'post_id': 4989194, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'Donnie_Ziegler'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Advanced Crew Leader', 'author_status': 'active', 'author_user_id': '5a3ca39e-5c41-4501-8eab-6a9ed0a62044', 'comment_id': 4686674, 'created_date': '2022-08-14 02:01:54', 'like_count': 0, 'post_id': 5643777, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'William_MobleyJr'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': '74f383df-5aa7-4efb-af99-5f96405dadc8', 'comment_id': 5163635, 'created_date': '2022-12-17 00:01:52', 'like_count': 0, 'post_id': 6082

In [23]:
df_comments_to_test['username'].value_counts()

username
FlaggerForce        8
weekend_dispatch    8
Karen_Stroup        8
Linwood_DavisJr     6
Jessica_Beers       5
                   ..
Deon_McDaniel       1
Adrienne_Long       1
Stephen_Michael     1
Towanda_Gordon      1
Kyndra_Edwards      1
Name: count, Length: 201, dtype: int64

In [28]:
# This example only specifies a filter
retriever.invoke("What is a highly liked (above 4) comment")

[Document(page_content='What did the buffalo say to his son when he dropped him off at school?\nBison!!!! 😂🤣😭 @Jeffrey_Williams @Seth_Lee 😂🤣😭', metadata={'author_position': 'Field Trainer 1', 'author_status': 'suspended', 'author_user_id': 'f1555029-1597-4d3e-9199-4ebc04ed9702', 'comment_id': 4695288, 'created_date': '2022-08-16 19:19:26', 'like_count': 6, 'post_id': 5653172, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Collette_Monaghan'}),
 Document(page_content='As @Julie_Snedeker mentioned the focus here is to stop tampering with Flagger Forced issued equipment. The cameras believe it or not are in the vehicles for YOUR SAFETY. If you pick up the cell phone it says to put it down... if your following to close it says to back off... it is easy to sit and nitpick a photo but it is difficult to follow company policy.', metadata={'author_position': 'Safety Manager', 'author_status': 'active', 'author_user_id': '2aac35e8-f970-4434-9b46-56274384cf89', 'comment_id': 66010

In [27]:
# group by to find the comments with highest number of likes
df_comments_to_test.groupby('comment_id')['like_count'].sum().sort_values(ascending=False)

comment_id
6601009    6
4695288    6
5959837    4
3993841    4
6468179    4
          ..
5031138    0
5049534    0
5067432    0
5068273    0
6865894    0
Name: like_count, Length: 300, dtype: int64

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
query = "Thank you Robert, I appreciate your knowledge"
docs = db.similarity_search(query)
print(docs[0].page_content)


In [None]:
import chromadb
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings


PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
# embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embeddings = OllamaEmbeddings(model="mistral") 

persistent_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
collection_comments = persistent_client.get_or_create_collection("collection_comments",
                                                        metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                        embedding_function=embeddings
                                                        )
 # Add data to collection
collection_comments.add(documents=documents,
                        metadatas=metadatas,
                        ids=ids
                        )

langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_comments",
    embedding_function=embeddings
)

print("There are", langchain_chroma._collection.count(), "in the collection")

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
import chromadb
#from langchain_chroma import Chroma
#from langchain_community.embeddings import OllamaEmbeddings
#import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

      
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")    
    

# Function to create and load data to the database
"""
The embedding function takes text as input, and performs tokenization and embedding. If no embedding function is supplied, Chroma will use sentence transformer as a default.
https://docs.trychroma.com/embeddings#sentence-transformers
By default, Chroma uses all-MiniLM-L6-v2
"""


def upload_to_chromadb(documents, metadatas, ids):
    try:            
        # create the open-source embedding function
        # embedding = OllamaEmbeddings(model="mistral")
        # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")   # Default
        embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")          # Best model from included in ChromaDB
        
        # Initialize ChromaDB client
        # load it into Chroma
        db = Chroma.from_documents(documents, embeddings,
                                    persist_directory=PERSIST_DIRECTORY)
               
        # Create a comment's collection
        collection_comments = chromadb_client.create_collection(name="comments_collection",
                                                              metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                              embedding_function=embeddings
                                                              )        
        
        # Add data to collection
        collection_comments.add(documents=documents,
                                metadatas=metadatas,
                                ids=ids
                                )
        print("Data uploaded successfully.")
    except Exception as e:
        print(f"Failed to upload data: {e}")

upload_to_chromadb(documents, metadatas, ids)