# Artefact II - RAG using Two Collections

## 1. Initial Setup

In [1]:
# Python 
import os
import pandas as pd

# ChromaDB
import chromadb
from chromadb.config import Settings

# LangChain 
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.chat_models import ChatOllama


# Directory path
#PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

# Define the open-source embedding function ()
embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")


### 1.1 Importing Comments

In [42]:
# Import users file
#path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "comments_cleaned.csv"
file_path = os.path.join(path, file_name)
df_comments = pd.read_csv(file_path)

# Create a sample of comments to test
df_comments_to_test = df_comments.sample(250, random_state=123)
df_comments_to_test.to_csv(os.path.join(path,"comments_sample.csv"), index=False)

# Print a sample
df_comments.sample(3)

Unnamed: 0,stream_name,created_date,post_id,comment_id,comment_text,like_count,report_count,username,author_user_id,author_position,author_status
127166,Jobs,2022-03-13 20:02:56,5135062,4105563,@Maynor_Hernandez,3,0,Ebony_Scott,8a900aa7-4e78-41e8-ba57-e8c8c5a4936b,Crew Member,active
8931,Jobs,2024-01-14 21:42:26,7434810,6662222,Available,0,0,Malik_Samuel,61da1d59-dbc8-4ff2-97f1-63bf993c8345,Crew Leader,active
127682,Jobs,2022-03-11 03:05:02,5127805,4096669,Me,0,0,Hugh_Morton,8f7aef8b-0baa-4ae3-823f-86013b8c6327,Crew Member,active


### 1.2 Importing Posts

In [41]:
# Import users file
#path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "posts_cleaned.csv"
file_path = os.path.join(path, file_name)
df_posts = pd.read_csv(file_path)

# Delete column 'labels'
df_posts = df_posts.drop(columns=['labels'])
# Rename comment_count column
df_posts.rename({'comment_countt': 'comment_count'}, axis=1, inplace=True)
# Replace in column 'text' the value '\u200d♀️' by ''
df_posts['text'] = df_posts['text'].str.replace('\u200d♀️', '')

# Create a sample of 50,000 comments to test
# set a seed

df_post_to_test = df_posts.sample(200, random_state=123)
df_post_to_test.to_csv(os.path.join(path,"posts_sample.csv"), index=False)

# Print a sample
df_posts.sample(3)

Unnamed: 0,stream_name,stream_id,post_id,action,created,title,text,like_count,comment_count,mentions,username,author_user_id,author_position,author_status
2016,Water Break,6787,7462458,POSTED,2024-01-24 12:25:01,"🙌🥳 A KUDOS, for ME!? | Give A KUDOS, Get A KUD...",🌟Happy National Compliment Day!🙌\nYou're all S...,10,67,,FlaggerForce,b701ab9f-563a-4425-a389-aff803a8da58,,active
21002,Jobs,9254,6185156,POSTED,2023-01-22 20:50:51,Am I getting a crew member,Am I getting a crew member for next week mon t...,5,20,,Jason_Miller,3e0769cd-6f9b-4ec0-ad26-310e6efa1381,Advanced Crew Leader,active
19096,Jobs,9254,6308563,POSTED,2023-03-02 00:17:38,Need of 2 members,,0,1,,Dawn_Burdett,2a5bc351-88a7-48b0-b13f-d4e3a7194fcd,Crew Leader,active


## 2. Prepare Data for Embeddings

### 2.1 Comments

In [43]:
# Import comment's sample file
file_name = "comments_sample.csv"
file_path = os.path.join(path, file_name)
df_comments_to_test = pd.read_csv(file_path)

# Function to prepare the commments data to be loaded to the database
def prepare_comments_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[4])
        metadatas.append({
            'stream_name': row.iloc[0],
            'created_date': row.iloc[1],
            'post_id': row.iloc[2],
            'comment_id': row.iloc[3],
            'like_count': row.iloc[5],
            'report_count': row.iloc[6],
            'username': row.iloc[7],
            'author_user_id': row.iloc[8],
            'author_position': row.iloc[9],
            'author_status': row.iloc[10]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_comments, metadatas_comments, ids_comments = prepare_comments_data(df_comments_to_test)

print(type(documents_comments))
documents_comments[:10]

<class 'list'>


['Ok',
 '@Waylon_Curtis',
 'Wow.... you too',
 '**This order has been cancelled**',
 'We can not control what other companies do. It is not our place. \n    \n    I even commented that it was a good video to remind ourselves that WE can and will do better based upon our training.',
 "I'm available",
 "@Delano_Haines... It was great to see you today. I'm happy to hear that Seth and his crew is pleased with us. As for training  @Natisha_Nwankwo and @Jeffrey_Brodsky .. they make my job easy and fun. They are both very eager to learn everything. Btw, today I personally didn't set the cones. Jeff set the transition and tangent, and Natisha set the termination with a little bit of assistance. I'm very proud of both of them.  I'll keep on doing what I do best. \nI appreciate the kind words.",
 'What is the scope of this job',
 'Congratulations and welcome back!! \n🎉👷🏼\u200d♀️🛑🎉',
 '@Portia_Dougherty it’s for crew lead and up only']

In [44]:
# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_comments = text_splitter.create_documents(documents_comments,
                                               metadatas=metadatas_comments
                                                )
docs_comments[:2]  

[Document(page_content='Ok', metadata={'stream_name': 'Jobs', 'created_date': '2022-04-02 13:36:11', 'post_id': 5195364, 'comment_id': 4179344, 'like_count': 1, 'report_count': 0, 'username': 'Christopher_Young', 'author_user_id': '940f777e-ebc8-4bae-b1c8-aecb92a62438', 'author_position': 'Crew Leader', 'author_status': 'suspended'}),
 Document(page_content='@Waylon_Curtis', metadata={'stream_name': 'Jobs', 'created_date': '2022-11-13 21:47:13', 'post_id': 5974128, 'comment_id': 5033739, 'like_count': 0, 'report_count': 0, 'username': 'John_Ware', 'author_user_id': 'e93ab132-8a53-4a66-89a6-b256e82b7956', 'author_position': 'Advanced Crew Leader', 'author_status': 'suspended'})]

### 2.2 Posts

In [45]:
# Import post's sample file
file_name = "posts_sample.csv"
file_path = os.path.join(path, file_name)
df_posts_to_test = pd.read_csv(file_path)
# print(df_posts_to_test.isnull().sum())

# Delete rows with NAN values in the column text
df_posts_to_test.dropna(inplace=True, subset=['text','username'])
# print()
print(df_posts_to_test.isnull().sum())

stream_name          0
stream_id            0
post_id              0
action               0
created              0
title               31
text                 0
like_count           0
comment_count        0
mentions           166
username             0
author_user_id       0
author_position     39
author_status        0
dtype: int64


In [46]:
# print the rows where the column title is NAN
print(df_posts_to_test[df_posts_to_test['title'].isnull()])

          stream_name  stream_id  post_id  action              created title  \
0                Jobs       9254  6988496  POSTED  2023-09-04 08:26:06   NaN   
1         Water Break       6787  5066545  POSTED  2022-02-21 14:38:06   NaN   
12        Water Break       6787  6608939  POSTED  2023-05-19 21:50:56   NaN   
13               Jobs       9254  5584091  POSTED  2022-07-28 10:25:56   NaN   
29               Jobs       9254  5772855  POSTED  2022-09-17 19:59:06   NaN   
42               Jobs       9254  7301007  POSTED  2023-12-03 23:24:38   NaN   
44               Jobs       9254  6517720  POSTED  2023-04-29 16:35:26   NaN   
48        Water Break       6787  7374927  POSTED  2023-12-25 19:46:45   NaN   
52               Jobs       9254  7295653  POSTED  2023-12-01 19:12:00   NaN   
57               Jobs       9254  6534887  POSTED  2023-05-02 19:39:22   NaN   
60               Jobs       9254  5718730  POSTED  2022-09-02 15:12:39   NaN   
62               Jobs       9254  564527

In [47]:
# Function to prepare the posts data to be loaded to the database
def prepare_posts_data(df):   
    documents = []
    metadatas = []
    ids = []
        
    for idx, row in df.iterrows(): 
        documents.append(row.iloc[6])
        metadatas.append({
            'stream_name': row.iloc[0],
            'stream_id': row.iloc[1],
            'post_id': row.iloc[2],
            'action': row.iloc[3],            
            'created': row.iloc[4],
            'title': row.iloc[5],            
            'like_count': row.iloc[7],
            'comment_count': row.iloc[8],
            'mentions': row.iloc[9],
            #'labels': row.iloc[10],
            'username': row.iloc[10],
            'author_user_id': row.iloc[11],
            'author_position': row.iloc[12],
            'author_status': row.iloc[13]
        })
        ids.append(str(idx+1))       
    
    return documents, metadatas, ids

documents_posts, metadatas_posts, ids_posts = prepare_posts_data(df_posts_to_test)

print(type(documents_posts))
documents_posts[:10]

<class 'list'>


['I’m up now I’m available @client_services_am @weekend_dispatch',
 'BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY',
 'Client: Verizon - MD\nDate: 1/19/2024\nTime: ASAP\nAddress:  Flagger Force\nOrder: 574952\nNotes:  Please comment if you are available, thank you!',
 'Client: NPL - Washington, PA\nDate: 12/13/22\nAddress: 69 Market St Brownsville Pa \nOrder# 536569\nStart Time:  ASAP\nNotes: Please comment below or call the OSC if you are available.',
 'It’s suppose to storm tonight so I’m available @client_services_transition @client_services_pm @client_services_transition @client_services_pm',
 'Order: 577546\nClient: BGE Front Street\nAddress: 1101 Russell Street, Baltimore\nDate: 2/16/24\nStart: 19:00\n\n\nOrder: 577549\nClient: BGE Piney Orchard\nAddress: 730 New Waugh Chapel Road, Ondeton\nDate: 2/16/24\nStart: 19:00',
 'Cl available in cpa but willing to travel',
 'Pick Up: 220 8th Avenue NW, Gle

In [8]:
""" # find the elements of the list documents_posts which are not of string type
for element in documents_posts:
    if not isinstance(element, str):
        print(documents_posts.index(element))
        
# Deleting the elements of the list with NaN values
documents_posts = [element for element in documents_posts if isinstance(element, str)] """

' # find the elements of the list documents_posts which are not of string type\nfor element in documents_posts:\n    if not isinstance(element, str):\n        print(documents_posts.index(element))\n        \n# Deleting the elements of the list with NaN values\ndocuments_posts = [element for element in documents_posts if isinstance(element, str)] '

In [48]:
# Text splitter

# split it into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,        
    )

docs_posts = text_splitter.create_documents(documents_posts, 
                                            metadatas=metadatas_posts
                                            )
docs_posts[:2]  

[Document(page_content='I’m up now I’m available @client_services_am @weekend_dispatch', metadata={'stream_name': 'Jobs', 'stream_id': 9254, 'post_id': 6988496, 'action': 'POSTED', 'created': '2023-09-04 08:26:06', 'title': nan, 'like_count': 4, 'comment_count': 0, 'mentions': 'client_services_am,weekend_dispatch', 'username': 'Latasha_Clayton', 'author_user_id': '6e89af7c-c527-493a-93f7-a8c0205c331d', 'author_position': 'Advanced Crew Leader', 'author_status': 'suspended'}),
 Document(page_content='BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY', metadata={'stream_name': 'Water Break', 'stream_id': 6787, 'post_id': 5066545, 'action': 'POSTED', 'created': '2022-02-21 14:38:06', 'title': nan, 'like_count': 8, 'comment_count': 6, 'mentions': nan, 'username': 'Dontavius_Smith', 'author_user_id': '93f404c0-922b-49f9-b1cb-ecdbc0703e37', 'author_position': 'Area Supervisor', 'author_status': 'active'})]

## 4. Create and add data to database

### 4.1 First approach: Load data with LangChain

In [11]:
# First approach: Set the client with chromadb but load data with Chroma from LangChain

""" import chromadb
from chromadb.config import Settings

client_settings = chromadb.config.Settings(        
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
    allow_reset=True
)

# save comments to disk
db_comments = Chroma.from_documents(docs_comments,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_comments.similarity_search(query)
print(docs[0].page_content) """

Welcome to the best team u can have at flagger force


In [12]:
# This is here just as documentation

""" # save posts to disk
db_posts = Chroma.from_documents(docs_posts,
                           embedding_func,                           
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "FlaggerForce congratulations"
docs = db_posts.similarity_search(query)
print(docs[0].page_content) """

Anyways, Flagger force has opened up my eyes to a whole new future and I can’t be more excited for the journey I have ahead of me. Knowing that working in SC will be a few week project, I can’t help but feel honored to be selected as one of the few to go down and represent FF in hopes to obtain more work in the area. WE GOTTA SHINE AS BRIGHT AS OUR PPE. 😂😂. 

shout out to @Danny_Rice and his team granted I can be a pain and run my mouth occasionally, y’all’s hard work doesn’t go unnoticed. I really do appreciate  the hard work and effort y’all put in. I look forward to making y’all proud. 

well if you made it this far, stay safe Ff family! 

@Keionna_Lee @Ayeisha_Forbes @Matthew_Geis @Kyle_Schall


### 4.2 Second Approach: Load data with Chromadb

In [58]:
# Create db and comment's collection
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\vector_db"

# Initialize the database
vectordb_2datasets = chromadb.PersistentClient(path=PERSIST_DIRECTORY, 
                                               settings=Settings(allow_reset=True,
                                                                 ))
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create a collection for the comments
collection_comments = vectordb_2datasets.get_or_create_collection(name="collection_comments",
                                                                  #embedding_function=embedding_func
                                                                  )
collection_comments.add(documents=documents_comments,
                        metadatas=metadatas_comments,
                        ids=ids_comments
                        )

In [59]:
# returns a list of the first 10 items in the collection
collection_comments.peek()['documents']

['Ok',
 '@Waylon_Curtis',
 'Wow.... you too',
 '**This order has been cancelled**',
 'We can not control what other companies do. It is not our place. \n    \n    I even commented that it was a good video to remind ourselves that WE can and will do better based upon our training.',
 "I'm available",
 "@Delano_Haines... It was great to see you today. I'm happy to hear that Seth and his crew is pleased with us. As for training  @Natisha_Nwankwo and @Jeffrey_Brodsky .. they make my job easy and fun. They are both very eager to learn everything. Btw, today I personally didn't set the cones. Jeff set the transition and tangent, and Natisha set the termination with a little bit of assistance. I'm very proud of both of them.  I'll keep on doing what I do best. \nI appreciate the kind words.",
 'What is the scope of this job',
 'Congratulations and welcome back!! \n🎉👷🏼\u200d♀️🛑🎉',
 '@Portia_Dougherty it’s for crew lead and up only']

In [60]:
# Create a collection for the posts
collection_posts = vectordb_2datasets.get_or_create_collection(name="collection_posts",
                                                               )
collection_posts.add(documents=documents_posts,
                     metadatas=metadatas_posts,
                     ids=ids_posts)

In [61]:
# returns a list of the first 10 items in the collection
collection_posts.peek()['documents']

['I’m up now I’m available @client_services_am @weekend_dispatch',
 'BEEN UNDER THE WEATHER FOR A FEW DAYS BUT I CAN’T COMPLAIN CAUSE I’M STILL LIVING & ABLE….LET’S MAKE TODAY BETTER THAN YESTERDAY',
 'Client: Verizon - MD\nDate: 1/19/2024\nTime: ASAP\nAddress:  Flagger Force\nOrder: 574952\nNotes:  Please comment if you are available, thank you!',
 'Client: NPL - Washington, PA\nDate: 12/13/22\nAddress: 69 Market St Brownsville Pa \nOrder# 536569\nStart Time:  ASAP\nNotes: Please comment below or call the OSC if you are available.',
 'It’s suppose to storm tonight so I’m available @client_services_transition @client_services_pm @client_services_transition @client_services_pm',
 'Order: 577546\nClient: BGE Front Street\nAddress: 1101 Russell Street, Baltimore\nDate: 2/16/24\nStart: 19:00\n\n\nOrder: 577549\nClient: BGE Piney Orchard\nAddress: 730 New Waugh Chapel Road, Ondeton\nDate: 2/16/24\nStart: 19:00',
 'Cl available in cpa but willing to travel',
 'Pick Up: 220 8th Avenue NW, Gle

In [57]:
# Delete the client
# vectordb_2datasets.reset()

# Delete the collection
# vectordb_2datasets.delete_collection(name="collection_comments")
# vectordb_2datasets.delete_collection(name="collection_posts")

True

### 4.3 Create LangChain Objects and Retrievers

In [62]:
# Define the open-source embedding function ()
#embedding_func = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Passing a Chroma Client into Langchain
langchain_chroma_comments = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_comments",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_comments._collection.count(), "in the collection of comments.")


# Passing a Chroma Client into Langchain
langchain_chroma_posts = Chroma(client=vectordb_2datasets,                        
                         collection_name="collection_posts",
                         embedding_function=embedding_func,
)
print("There are", langchain_chroma_posts._collection.count(), "in the collection of posts.")

There are 250 in the collection of comments.
There are 196 in the collection of posts.


In [63]:
# Metadata comments's description
metadata_field_info_comments = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Comments on social network posts"
llm = ChatOllama(model="mistral")

retriever_comments = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_comments,
    document_content_description,    
    metadata_field_info_comments,
)

In [64]:
# This example only specifies a query
results_a = retriever_comments.invoke("What are some comments about birthday congratulations")
results_a

[Document(page_content='Happy Birthday', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': 'c40c2d53-7c19-441e-b925-f07a3eaa8c9a', 'comment_id': 5500647, 'created_date': '2023-03-18 15:07:17', 'like_count': 1, 'post_id': 6365853, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Russell_Spicer'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'c89cd870-1347-4a5d-acaa-d7abf1c7efa3', 'comment_id': 5671293, 'created_date': '2023-05-01 23:45:54', 'like_count': 0, 'post_id': 6515225, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'Shannon_Eachus'}),
 Document(page_content='🙏happy birthday 🙏', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': 'c40c2d53-7c19-441e-b925-f07a3eaa8c9a', 'comment_id': 5915651, 'created_date': '2023-06-30 18:03:57', 'like_count': 1, 'post_id': 6770961, 'report_co

In [65]:
# Posts Metadata description
metadata_field_info_post = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="stream_id",
        description="The id of the stream where the post was posted",
        type="int64",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post",
        type="int64",
    ),
    AttributeInfo(
        name="action",
        description="The id of the post where the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="created",
        description="The datetime when the post was create",
        type="object",
    ),
    AttributeInfo(
        name="title",
        description="The title of the post",
        type="object",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the post",
        type="int64",        
    ),
    AttributeInfo(
        name="comment_count",
        description="The number of comments received by the post",
        type="int64",        
    ),   
    AttributeInfo(
        name="mentions",
        description="The usernames that were mention in the post",
        type="object",
    ),        
    # AttributeInfo(
    #     name="labels",
    #     description="The keywords found in the post",
    #     type="int64",
    # ),
    AttributeInfo(
        name="username",
        description="The username of the author of the post",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the post",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the post. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The status of the author of the post. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Posts in social network"
llm = ChatOllama(model="mistral")

retriever_posts = SelfQueryRetriever.from_llm(    
    llm,
    langchain_chroma_posts,
    document_content_description,    
    metadata_field_info_post,
)

In [38]:
collection_comments.peek(250)['documents']

['@Rebecca_Kennett I appreciate that. I have what I’m pretty sure is a kidney stone. I’m not doing good at all today. I was ignoring it. Trying to. But it’s gotten intense in the past few hours. It takes a lot to knock me down but this is bad. 😞 thankful for my clients and how much they care about me today. ❤️ that’s the only positive. Lol',
 '@James_Carney absolutely!',
 "They didn't have it this year",
 "Charles thank you for taking time away from home to support our Atlanta team, looks like they can't wait for you to come back!  Well done.",
 'I’m ready',
 'Congratulations',
 'I HOPE YOU ALL HAVE A GREAT 2023… HOPE EVERYTHING THAT YOU’VE WORKED & PRAYED FOR COMES TO LIFE… @Collette_Monaghan @Angela_Falcon @Katie_Williams',
 'Sorry to hear that',
 "I am so happy for you and your little girl may she have a long and healthy and happy life and hopefully she doesn't have to go through anything like that again",
 'Authoring just 1 paper can be daunting and is remarked as impressive. 135 i

In [37]:
collection_posts.peek(193)['documents']

['Anytime and any day for anything',
 'Available',
 'Acl available the rest of the week night🌚and day 🌝',
 'Client: BGE\nDate: 8/11/2023\nTime: multiple\nAddress:\n730 New Waugh Chapel Road Odenton, MD - 1 CM - restoration - 7AM\n3602 W Rodgers Ave Baltimore, MD - PSSB - 10 AM \nOrder# multiple\nNotes: storm duty order\nPlease respond below if you are interested.  Thank you!!!!!',
 "Good Morning Team!! If you'll notice, there isn't a Ford or Dodge emblem hanging there. Be vigilant while driving, morning, noon and night. His comrades are one the move!! Stay Safety Driven!! Crew Members, please be the second set of eyes when you're in the truck with your leads!! Have a Safe and Hydrated Day!!👷🏼\u200d♀️🛑💦",
 'Order #: 569479\nClient: Infrasource Electric\nDate: 12/26/23\nStart Time:  7:00\nAddress:  155 N Main St, Sugarloaf, PA 18249\n\n\nPLEASE COMMENT IF AVAILABLE',
 'Available this weekend and all next week',
 'Available for Cl or Cm',
 'Me and @Rickey_Goodwin  available!!!! ',
 'Crew 

In [39]:
documents_posts

['Anytime and any day for anything',
 'Available',
 'Acl available the rest of the week night🌚and day 🌝',
 'Client: BGE\nDate: 8/11/2023\nTime: multiple\nAddress:\n730 New Waugh Chapel Road Odenton, MD - 1 CM - restoration - 7AM\n3602 W Rodgers Ave Baltimore, MD - PSSB - 10 AM \nOrder# multiple\nNotes: storm duty order\nPlease respond below if you are interested.  Thank you!!!!!',
 "Good Morning Team!! If you'll notice, there isn't a Ford or Dodge emblem hanging there. Be vigilant while driving, morning, noon and night. His comrades are one the move!! Stay Safety Driven!! Crew Members, please be the second set of eyes when you're in the truck with your leads!! Have a Safe and Hydrated Day!!👷🏼\u200d♀️🛑💦",
 'Order #: 569479\nClient: Infrasource Electric\nDate: 12/26/23\nStart Time:  7:00\nAddress:  155 N Main St, Sugarloaf, PA 18249\n\n\nPLEASE COMMENT IF AVAILABLE',
 'Available this weekend and all next week',
 'Available for Cl or Cm',
 'Me and @Rickey_Goodwin  available!!!! ',
 'Crew 

In [66]:
# This example only specifies a query
result_b = retriever_posts.invoke("What are some posts about birthday congratulations")
result_b

[Document(page_content='I need work on 7/4/23 and thank you', metadata={'action': 'POSTED', 'author_position': 'Crew Member', 'author_status': 'active', 'author_user_id': '2941f06f-db75-44cd-8422-668d8362125a', 'comment_count': 4, 'created': '2023-06-28 20:10:28', 'like_count': 1, 'post_id': 6762569, 'stream_id': 9254, 'stream_name': 'Jobs', 'title': 'Cm- Available', 'username': 'Cindy_Knight'}),
 Document(page_content='About 1 month ago, I had the chance to catch up with Crew Leader @Chaad_TerrySr to celebrate his 5 year anniversary.  Chaad is one of our key leaders supporting the PECO-Flagger Force partnership!  Thank you Chaad for 5 years of outstanding support to our clients, your focus on keeping our team members safe and providing a valuable service to our community!  Congratulations!', metadata={'action': 'POSTED', 'author_position': 'President & CEO', 'author_status': 'active', 'author_user_id': '314f3df5-80c3-432d-8446-666622b10fb6', 'comment_count': 19, 'created': '2022-03-28

In [67]:
# Define the ensemble retriever
from langchain.retrievers import EnsembleRetriever

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever_comments, retriever_posts], weights=[0.5, 0.5]
)

In [69]:
docs = ensemble_retriever.invoke("birthday congratulations")
docs

[Document(page_content='Happy Birthday', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': 'c40c2d53-7c19-441e-b925-f07a3eaa8c9a', 'comment_id': 5500647, 'created_date': '2023-03-18 15:07:17', 'like_count': 1, 'post_id': 6365853, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Russell_Spicer'}),
 Document(page_content='I need work on 7/4/23 and thank you', metadata={'action': 'POSTED', 'author_position': 'Crew Member', 'author_status': 'active', 'author_user_id': '2941f06f-db75-44cd-8422-668d8362125a', 'comment_count': 4, 'created': '2023-06-28 20:10:28', 'like_count': 1, 'post_id': 6762569, 'stream_id': 9254, 'stream_name': 'Jobs', 'title': 'Cm- Available', 'username': 'Cindy_Knight'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'c89cd870-1347-4a5d-acaa-d7abf1c7efa3', 'comment_id': 5671293, 'created_date': '2023-05-01 23:45:54', 'like_count

In [99]:
# Passing a Chroma Client into Langchain
langchain_chroma = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_comments",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma._collection.count(), "in the collection of comments.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")

There are 200 in the collection of comments.


In [100]:
# Passing a Chroma Client into Langchain
langchain_chroma_post = Chroma(    
    client=vectordb_2datasets,
    collection_name="collection_posts",
    #embedding_function=embedding_func,
)

print("There are", langchain_chroma_post._collection.count(), "in the collection.")
#print("There are", langchain_chroma._collection_posts.count(), "in the collection of posts.")
langchain_chroma_post._collection

There are 50 in the collection.


Collection(name=collection_posts)

In [101]:
# Test the langchain_chroma_post
query = "What it got said about Jonathan_Muha"
docs = langchain_chroma_post.similarity_search(query)
print(docs[0].page_content)

ValueError: You must provide embeddings or a function to compute them

In [35]:
# Create a database

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# save to disk
db = Chroma.from_documents(docs,
                           embedding_function,
                           #ids=ids,
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
query = "What it got said about Jonathan_Muha"
docs = db.similarity_search(query)
print(docs[0].page_content)

Have a safe day, @Jonathan_Muha!


## 3. Creating our self-querying retriever

Now we can instantiate our retriever. To do this we’ll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents.

In [18]:
# Metadata description
metadata_field_info = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was poste. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="object",
    ),
    AttributeInfo(
        name="created_date",
        description="The datetime when the comment was posted",
        type="object",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="int64",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="int64",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="int64",        
    ),    
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="int64",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="object",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="object",
    ),    
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="object",
    ),
    AttributeInfo(
        name="author_status",
        description="The position of the author of the comment. One of ['active','suspended']",
        type="object",
    ),
]

# Creating our self-querying retriever
document_content_description = "Comments on social network posts"
llm = ChatOllama(model="mistral")

retriever = SelfQueryRetriever.from_llm(    
    llm,
    db,
    document_content_description,    
    metadata_field_info,
)

### 3.1 Testing out the retriever

In [22]:
# This example only specifies a query
retriever.invoke("What are two comments about congratulations")

[Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'e62d292b-7a01-4aa9-ab5a-e6fbfaa6e326', 'comment_id': 3927754, 'created_date': '2022-01-26 00:17:59', 'like_count': 0, 'post_id': 4989194, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'Donnie_Ziegler'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Advanced Crew Leader', 'author_status': 'active', 'author_user_id': '5a3ca39e-5c41-4501-8eab-6a9ed0a62044', 'comment_id': 4686674, 'created_date': '2022-08-14 02:01:54', 'like_count': 0, 'post_id': 5643777, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'William_MobleyJr'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': '74f383df-5aa7-4efb-af99-5f96405dadc8', 'comment_id': 5163635, 'created_date': '2022-12-17 00:01:52', 'like_count': 0, 'post_id': 6082

In [23]:
df_comments_to_test['username'].value_counts()

username
FlaggerForce        8
weekend_dispatch    8
Karen_Stroup        8
Linwood_DavisJr     6
Jessica_Beers       5
                   ..
Deon_McDaniel       1
Adrienne_Long       1
Stephen_Michael     1
Towanda_Gordon      1
Kyndra_Edwards      1
Name: count, Length: 201, dtype: int64

In [28]:
# This example only specifies a filter
retriever.invoke("What is a highly liked (above 4) comment")

[Document(page_content='What did the buffalo say to his son when he dropped him off at school?\nBison!!!! 😂🤣😭 @Jeffrey_Williams @Seth_Lee 😂🤣😭', metadata={'author_position': 'Field Trainer 1', 'author_status': 'suspended', 'author_user_id': 'f1555029-1597-4d3e-9199-4ebc04ed9702', 'comment_id': 4695288, 'created_date': '2022-08-16 19:19:26', 'like_count': 6, 'post_id': 5653172, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Collette_Monaghan'}),
 Document(page_content='As @Julie_Snedeker mentioned the focus here is to stop tampering with Flagger Forced issued equipment. The cameras believe it or not are in the vehicles for YOUR SAFETY. If you pick up the cell phone it says to put it down... if your following to close it says to back off... it is easy to sit and nitpick a photo but it is difficult to follow company policy.', metadata={'author_position': 'Safety Manager', 'author_status': 'active', 'author_user_id': '2aac35e8-f970-4434-9b46-56274384cf89', 'comment_id': 66010

In [27]:
# group by to find the comments with highest number of likes
df_comments_to_test.groupby('comment_id')['like_count'].sum().sort_values(ascending=False)

comment_id
6601009    6
4695288    6
5959837    4
3993841    4
6468179    4
          ..
5031138    0
5049534    0
5067432    0
5068273    0
6865894    0
Name: like_count, Length: 300, dtype: int64

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
query = "Thank you Robert, I appreciate your knowledge"
docs = db.similarity_search(query)
print(docs[0].page_content)


In [None]:
import chromadb
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings


PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
# embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embeddings = OllamaEmbeddings(model="mistral") 

persistent_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
collection_comments = persistent_client.get_or_create_collection("collection_comments",
                                                        metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                        embedding_function=embeddings
                                                        )
 # Add data to collection
collection_comments.add(documents=documents,
                        metadatas=metadatas,
                        ids=ids
                        )

langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_comments",
    embedding_function=embeddings
)

print("There are", langchain_chroma._collection.count(), "in the collection")

In [None]:
# Deleting the client
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
import chromadb
#from langchain_chroma import Chroma
#from langchain_community.embeddings import OllamaEmbeddings
#import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

      
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")    
    

# Function to create and load data to the database
"""
The embedding function takes text as input, and performs tokenization and embedding. If no embedding function is supplied, Chroma will use sentence transformer as a default.
https://docs.trychroma.com/embeddings#sentence-transformers
By default, Chroma uses all-MiniLM-L6-v2
"""


def upload_to_chromadb(documents, metadatas, ids):
    try:            
        # create the open-source embedding function
        # embedding = OllamaEmbeddings(model="mistral")
        # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")   # Default
        embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")          # Best model from included in ChromaDB
        
        # Initialize ChromaDB client
        # load it into Chroma
        db = Chroma.from_documents(documents, embeddings,
                                    persist_directory=PERSIST_DIRECTORY)
               
        # Create a comment's collection
        collection_comments = chromadb_client.create_collection(name="comments_collection",
                                                              metadata={"hnsw:space": "l2"}, # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')                                                              
                                                              embedding_function=embeddings
                                                              )        
        
        # Add data to collection
        collection_comments.add(documents=documents,
                                metadatas=metadatas,
                                ids=ids
                                )
        print("Data uploaded successfully.")
    except Exception as e:
        print(f"Failed to upload data: {e}")

upload_to_chromadb(documents, metadatas, ids)