# Artefact I - RAG Application just with Comments

## 1. Initial Setup

In [31]:
# Python 
import os
import pandas as pd

# LangChain 
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.chat_models import ChatOllama

# Expose the OpenAI key through the standard OPENAI_API_KEY variable.
# SECURITY: never paste the secret into this file. The previous version used the
# key itself as the variable *name* (with an empty value), which configured
# nothing and leaked the key - that key should be revoked.
# Export OPENAI_API_KEY in the shell instead; setdefault() keeps an exported
# value untouched and only defines an empty placeholder when it is missing.
os.environ.setdefault("OPENAI_API_KEY", "")

In [32]:
# Load the cleaned comments dataset (comments_cleaned.csv) into a DataFrame
# Alternative location of the same folder on the Linux machine:
#path = "/home/ubuntu/thesis_GenAI/data/production_datasets/cleaned_datasets"
path = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\production_datasets\\cleaned_datasets"
file_name = "comments_cleaned.csv"
file_path = os.path.join(path, file_name)
df_comments = pd.read_csv(file_path)

# Draw a random sample of 300 comments to test the pipeline with
# (no random_state is given, so every run picks different rows)
df_comments_to_test = df_comments.sample(300)
# Display 3 random rows of the full dataset as a quick visual check
df_comments.sample(3)

Unnamed: 0,stream_name,created_date,post_id,comment_id,comment_text,like_count,report_count,username,author_user_id,author_position,author_status
85876,Jobs,2022-10-20 17:49:14,5888937,4939735,Me,0,0,Risa_Gross,efe02e94-0e60-462a-a619-a5bc67fc2da4,Advanced Crew Leader,suspended
65568,Jobs,2023-02-12 18:23:49,6250664,5368074,@Brady_Macklin,1,0,Ricky_Ellis,7f996efb-c280-4fbc-aed3-890599eba1e9,Advanced Crew Leader,suspended
15290,Jobs,2023-12-08 22:47:47,7319986,6529232,Refer to previous comment. Maybe she didn’t kn...,0,0,Megan_Urias,c590a2b1-9d02-442e-8a6a-fc89de0ae23a,Crew Member,active


## 2. Prepare Data for Embeddings

In [33]:
# Function to prepare the data to be loaded to the database
def prepare_data(df):
    """Split a comments DataFrame into the three parallel lists Chroma expects.

    Columns are read by position, so the frame must keep this layout
    (assumed from the metadata keys used below - confirm against the CSV):
    0 stream_name, 1 created_date, 2 post_id, 3 comment_id, 4 comment text,
    5 like_count, 6 report_count, 7 username, 8 author_user_id,
    9 author_position, 10 author_status.

    Args:
        df: DataFrame with one comment per row and an index that supports
            ``label + 1`` (e.g. the default integer index).

    Returns:
        A ``(documents, metadatas, ids)`` tuple of equally long lists:
        the comment texts, one metadata dict per comment, and the string ids
        built from the row's index label plus one.
    """
    text_position = 4
    # Metadata key -> positional column it is read from
    metadata_positions = {
        'stream_name': 0,
        'created_date': 1,
        'post_id': 2,
        'comment_id': 3,
        'like_count': 5,
        'report_count': 6,
        'username': 7,
        'author_user_id': 8,
        'author_position': 9,
        'author_status': 10,
    }

    documents, metadatas, ids = [], [], []
    for label, record in df.iterrows():
        documents.append(record.iloc[text_position])
        metadatas.append(
            {key: record.iloc[position] for key, position in metadata_positions.items()}
        )
        # ids are strings derived from the index label, shifted by one
        ids.append(str(label + 1))

    return documents, metadatas, ids

# Build the texts, metadata dicts and ids from the test sample
documents, metadatas, ids = prepare_data(df_comments_to_test)

# Quick check: show the container type and preview the first 10 comment texts
print(type(documents))
documents[:10]

<class 'list'>


['Congratulations 🎉 🥳',
 'I need a cm',
 'I can do it',
 'Yeah but where is his ppe lol 😆',
 'I just spoke with my as I believe they are sending over the crew that is on standby if we finish our current job we can go help as well we are a team here at ppl tannersville',
 'Do you not meet your crew lead there (1405 Bay Ridge)??',
 'Filled - thank you all',
 'Me',
 'Considering the space you have there, the shift looks good! I appreciate that you have the stop slow paddle available just in case it is needed. \n\n     Coachable moment: place the paddle in a safer location. It appears to be staged across the pedestrian path way. Maybe place it back towards the fence or near the trash can out of the way. \n   \n  Over all great job team. \n\nThe eagle is perched watching ove the site ready to act if needed. (Picture #4)',
 '@Lance_Fountain thank you! we have added you to the order, thank you everyone who volunteered!']

In [34]:
# Folder where the Chroma vector store is persisted (used by the next cells)
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

# Text splitter: split the comments into chunks
text_splitter = CharacterTextSplitter(    
    separator="\n\n",           # split only on blank lines (paragraph breaks)
    chunk_size=350,             # target chunk size, measured with length_function
    chunk_overlap=0,            # no text shared between neighbouring chunks
    length_function=len,        # size is counted in characters
    is_separator_regex=False,   # the separator is a literal string, not a regex
    )

# Wrap every chunk in a Document, passing the metadata list alongside the texts
# NOTE(review): a comment with several paragraphs may yield more than one
# Document, so len(docs) can exceed len(documents) - confirm before reusing `ids`
docs = text_splitter.create_documents(documents, metadatas=metadatas)
docs[:2]  

[Document(page_content='Congratulations 🎉 🥳', metadata={'stream_name': 'Safety & Operations', 'created_date': '2022-12-02 15:39:40', 'post_id': 6032891, 'comment_id': 5102820, 'like_count': 0, 'report_count': 0, 'username': 'Maegan_Winters', 'author_user_id': '809b6f68-e69a-4217-84eb-30c53a08b05a', 'author_position': 'Crew Leader', 'author_status': 'active'}),
 Document(page_content='I need a cm', metadata={'stream_name': 'Jobs', 'created_date': '2022-01-21 13:47:25', 'post_id': 4978497, 'comment_id': 3914808, 'like_count': 0, 'report_count': 0, 'username': 'weekend_dispatch', 'author_user_id': 'cd381448-d11f-41b6-b15c-b2dd0a884bf5', 'author_position': 'Weekend Dispatch', 'author_status': 'active'})]

In [35]:
# Create a database

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# Embed the chunks and save them to disk.
# `ids` is not passed: presumably because the splitter can emit more chunks than
# there are comments, so the per-comment ids would not line up with `docs`.
# NOTE(review): without explicit ids, re-running this cell against the same
# PERSIST_DIRECTORY likely stores the chunks a second time - confirm, and reset
# the database first if duplicates matter.
db = Chroma.from_documents(docs,
                           embedding_function,
                           persist_directory=PERSIST_DIRECTORY)

# Test the db
# The hits get their own name: assigning them to `docs` would overwrite the
# chunk list, and re-running this cell would then index the hits instead of
# the comments.
query = "What it got said about Jonathan_Muha"
results = db.similarity_search(query)
if results:
    print(results[0].page_content)
else:
    # An empty store returns no hits; avoid an IndexError on results[0]
    print("No matching documents found.")

Have a safe day, @Jonathan_Muha!


## 3. Creating our self-querying retriever

Now we can instantiate our retriever. To do this we’ll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents.

In [18]:
# Metadata description
# Each AttributeInfo is rendered into the prompt of the self-query LLM, so the
# text must be accurate and `type` must use the names the query constructor
# works with ("string", "integer", ...) rather than pandas dtypes such as
# "object" / "int64", which tell the model nothing about how to compare values.
metadata_field_info = [
    AttributeInfo(
        name="stream_name",
        description="The stream where the comment was posted. One of ['Jobs','Water Break','Safety & Operations','OSC, We Can Help','Flagger Force Connect','The Whiteboard','Test Stream']",
        type="string",
    ),
    AttributeInfo(
        name="created_date",
        # Stored as text in the metadata, e.g. '2022-12-02 15:39:40'
        description="The datetime when the comment was posted, formatted as 'YYYY-MM-DD HH:MM:SS'",
        type="string",
    ),
    AttributeInfo(
        name="post_id",
        description="The id of the post where the comment was posted",
        type="integer",
    ),
    AttributeInfo(
        name="comment_id",
        description="The id of the comment",
        type="integer",
    ),
    AttributeInfo(
        name="like_count",
        description="The number of likes received by the comment",
        type="integer",
    ),
    AttributeInfo(
        name="report_count",
        description="The number of reports where the comment appears",
        type="integer",
    ),
    AttributeInfo(
        name="username",
        description="The username of the author of the comment",
        type="string",
    ),
    AttributeInfo(
        name="author_user_id",
        description="The id of the author of the comment",
        type="string",
    ),
    AttributeInfo(
        name="author_position",
        description="The position of the author of the comment. One of ['Advanced Crew Leader','Crew Member','Crew Leader','Weekend Dispatch','Lead Instructor','Area Supervisor','Field Trainer 1','Warehouse Coordinator','Field Trainer 2','Executive Assistant','Internal Communications Manager','Field Manager','Safety Professional','Internal Communications Coordinator', 'Employee Services Supervisor','Safety Advocate', and many more...]",
        type="string",
    ),
    AttributeInfo(
        name="author_status",
        # Previously described as "The position of the author" (copy-paste slip)
        description="The account status of the author of the comment. One of ['active','suspended']",
        type="string",
    ),
]

# Build the self-querying retriever: a local Mistral model (served by Ollama)
# turns a natural-language question into a semantic query plus metadata filter
# that is then run against the Chroma store `db`.
llm = ChatOllama(model="mistral")
document_content_description = "Comments on social network posts"

retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

### 3.1 Testing out the retriever

In [22]:
# This example only specifies a query
retriever.invoke("What are two comments about congratulations")

[Document(page_content='Congratulations', metadata={'author_position': 'Crew Leader', 'author_status': 'suspended', 'author_user_id': 'e62d292b-7a01-4aa9-ab5a-e6fbfaa6e326', 'comment_id': 3927754, 'created_date': '2022-01-26 00:17:59', 'like_count': 0, 'post_id': 4989194, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'Donnie_Ziegler'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Advanced Crew Leader', 'author_status': 'active', 'author_user_id': '5a3ca39e-5c41-4501-8eab-6a9ed0a62044', 'comment_id': 4686674, 'created_date': '2022-08-14 02:01:54', 'like_count': 0, 'post_id': 5643777, 'report_count': 0, 'stream_name': 'Safety & Operations', 'username': 'William_MobleyJr'}),
 Document(page_content='Congratulations', metadata={'author_position': 'Crew Member', 'author_status': 'suspended', 'author_user_id': '74f383df-5aa7-4efb-af99-5f96405dadc8', 'comment_id': 5163635, 'created_date': '2022-12-17 00:01:52', 'like_count': 0, 'post_id': 6082

In [23]:
df_comments_to_test['username'].value_counts()

username
FlaggerForce        8
weekend_dispatch    8
Karen_Stroup        8
Linwood_DavisJr     6
Jessica_Beers       5
                   ..
Deon_McDaniel       1
Adrienne_Long       1
Stephen_Michael     1
Towanda_Gordon      1
Kyndra_Edwards      1
Name: count, Length: 201, dtype: int64

In [28]:
# This example only specifies a filter
retriever.invoke("What is a highly liked (above 4) comment")

[Document(page_content='What did the buffalo say to his son when he dropped him off at school?\nBison!!!! 😂🤣😭 @Jeffrey_Williams @Seth_Lee 😂🤣😭', metadata={'author_position': 'Field Trainer 1', 'author_status': 'suspended', 'author_user_id': 'f1555029-1597-4d3e-9199-4ebc04ed9702', 'comment_id': 4695288, 'created_date': '2022-08-16 19:19:26', 'like_count': 6, 'post_id': 5653172, 'report_count': 0, 'stream_name': 'Water Break', 'username': 'Collette_Monaghan'}),
 Document(page_content='As @Julie_Snedeker mentioned the focus here is to stop tampering with Flagger Forced issued equipment. The cameras believe it or not are in the vehicles for YOUR SAFETY. If you pick up the cell phone it says to put it down... if your following to close it says to back off... it is easy to sit and nitpick a photo but it is difficult to follow company policy.', metadata={'author_position': 'Safety Manager', 'author_status': 'active', 'author_user_id': '2aac35e8-f970-4434-9b46-56274384cf89', 'comment_id': 66010

In [27]:
# group by to find the comments with highest number of likes
df_comments_to_test.groupby('comment_id')['like_count'].sum().sort_values(ascending=False)

comment_id
6601009    6
4695288    6
5959837    4
3993841    4
6468179    4
          ..
5031138    0
5049534    0
5067432    0
5068273    0
6865894    0
Name: like_count, Length: 300, dtype: int64

In [None]:
# Reset the persisted Chroma database stored in PERSIST_DIRECTORY
# NOTE(review): per the Chroma client documentation reset() empties the whole
# database (every collection and entry), not just this client object - run it
# only when the stored vectors can be discarded. allow_reset=True is set in
# the client settings because reset() is otherwise expected to be refused.
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
query = "Thank you Robert, I appreciate your knowledge"
docs = db.similarity_search(query)
print(docs[0].page_content)


# Artefact II - RAG using Two Collections

In [None]:
import chromadb
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings


PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
# embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
embeddings = OllamaEmbeddings(model="mistral") 

persistent_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))

# The LangChain embeddings object is deliberately NOT passed as chromadb's
# `embedding_function`: chromadb expects its own callable EmbeddingFunction,
# while LangChain models expose embed_documents()/embed_query() and are not
# callable. The vectors are computed explicitly below instead.
collection_comments = persistent_client.get_or_create_collection(
    "collection_comments",
    metadata={"hnsw:space": "l2"},  # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')
)

# Add data to collection (skipped for empty input, which chromadb rejects)
if documents:
    collection_comments.add(documents=documents,
                            # same model the LangChain wrapper below uses for queries
                            embeddings=embeddings.embed_documents(documents),
                            metadatas=metadatas,
                            ids=ids
                            )

# LangChain view over the same collection; it embeds queries with `embeddings`
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_comments",
    embedding_function=embeddings
)

print("There are", langchain_chroma._collection.count(), "in the collection")

In [None]:
# Reset the persisted Chroma database stored in PERSIST_DIRECTORY
# NOTE(review): per the Chroma client documentation reset() empties the whole
# database (every collection and entry), not just this client object - run it
# only when the stored vectors can be discarded. allow_reset=True is set in
# the client settings because reset() is otherwise expected to be refused.
import chromadb
from chromadb.config import Settings
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"

chromadb_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=Settings(allow_reset=True))
chromadb_client.reset()

In [None]:
import chromadb
#from langchain_chroma import Chroma
#from langchain_community.embeddings import OllamaEmbeddings
#import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

      
PERSIST_DIRECTORY = "C:\\Users\\eduar\\Documents\\Master_Thesis\\GenAI_Thesis_Beekeeper\\data\\datasets_db"
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")    
    

# Function to create and load data to the database
"""
The embedding function takes text as input, and performs tokenization and embedding. If no embedding function is supplied, Chroma will use sentence transformer as a default.
https://docs.trychroma.com/embeddings#sentence-transformers
By default, Chroma uses all-MiniLM-L6-v2
"""


def upload_to_chromadb(documents, metadatas, ids):
    """Embed the comments and store them in the persistent "comments_collection".

    Failures are reported with a printed message instead of being raised, so a
    notebook run continues after a failed upload.

    Args:
        documents: List of comment texts (plain strings).
        metadatas: List of metadata dicts, one per text.
        ids: List of unique string ids, one per text.

    Returns:
        None.
    """
    try:
        # Best sentence-transformers model among those tried in this notebook
        embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

        # Open the client here instead of relying on a `chromadb_client`
        # global left behind by another cell. The settings match the other
        # clients opened on this path in the notebook.
        client = chromadb.PersistentClient(path=PERSIST_DIRECTORY,
                                           settings=Settings(allow_reset=True))

        # get_or_create keeps the function re-runnable; create_collection
        # raises once the collection exists. No `embedding_function` is given:
        # chromadb expects its own callable type, not a LangChain embeddings
        # object, so the vectors are computed explicitly below.
        collection_comments = client.get_or_create_collection(
            name="comments_collection",
            metadata={"hnsw:space": "l2"},  # Squared L2 norm(l2) is the default, inner product('ip') or cosine similarity('cosine')
        )

        # The former Chroma.from_documents(documents, ...) call was dropped: it
        # needs Document objects (not strings) and would have stored the data a
        # second time in a separate default collection.
        if documents:
            # Add data to collection
            collection_comments.add(documents=documents,
                                    embeddings=embeddings.embed_documents(documents),
                                    metadatas=metadatas,
                                    ids=ids
                                    )
        print("Data uploaded successfully.")
    except Exception as e:
        # Deliberate best-effort boundary for interactive use
        print(f"Failed to upload data: {e}")

upload_to_chromadb(documents, metadatas, ids)