In [None]:
#importing all packages

import pandas as pd #to make csv file into table interpreted by Python
import os #to traverse folders and file system


from sentence_transformers import SentenceTransformer #for vector embedding creation
from langchain_community.embeddings import SentenceTransformerEmbeddings
import time
import nltk #for natural lang processing
from nltk.tokenize import sent_tokenize #tokenises string by sentence
from langchain_community.vectorstores import FAISS #for vector db
from langchain_core.documents import Document #document is a distinct piece of text

In [None]:
# Combining transcripts from all video ids to make one final string to be embedded. Completed by traversing all .txt files from 
# main transcripts folder

def get_combined_content(csv_file, folder_path, encoding="utf-8"):
    """
    Combines the transcript text of every video listed in the CSV file.

    For each value in the CSV's 'Video_ID' column, the file
    '<folder_path>/<Video_ID>.txt' is read (if it exists) and its text is
    appended to the result, followed by a newline. IDs without a matching
    file are reported on stdout and skipped.

    Args:
        csv_file (String): Path of the CSV file containing a 'Video_ID' column.
        folder_path (String): Folder that holds the '<Video_ID>.txt' transcripts.
        encoding (String): Text encoding of the transcript files. Passed
            explicitly so the result does not depend on the platform's
            default encoding.

    Returns:
        String: Combined transcript text, in CSV order.
    """
    video_ids = pd.read_csv(csv_file)['Video_ID'].tolist() #creates list of video ids
    parts = []

    for video_id in video_ids:
        txt_file_path = os.path.join(folder_path, f'{video_id}.txt')

        if not os.path.isfile(txt_file_path):
            print(f"File {txt_file_path} does not exist.")
            continue

        with open(txt_file_path, 'r', encoding=encoding) as file:
            parts.append(file.read())

    print("Task completed.")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(part + "\n" for part in parts)


In [None]:
#functions for chunking and adding chunks for embedding

nltk.download('punkt') # download the 'punkt' resource via NLTK; sent_tokenize is used later in this notebook (presumably depends on it)

def chunk_text(text, chunk_size):
    """Split text into consecutive slices of at most chunk_size characters."""
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

def process_sentences(sentences, model, chunk_size=256):
    """
    Chunk every sentence and embed all resulting chunks.

    Args:
        sentences: Iterable of strings to process.
        model: Object exposing encode(list_of_str), e.g. a
            sentence-transformers model.
        chunk_size (int): Maximum number of characters per chunk.

    Returns:
        tuple: (chunks_list, embeddings) where chunks_list is the list of
        text chunks and embeddings is a list with one embedding per chunk,
        in the same order.
    """
    chunks_list = []
    for sentence in sentences:
        chunks_list.extend(chunk_text(sentence, chunk_size))

    # Nothing to embed: skip the model call entirely.
    if not chunks_list:
        return [], []

    # Encode all chunks in a single call so the model can batch them,
    # instead of invoking it once per sentence.
    embeddings = list(model.encode(chunks_list))
    return chunks_list, embeddings


In [None]:
#using all-MiniLM-L6-v2 model and using above functions

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

start_time = time.time() #start computation time

text = get_combined_content("mkbdh_information.csv","Main_Transcripts") #getting final string
sentences = sent_tokenize(text)

chunks, embeddings = process_sentences(sentences, model) #getting list of chunks and embeddings

# Creation of embeddings.txt in 'write mode' to store all embeddings, one line per chunk
with open('embeddings.txt', 'w') as f:
    for i, embedding in enumerate(embeddings):
        f.write(f"Embedding for chunk {i+1}: {embedding.tolist()}\n") #writing it to text file

documents = [Document(page_content=chunk) for chunk in chunks] #one Document per chunk

# Show a count and a small preview instead of dumping every Document into the cell output.
print(f"Created {len(documents)} documents. Preview of the first 3:")
print(documents[:3])

print("Process finished --- %s seconds ---" % (time.time() - start_time))


In [None]:

from langchain_community.vectorstores import FAISS #for similarity search


# Create text-embedding pairs where each chunk and its precomputed embedding is a pair
text_embedding_pairs = list(zip(chunks, embeddings))

# FAISS expects a LangChain Embeddings object (with embed_query/embed_documents) as `embedding`;
# a raw SentenceTransformer does not provide that interface, so wrap the same model name.
embedding_function = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Create vector_db from the precomputed vectors
vector_db = FAISS.from_embeddings(text_embeddings=text_embedding_pairs, embedding=embedding_function)

# Save the vector_db as 'vector_store'
vector_db.save_local(folder_path="vector_store", index_name="index")

tensor_size = len(embeddings)
print(tensor_size)
print("Vector db created.")


In [None]:

def load_faiss_index(index_path: str,emb_model) -> FAISS:
    """
    Load a FAISS vector store saved on disk.

    Args:
        index_path: Folder that contains the saved vector store.
        emb_model: Name of the sentence-transformers model used to embed
            queries. It should be the same model that produced the stored
            vectors, otherwise query vectors will not match the index.

    Returns:
        FAISS: The loaded vector store.
    """
    # model_name must be a model-name string, not a SentenceTransformer instance.
    embeddings = SentenceTransformerEmbeddings(
        model_name=emb_model,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    # NOTE(review): allow_dangerous_deserialization=True unpickles data from index_path;
    # only load vector stores from a trusted source.
    return FAISS.load_local(index_path, embeddings=embeddings, allow_dangerous_deserialization=True)
# def load_faiss_index(index_path: str,emb_model) -> FAISS:
#     model_kwargs = {'device':'cpu'}
#     encode_kwargs = {'normalize_embeddings': False}
#     embeddings = HuggingFaceEmbeddings(
#     model_name=emb_model,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )   
#     return FAISS.load_local(index_path, embeddings=embeddings, allow_dangerous_deserialization=True) #bypass safety check
def load_faiss_vectordb(db,emb_model):
    """Thin wrapper: load the vector store at `db` via load_faiss_index using `emb_model`."""
    return load_faiss_index(db, emb_model)

In [None]:
# Maps each category of incoming message to the instruction the LLM should follow for it.
# The whole dictionary is interpolated into the prompt built in get_retrieval_results.
message_type_response = { # message category -> handling instruction
    "Greetings Message" : "Greet the user by saying Hi or Hello",
    "Appreciation/Feedback Messages" : "Reply with thank you or similar response",
    "Questions Asked About the Content" : "Refer the context provided below",
    "Questions Asked Out of Context but Relevant to the Influencer" : "Refer online material and generate response",
    "Questions Asked but Irrelevant to the Influencer": "Don't answer",
    "Spam Messages": "Don't answer"
}

In [None]:
from langchain_core.document_loaders.base import Document
from typing import List

#start of RAG stage

#function for similarity search using an arbitrary number of top documents(5) and take query as a string and return a list of docs

def similarity_search(faiss_index: FAISS, query: str, k: int = 5) -> List[Document]:
    """Run a similarity search for `query` on `faiss_index` and return its `k` results."""
    return faiss_index.similarity_search(query, k=k)

In [None]:
#library that converts text to speech
import pyttsx3
def text_to_speech(text, filename):
    """Synthesize `text` with pyttsx3 and save the audio to `filename`."""
    engine = pyttsx3.init()
    # Use the first voice reported by the engine.
    first_voice = engine.getProperty('voices')[0]
    engine.setProperty('voice', first_voice.id)
    # Queue the save request, then run the engine so it is actually processed.
    engine.save_to_file(text, filename)
    engine.runAndWait()

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI #importing gemini model from google 

def load_model():
    """
    Create the Gemini chat model used to generate responses.

    The API key is read from the GOOGLE_API_KEY environment variable so that
    no credential is stored in the notebook.

    Returns:
        ChatGoogleGenerativeAI: Chat model configured for "gemini-pro".

    Raises:
        RuntimeError: If GOOGLE_API_KEY is not set or is empty.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY environment variable is not set; "
            "export it before calling load_model()."
        )
    return ChatGoogleGenerativeAI(model="gemini-pro", api_key=api_key)

In [None]:
def get_retrieval_results(context: List[Document], point: str) -> str:
    """
    Build the MKBHD persona prompt from the retrieved documents and the user
    query, send it to the chat model, and return the generated text.

    Args:
        context: Retrieved documents; their page_content values are joined
            with single spaces to form the context text of the prompt.
        point: The user's query.

    Returns:
        str: The `content` attribute of the model's response.

    Note:
        Reads the module-level `message_type_response` dictionary and calls
        `load_model()` on every invocation.
    """
    context = [doc.page_content for doc in context] # extract the text of each retrieved document
    context = " ".join(context)
    # Build the prompt: persona instructions, message-type guidelines, the query and the context.
    # 263 tokens (author's estimate for this prompt template)
    prompt =  f''' Assume you are a Human tech content creator named MKBHD/ Marques Brownlee and you receive different comments and queries. 
                    Ensure to give response such that it looks like a human has answered it and not a bot.
                    Read the following text based on the following query and give response. Different types of messages are provided here as key of the dictionary and the expected response as respective value. Give the response accordingly.
                    In case, the query is asking response for a specific timeline, the context provided below has upload date of the data (release date of device would also be around it), refer it while giving response. Don't give old devices as response.
                    Message_Response: {message_type_response}            
                    Query is: {point}
                    Text: {context}
                    When the query is related to the context provided, don't give any irrelevant information. In other cases, refer the Message_Response dictionary and generate the response by following those guidelines; don't give the value directly. 
                    
                    More guidelines:
                    1. Never mention in the response that the context is being refered, and give the response directly.
                    2. Avoid including feel free to ask in the response when there is an actual response to the query. 
                    3. Avoid greetings if the query doesn't consist of only greetings.
                    4. Understand the language and words chosen by the content creator in the context provided and ensure that the response follows it.
                '''
    # Alternative, shorter prompt kept for reference (not used).
    # 210 tokens
    # prompt =  f''' Assume you are a human tech content creator who receives various comments and queries 
    #                 Read the text {context} based on the query {point} and ensure to respond like a human, not a bot
    #                 Different types of messages are provided here as keys of the dictionary with the expected response as the respective value.Respond accordingly
    #                 If the query asks for a response related to a specific timeline, refer to the provided upload date (which is around the device's release date) when responding, avoid mentioning old devices

    #                 When the query relates to the provided context, avoid giving irrelevant information. For other cases, use the Message_Response {message_type_response} dictionary to generate the response according to the guidelines; don’t provide the value directly
                    
    #                 More guidelines:
    #                 - You are MKBHD or Marques Brownlee
    #                 - Never mention that the context is being referenced; provide the response directly
    #                 - Address the query directly, avoid "feel free to ask" in responses
    #                 - Avoid greetings unless the query consists solely of greetings
    #                 - Understand the language and tone used by the content creator in the provided context and ensure your response matches it
    #             '''
    
    model = load_model()
    response = model.invoke(prompt) # send the prompt to the model and get its response
    return response.content

In [None]:
#access embeddings and use them
from langchain.embeddings import HuggingFaceEmbeddings
def load_faiss_index(index_path: str, model) -> FAISS:
    """
    Load the FAISS vector store saved at `index_path`.

    `model` is passed as the model name to HuggingFaceEmbeddings, which is
    configured to run on the CPU. This definition replaces any earlier
    `load_faiss_index` defined in the notebook.
    """
    cpu_embeddings = HuggingFaceEmbeddings(model_name=model, model_kwargs={'device': 'cpu'})
    return FAISS.load_local(
        index_path,
        embeddings=cpu_embeddings,
        allow_dangerous_deserialization=True,
    )

def load_faiss_vectordb(db, emb_model):
    """Load the vector store at `db` by delegating to load_faiss_index with `emb_model`."""
    return load_faiss_index(db, emb_model)

# Load the index saved in 'vector_store'. The query-embedding model must be the one that
# produced the stored vectors (all-MiniLM-L6-v2); a different model such as
# BAAI/bge-large-en-v1.5 yields vectors of another dimension and the search fails.
faiss_index = load_faiss_vectordb('vector_store', 'sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Single-response flow: read one query, retrieve context, generate the answer
query = input("Please enter your query: ")
print(query)

# Retrieve the 5 closest chunks and let the model answer from them
docs = similarity_search(faiss_index, query, k=5)
result = get_retrieval_results(docs, query)
print("Results: ", result)

# Get query from user - multi-response
# while True:
#     query = input("Please enter your query: ")
#     print("User: ", query)
#     docs = similarity_search(faiss_index, query, 5)
#     result = get_retrieval_results(context=docs, point=query) # generates result from retrieved doc
#     print("MKBHD: ", result)
    
#     if query.lower() == "thank you mkbhd":
#         print("You're welcome! Goodbye!")
#         break

    
    



In [None]:
docs #displays the documents retrieved for the last query

In [None]:
# Convert the generated answer (`result`) to speech and save it to output.wav
filename = "output.wav"
text_to_speech(result, filename)

## Notes

- EXTEND TO ADD MORE EMBEDDINGS WITHOUT REGENERATING EVERYTHING EVERY TIME
- LEARN FAISS INTERNAL MECHANISM
- PROMPT ENGINEERING FOR QUALITY RESPONSES AND TO USE OPENAI WITH FEWER TOKENS (IMP)
- CREATING MORE TEST QUERIES FOR ACCURACY TESTING (IMP)
- CREATE CHAT INTERFACE OR SYSTEM TO USE USER RESPONSE AS CONTEXT FOR NEW RESPONSE

