In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import re
from datetime import datetime

In [39]:
def parse_time(time_str):
    return datetime.strptime(time_str, '%H:%M:%S,%f').time()

def read_txt_subtitle(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
    pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+?)(?=\n\d|\Z)', re.DOTALL)
    matches = pattern.findall(content)
    
    subtitles = ""
    for match in matches:
        subtitle_data = {
            "index": int(match[0]),
            "start": parse_time(match[1]),
            "end": parse_time(match[2]),
            "text": match[3].strip().replace('\n', ' ')
        }
        subtitles += f"Index: {subtitle_data['index']}\n"
        subtitles += f"Start: {subtitle_data['start']}\n"
        subtitles += f"End: {subtitle_data['end']}\n"
        subtitles += f"Text: {subtitle_data['text']}\n\n"
    
    return subtitles

# Example usage
file_path = 'test.txt'
subtitles_string = read_txt_subtitle(file_path)

# Print the subtitles string to verify
print(subtitles_string)

Index: 1
Start: 00:00:00
End: 00:00:07.319000
Text: getting open welcome everybody

Index: 2
Start: 00:00:03.890000
End: 00:00:09.599000
Text: so we'll start with the apologies and on

Index: 3
Start: 00:00:07.319000
End: 00:00:11.190000
Text: guess we have hazel everyone else seems

Index: 4
Start: 00:00:09.599000
End: 00:00:14.370000
Text: to be here I think so could I little

Index: 5
Start: 00:00:11.190000
End: 00:00:17.400000
Text: mover please Andrew just just before you

Index: 6
Start: 00:00:14.370000
End: 00:00:19.529000
Text: did just before you do and I had to

Index: 7
Start: 00:00:17.400000
End: 00:00:21.330000
Text: leave the meeting at 1:30 to attend a

Index: 8
Start: 00:00:19.529000
End: 00:00:24.119000
Text: future-proof meeting so sorry about that

Index: 9
Start: 00:00:21.330000
End: 00:00:24.830000
Text: so my apologies from mean good as cold

Index: 10
Start: 00:00:24.119000
End: 00:00:27.269000
Text: thanks Jim

Index: 11
Start: 00:00:24.830000
End: 00:00:30.8400

In [63]:
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks

In [65]:
test = get_text_chunks(subtitles_string)
test

["Index: 1\nStart: 00:00:00\nEnd: 00:00:07.319000\nText: getting open welcome everybody\n\nIndex: 2\nStart: 00:00:03.890000\nEnd: 00:00:09.599000\nText: so we'll start with the apologies and on\n\nIndex: 3\nStart: 00:00:07.319000\nEnd: 00:00:11.190000\nText: guess we have hazel everyone else seems\n\nIndex: 4\nStart: 00:00:09.599000\nEnd: 00:00:14.370000\nText: to be here I think so could I little\n\nIndex: 5\nStart: 00:00:11.190000\nEnd: 00:00:17.400000\nText: mover please Andrew just just before you\n\nIndex: 6\nStart: 00:00:14.370000\nEnd: 00:00:19.529000\nText: did just before you do and I had to\n\nIndex: 7\nStart: 00:00:17.400000\nEnd: 00:00:21.330000\nText: leave the meeting at 1:30 to attend a\n\nIndex: 8\nStart: 00:00:19.529000\nEnd: 00:00:24.119000\nText: future-proof meeting so sorry about that\n\nIndex: 9\nStart: 00:00:21.330000\nEnd: 00:00:24.830000\nText: so my apologies from mean good as cold\n\nIndex: 10\nStart: 00:00:24.119000\nEnd: 00:00:27.269000\nText: thanks Jim\n\

In [66]:
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
vector_store = FAISS.from_texts(test, embedding=embeddings)
vector_store.save_local("faiss_text_to_note")

In [76]:
prompt_template = """
You are a highly intelligent and efficient meeting assistant. Your task is to transform detailed meeting transcripts into the text that the user is looking for.
Answer the question as detailed as possible from the provided context, make sure to provide all the details, do not include any form of human name or other name
Context:\n {context}?\n
Question: \n{question}\n

Answer:
"""

model = ChatGoogleGenerativeAI(model="gemini-1.5-flash",
                            temperature=0.3)

prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])
chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

In [78]:
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
user_question = "What is discussed on 50 to 60 minutes?"

new_db = FAISS.load_local("faiss_text_to_note", embeddings, allow_dangerous_deserialization=True)
docs = new_db.similarity_search(user_question)



response = chain.invoke(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True
    )

print(response)

{'output_text': "The discussion between 50 to 60 minutes revolves around the financial performance of the company. The speaker discusses the current financial standing, including income, operating expenditure, assets, and capital expenditure. They also mention the impact of COVID-19 on the company's financial forecasts and the steps taken to address these challenges. \n"}
