In [None]:
from src.utils import utils
import json
import os 
import pandas as pd 

In [7]:
questions_list = [
  "What are the main objectives and focus areas of the meeting?",
  "Who are the participants, and what are their roles?",
  "What are the latest updates on assigned tasks and significant developments?",
  "What previous action items are still pending, and what challenges were faced?",
  "What major topics need to be discussed, and what insights have been shared?",
  "What challenges are currently being faced, and what solutions have been proposed?",
  "What key decisions need to be made, and what are their implications?",
  "What are the next steps, assigned tasks, and deadlines?",
  "What follow-up actions and plans are required for the next meeting?"
]
len(questions_list)

9

# Create collection

In [32]:
from src.utils.embedding import VectorDB
file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
vectordb = VectorDB(model_config)
collection_id = 'LangChain_agd_7bb158e64c93bea491df09894psd'
collection_id, text_key = vectordb.create_collection(collection_id)
print(collection_id)


Weaviate Client connect with ANONYMOUS
------VectorDB client: <weaviate.client.WeaviateClient object at 0x7fe2b810dcc0>
LangChain_agd_7bb158e64c93bea491df09894psd


# Create Data for EMBEDDING

In [8]:
root = "/datadrive/CuongHV/project/DATA/AMI_MS"
files = os.listdir(root)
data = []
for file in files:
    print(f"process {file}")
    # load json file
    path = f'{root}/{file}'
    with open(path, encoding="utf8") as f:
        jsondict = json.load(f)
    # Transcripts
    transcripts_cleaned = utils.clean_text(jsondict['transcript'])

    # Summary
    summary_cleaned = utils.clean_text(jsondict['summary'])

    # Shared doc
        # Extract text from txt, ppt, and doc
    try:
        txt = []
        for x in jsondict['shared-doc']['txt']:
            filename = x['filename']
            content = x['content']
            content_clean = utils.clean_text(content)
            txt_item = {
                'filename': filename,
                'content': content_clean,
            }
            txt.append(txt_item)
    except Exception as e:
        print(f"Error processing txt: {e}")
        txt = []
    try:
        ppt = []
        for x in jsondict['shared-doc']['ppt']:
            filename = x['filename']
            string = ['\n'.join(v) for k,v in x['content'].items()]
            content = '\n'.join(string)
            content_clean = utils.clean_text(content)
            ppt_item = {
                'filename': filename,
                'content': content_clean,
            }
            ppt.append(ppt_item)
    except Exception as e:
        print(f"Error processing ppt: {e}")
        ppt_content = []
    try:
        doc = []
        for x in jsondict['shared-doc']['doc']:
            filename = x['filename']
            content = x['content']
            content_clean = utils.clean_text(content)
            doc_item = {
                'filename': filename,
                'content': content_clean,
            }
            doc.append(doc_item)
    except Exception as e:
        print(f"Error processing doc: {e}")
        doc_content = []

    # Save as cleaned json
    cleaned_jsondict = {
        "transcript": transcripts_cleaned,
        "summary": summary_cleaned,
        "actions": jsondict['actions'],
        "decisions": jsondict['decisions'],
        "problems": jsondict['problems'],
        "shared-doc": {
            "txt": txt,
            "ppt": ppt,
            "doc": doc
        },
    }
    with open(f"/datadrive/CuongHV/project/DATA/AMI_MS_RAG_Cleaned/{file}", "w", encoding="utf8") as f:
        json.dump(cleaned_jsondict, f, ensure_ascii=False, indent=4)
    print(f'{file} done')


process ES2004d.json
ES2004d.json done
process ES2008b.json
ES2008b.json done
process IS1000b.json
Error processing txt: 'NoneType' object is not iterable
Error processing ppt: 'NoneType' object is not iterable
Error processing doc: 'NoneType' object is not iterable
IS1000b.json done
process IS1004d.json
IS1004d.json done
process IS1008a.json
IS1008a.json done
process ES2015b.json
Error processing txt: 'NoneType' object is not iterable
Error processing ppt: 'NoneType' object is not iterable
Error processing doc: 'NoneType' object is not iterable
ES2015b.json done
process IS1004a.json
IS1004a.json done
process TS3011b.json
Error processing txt: 'NoneType' object is not iterable
Error processing ppt: 'NoneType' object is not iterable
Error processing doc: 'NoneType' object is not iterable
TS3011b.json done
process ES2015d.json
Error processing txt: 'NoneType' object is not iterable
Error processing ppt: 'NoneType' object is not iterable
Error processing doc: 'NoneType' object is not iter

# EMBEDDING

In [None]:
from src.utils.embedding import VectorDB
file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
vectordb = VectorDB(model_config)


In [17]:
file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
vectordb = VectorDB(model_config)

Weaviate Client connect with ANONYMOUS
------VectorDB client: <weaviate.client.WeaviateClient object at 0x7fe3213ee740>


In [11]:
df_shared_docs = utils.load_data_with_shared_doc_path()
df_shared_docs['file'].values[:4]

array(['ES2004d.json', 'ES2008b.json', 'IS1004d.json', 'IS1008a.json'],
      dtype=object)

In [15]:
root = '/datadrive/CuongHV/project/DATA/AMI_MS_RAG_Cleaned'
sample_jsondict = utils.extract_data_from_file(file='ES2004d.json', root=root)
sample_jsondict['shared-doc']

{'txt': [{'filename': 'ES2004docs.ES2004scen.Summaries.P1Sum1.txt',
   'content': 'Received: from [139.222.1.2] by messenger\n(ArGoSoft Mail Server Freeware, Version 1.8 (1.8.6.4)); Wed, 19 Jan 2005 11:48:31 +0000\nMessage-ID: <001b01c4fe1c$f97d62e0$0201de8b@participant1>\nFrom: "participant1" <participant1@ami>\nTo: <HeadOfDept@ami>\nReferences: <01c4fe19$Blat.v2.2.2$6084975a@messenger>\nSubject: Re: Please summarise the last meeting\nDate: Wed, 19 Jan 2005 11:49:45 -0000\nMIME-Version: 1.0\nContent-Type: multipart/alternative;\n\tboundary="----=_NextPart_000_0018_01C4FE1C.F88D84A0"\nX-Priority: 3\nX-MSMail-Priority: Normal\nX-Mailer: Microsoft Outlook Express 6.00.2900.2180\nX-MimeOLE: Produced By Microsoft MimeOLE V6.00.2900.2180\nThis is a multi-part message in MIME format.\n------=_NextPart_000_0018_01C4FE1C.F88D84A0\nContent-Type: text/plain;\n\tcharset="iso-8859-1"\nContent-Transfer-Encoding: quoted-printable\nFor the documentation of the design project, you and all of your =\nc

In [25]:
# Embedding and create a tracking file 
df_shared_docs = utils.load_data_with_shared_doc_path()
root = '/datadrive/CuongHV/project/DATA/AMI_MS_RAG_Cleaned'
data = []
for file in df_shared_docs['file'].values:
    print(file)
    collection_id, tenant_id = vectordb.create_tenant()
    data.append({
        'file': file,
        'tenant_id' : tenant_id,
    })
    sample_jsondict = utils.extract_data_from_file(file=file, root=root)
    # Embedding shared-docs
    for type, list_doc in sample_jsondict['shared-doc'].items():
        for x in list_doc:
            filename = x['filename']
            content = x['content']
            index, tenant_name, text_key, documents = vectordb.import_data_to_db(
                meta_data={'filename':filename}, 
                page_content=content,
                index_name=collection_id,
                tenant_name=tenant_id,
            )
    

ES2004d.json
ES2008b.json
IS1004d.json
IS1008a.json
IS1004a.json
IS1005b.json
ES2008c.json
ES2008d.json
IS1006d.json
IS1006c.json
IS1005a.json
ES2006d.json
ES2004b.json
ES2009d.json
IS1005c.json
ES2005a.json
ES2006b.json
IS1001c.json
ES2003a.json
IS1001d.json
IS1001a.json
ES2002b.json
ES2004c.json
ES2002a.json
ES2003c.json
ES2008a.json
ES2006c.json
ES2007d.json
IS1009d.json
IS1009b.json
ES2005d.json
IS1006b.json
ES2009b.json
ES2010d.json
ES2002d.json
IS1001b.json
IS1003b.json
ES2007b.json
IS1004c.json
IS1003d.json
ES2002c.json
IS1008b.json
ES2005b.json
ES2009a.json
IS1003a.json
ES2010b.json
IS1009a.json
ES2007c.json
IS1007b.json
IS1006a.json
ES2007a.json
IS1008d.json
ES2010c.json
IS1007c.json
ES2005c.json
IS1008c.json
IS1007d.json
IS1007a.json
ES2006a.json
ES2009c.json
ES2004a.json
ES2003b.json
ES2010a.json
IS1009c.json
IS1003c.json
ES2003d.json
IS1004b.json


In [28]:
data

[{'file': 'ES2004d.json',
  'tenant_id': 'deploy_mind_81d33437db5d446ab045abac31d33c3b'},
 {'file': 'ES2008b.json',
  'tenant_id': 'deploy_mind_3ea93cd23e3842cc917060e649e8985b'},
 {'file': 'IS1004d.json',
  'tenant_id': 'deploy_mind_09280add1a10419c8919f5dd44f370d3'},
 {'file': 'IS1008a.json',
  'tenant_id': 'deploy_mind_8d5bd42fd4d54281ae71dba89cf81719'},
 {'file': 'IS1004a.json',
  'tenant_id': 'deploy_mind_a77940965a4c4faaae895ee479b43a5c'},
 {'file': 'IS1005b.json',
  'tenant_id': 'deploy_mind_7d96a11ac37c475386d6f92376405757'},
 {'file': 'ES2008c.json',
  'tenant_id': 'deploy_mind_5acd076cdfa1423eaa7c12d10b2bb57e'},
 {'file': 'ES2008d.json',
  'tenant_id': 'deploy_mind_252aa7441aae44cf82627066c3dcea4c'},
 {'file': 'IS1006d.json',
  'tenant_id': 'deploy_mind_017d7960aac74e198b57424574b202c2'},
 {'file': 'IS1006c.json',
  'tenant_id': 'deploy_mind_6fe34b555a0a4c4b9a265bd1611b7c65'},
 {'file': 'IS1005a.json',
  'tenant_id': 'deploy_mind_101b06c616e9460f8f709501d79bcc64'},
 {'file': 

In [None]:
columns = ['file', 'tenant_id']
df =pd.DataFrame(data)
df.to_csv("rag_monitoring/tenant_id.csv", mode="w", header=True, index=False,columns=columns)

# Generate Answer 

In [25]:
# Read tenant_id 
import pandas as pd 
path = '/datadrive/CuongHV/project/mm_agenda_generation_research/rag_monitoring/tenant_id.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,file,tenant_id
0,ES2004d.json,deploy_mind_81d33437db5d446ab045abac31d33c3b
1,ES2008b.json,deploy_mind_3ea93cd23e3842cc917060e649e8985b
2,IS1004d.json,deploy_mind_09280add1a10419c8919f5dd44f370d3
3,IS1008a.json,deploy_mind_8d5bd42fd4d54281ae71dba89cf81719
4,IS1004a.json,deploy_mind_a77940965a4c4faaae895ee479b43a5c


In [26]:
id = data['tenant_id'][0]
id

'deploy_mind_81d33437db5d446ab045abac31d33c3b'

In [27]:
from dotenv import load_dotenv
load_dotenv("/datadrive/CuongHV/project/mm_agenda_generation_research/.env")


from src.utils import utils
from langchain_core.tracers.context import tracing_v2_enabled
from src.config.llm_config import llm_config

file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
prompt_config = utils.load_config(file_config["llm_env"]["prompting_file"])
config_llm = llm_config(prompt_config, model_config)


Loading config from file_config


In [28]:
import os 
config = config_llm.chose_llm_model(use_redis=False)
# config = config_llm.chose_llm_embedding()
config = config_llm.chose_llm_embedding(
    llm_model = 'OpenAI',
    model = 'text-embedding-3-small'
)
INDEX = os.environ.get("COLLECTION_ID")
config = config_llm.config_db(index=INDEX, text_key='text', tenant_name=id)
config

{'knowledge_configure': {'knowledge_driver': src.driver.weaviatedb.WeaviateDB},
 'history_store': {'history_driver': src.driver.redisdb.RedisDB},
 'condense_question_configure': {'llm_core': langchain_openai.chat_models.base.ChatOpenAI,
  'llm_core_params': {'temperature': 0, 'model': 'gpt-4o-mini'},
  'prompt_core_template': 'Given a chat history and the latest user question which might reference context in the chat history, \nformulate a standalone question which can be understood without the chat history. \nDo NOT answer the question, just reformulate it if needed and otherwise return it as is.\n',
  'index_db': 'LangChain_agd_7bb158e64c93bea491df09894psd',
  'text_key': 'text',
  'tenant_name': 'deploy_mind_81d33437db5d446ab045abac31d33c3b'},
 'combine_docs_configure': {'llm_core': langchain_openai.chat_models.base.ChatOpenAI,
  'llm_core_params': {'temperature': 0, 'model': 'gpt-4o-mini'},
  'prompt_core_template': "You are an insightful assistant, providing tailored responses bas

In [29]:
from langchain_weaviate.vectorstores import WeaviateVectorStore as WeaviateLC
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv(".env")
import weaviate
client = weaviate.connect_to_custom(
    http_host= os.environ.get("WEAVIATE_HOST"), #"10.100.224.34",  # URL only, no http prefix
    http_port= os.environ.get("WEAVIATE_HOST_PORT"), #"8080",
    http_secure=False ,   # Set to True if https
    grpc_host= os.environ.get("WEAVIATE_GPC_URL"),#  "10.100.224.34",
    grpc_port= os.environ.get("WEAVIATE_GPC_URL_PORT"),#"50051",      # Default is 50051, WCD uses 443
    grpc_secure=False,   # Edit as needed
    skip_init_checks=True,
)
tenant_name = id
text_key = 'text'
k =2
llm_embedding = config['embedding_configure']['embedding']
vectorstore = WeaviateLC(
                client=client,
                index_name=INDEX,
                text_key='text',
                embedding=llm_embedding,
                use_multi_tenancy=True,
            )


            Please make sure to close the connection using `client.close()`.


In [30]:
query =  "What are the main objectives and focus areas of the meeting?"
retrivee_docs = vectorstore.similarity_search(query, tenant=id, k=5)
retrivee_docs

[Document(metadata={'filename': 'ES2004docs.ES2004scen.participant3hBcQuestionnairePre.doc'}, page_content='16. Do you feel that the objectives for your meetings are generally attained?\nNever\nHardly ever\nSometimes\nMost of the times*\nAlways\n\n17. Do you feel that the time for your meetings is generally well-spent?\nNever\nHardly ever\nSometimes\nMost of the times*\nAlways\n\n18. Do you generally like to participate in your meetings?\nNever\nHardly ever\nSometimes\nMost of the times*\nAlways\n\n19. Which of the following do you make use of before a meeting (to prepare for the meeting)?\nMinutes of the previous meeting(s)\nRelated documents\nAgenda*\nPersonal recollection\nContact other participants*\nPersonal notes of the previous meeting(s)*\nMeans to prepare a presentation*\nPictures of previous meeting(s)\nAudio recording of previous meeting(s)\nVideo recording of previous meeting(s)\nConsult external information sources (e.g. internet)*\nContact external people (face-to-face, e

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate
)
from langchain_weaviate.vectorstores import WeaviateVectorStore as WeaviateLC

# Combine docs
llm_core = config['combine_docs_configure']['llm_core']
llm_core_params = config['combine_docs_configure']['llm_core_params']
prompt_core_template = config['combine_docs_configure']['prompt_core_template']
llm = llm_core(**llm_core_params)
knowledge_driver = config['knowledge_configure']['knowledge_driver']

qa_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", prompt_core_template),
                    ("human", "{input}"),
                ]
            )
customDocumentPrompt_template= """
Context: |
    {page_content}
"""
customDocumentPrompt = PromptTemplate(
                    input_variables=['page_content', 'source'],
                    template=customDocumentPrompt_template,
                )
question_answer_chain = create_stuff_documents_chain(
    llm=llm, 
    prompt = qa_prompt,
    document_prompt = customDocumentPrompt,
    document_variable_name = "context",
)
        


In [32]:
question= "What are the main objectives and focus areas of the meeting?"
with tracing_v2_enabled(project_name = "generated_rag_multi_input_agenda"):
    output = question_answer_chain.invoke({
        'input': question,
        'context': retrivee_docs,
    })
print(output)

The main objectives and focus areas of the meeting can be summarized as follows:

1. **Design Decisions**: A significant portion of the meeting was dedicated to discussing both internal and external design elements of a project. This includes decisions regarding the physical design aspects, such as materials and features of a product (e.g., a phone).

2. **Target Audience Identification**: The meeting involved identifying and discussing the target audiences that the project aims to appeal to, which is crucial for tailoring marketing strategies and design features.

3. **Marketing Aims**: There was a focus on external design issues that relate to marketing objectives, ensuring that the product aligns with market needs and consumer preferences.

4. **Task Allocation and Fulfillment**: The meetings also served as a platform for brainstorming, task allocation, and ensuring that team members are clear on their responsibilities related to the project.

5. **Problem Solving**: Addressing any 

The main objectives and focus areas of the meeting included:

1. **Introductions**: Participants shared their favorite animals through a drawing exercise.
2. **Project Overview**: Discussion of major project areas such as Industrial Design, User-Interaction Design, and Marketing.
3. **Group Discussion**: Focus on user-friendly design issues, particularly exploring mobile phone designs for adaptation in TV remote controls.
4. **Decision Making**: Reaching consensus on specifications and design elements for the Remote Control device.

# Generate the answers for 67 meetings

In [1]:
from dotenv import load_dotenv
load_dotenv("/datadrive/CuongHV/project/mm_agenda_generation_research/.env")


from src.utils import utils
from langchain_core.tracers.context import tracing_v2_enabled
from src.config.llm_config import llm_config
import os 

file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
prompt_config = utils.load_config(file_config["llm_env"]["prompting_file"])
config_llm = llm_config(prompt_config, model_config)
config = config_llm.chose_llm_model(use_redis=False)
# config = config_llm.chose_llm_embedding()
config = config_llm.chose_llm_embedding(
    llm_model = 'OpenAI',
    model = 'text-embedding-3-small'
)
INDEX = os.environ.get("COLLECTION_ID")
config = config_llm.config_db(index=INDEX, text_key='text', tenant_name=id)


Loading config from file_config


In [2]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate
)
from langchain_weaviate.vectorstores import WeaviateVectorStore as WeaviateLC

# Combine docs
llm_core = config['combine_docs_configure']['llm_core']
llm_core_params = config['combine_docs_configure']['llm_core_params']
prompt_core_template = config['combine_docs_configure']['prompt_core_template']
llm = llm_core(**llm_core_params)
llm_embedding = config['embedding_configure']['embedding']

qa_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", prompt_core_template),
                    ("human", "{input}"),
                ]
            )
customDocumentPrompt_template= """
Context: |
    {page_content}
"""
customDocumentPrompt = PromptTemplate(
                    input_variables=['page_content', 'filename'], #Filename use for the production, retrieve the evident for user
                    template=customDocumentPrompt_template,
                )
question_answer_chain = create_stuff_documents_chain(
    llm=llm, 
    prompt = qa_prompt,
    document_prompt = customDocumentPrompt,
    document_variable_name = "context",
)

In [3]:
import weaviate
client = weaviate.connect_to_custom(
    http_host= os.environ.get("WEAVIATE_HOST"), #"10.100.224.34",  # URL only, no http prefix
    http_port= os.environ.get("WEAVIATE_HOST_PORT"), #"8080",
    http_secure=False ,   # Set to True if https
    grpc_host= os.environ.get("WEAVIATE_GPC_URL"),#  "10.100.224.34",
    grpc_port= os.environ.get("WEAVIATE_GPC_URL_PORT"),#"50051",      # Default is 50051, WCD uses 443
    grpc_secure=False,   # Edit as needed
    skip_init_checks=True,
)
vectorstore = WeaviateLC(
                client=client,
                index_name=INDEX,
                text_key='text',
                embedding=llm_embedding,
                use_multi_tenancy=True,
            )
vectorstore

<langchain_weaviate.vectorstores.WeaviateVectorStore at 0x7efcd02c5c30>

In [4]:
def answer_by_rag(
        question: str,
        tenant_id: str, 
        vectorstore: WeaviateLC,
        k: int,
):
        retrivee_docs = vectorstore.similarity_search(query=question, tenant=tenant_id, k=k)
        with tracing_v2_enabled(project_name = "generated_rag_multi_input_agenda"):
                output = question_answer_chain.invoke({
                        'input': question,
                        'context': retrivee_docs,
                })
        return output

In [None]:
# load file and tenant_id 
 # Read tenant_id 
import pandas as pd 
import json 
path = '/datadrive/CuongHV/project/mm_agenda_generation_research/rag_monitoring/tenant_id.csv'
data = pd.read_csv(path)
# fixed queries:
questions_list = [
  "What are the main objectives and focus areas of the meeting?",
  "What are the latest updates on assigned tasks and significant developments?",
  "What previous action items are still pending, and what challenges were faced?",
  "What major topics need to be discussed, and what insights have been shared?",
  "What challenges are currently being faced, and what solutions have been proposed?",
  "What key decisions need to be made, and what are their implications?",
  "What are the next steps, assigned tasks, and deadlines?",
  "What follow-up actions and plans are required for the next meeting?"
]
# Create the answer:
for index, row in data.iterrows():
    file = row['file']
    print(file)
    tenant_id = row['tenant_id']
    qa = {}
    for query in questions_list:
        answer = answer_by_rag(question=query, tenant_id=tenant_id, vectorstore= vectorstore, k=10)
        qa[query] = answer 
    data = {
        'file': file,
        'qa': qa,
    }
    with open(f'/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/qa_by_rag/{file}', 'a', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Finish {file}")

ES2004d.json
Finish ES2004d.json
ES2008b.json
Finish ES2008b.json
IS1004d.json
Finish IS1004d.json
IS1008a.json
Finish IS1008a.json
IS1004a.json
Finish IS1004a.json
IS1005b.json
Finish IS1005b.json
ES2008c.json
Finish ES2008c.json
ES2008d.json
Finish ES2008d.json
IS1006d.json
Finish IS1006d.json
IS1006c.json
Finish IS1006c.json
IS1005a.json
Finish IS1005a.json
ES2006d.json
Finish ES2006d.json
ES2004b.json
Finish ES2004b.json
ES2009d.json
Finish ES2009d.json
IS1005c.json
Finish IS1005c.json
ES2005a.json
Finish ES2005a.json
ES2006b.json
Finish ES2006b.json
IS1001c.json
Finish IS1001c.json
ES2003a.json
Finish ES2003a.json
IS1001d.json
Finish IS1001d.json
IS1001a.json
Finish IS1001a.json
ES2002b.json
Finish ES2002b.json
ES2004c.json
Finish ES2004c.json
ES2002a.json
Finish ES2002a.json
ES2003c.json
Finish ES2003c.json
ES2008a.json
Finish ES2008a.json
ES2006c.json
Finish ES2006c.json
ES2007d.json
Finish ES2007d.json
IS1009d.json
Finish IS1009d.json
IS1009b.json
Finish IS1009b.json
ES2005d.js

# Review the result 

In [11]:
import os 
root = '/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/qa_by_rag'
files = os.listdir(root)
len(files)

67

In [9]:
import os 
root = '/datadrive/CuongHV/project/DATA/AMI_MS_RAG_Cleaned'
files = os.listdir(root)
len(files)

125

In [4]:
import json 
file_path = f'{root}/{files[0]}'
with open(file_path, 'r') as f:
    data = json.load(f)

data

{'file': 'ES2004d.json',
 'qa': {'What are the main objectives and focus areas of the meeting?': "The main objectives and focus areas of the meeting can be summarized as follows:\n\n1. **Design Decisions**: The meeting primarily aimed to finalize the design elements of a project, both internal and external. This included discussions on specific components such as the physical design of a product, which in one context involved decisions about features like the kinetic battery combo, LCD touch screen, and anti-RSI rubberized buttons.\n\n2. **Target Audience Identification**: Another key focus was identifying and discussing the target audiences that the project would appeal to. This aspect is crucial for tailoring the design and marketing strategies effectively.\n\n3. **Marketing Aims**: The meeting also addressed marketing objectives, ensuring that the design aligns with the intended market strategy and audience engagement.\n\n4. **Problem-Solving**: The meeting provided a platform to ad

In [5]:
len(data['qa'])

8

In [7]:
checked_data = []
for file in files:
    with open(f'{root}/{file}', 'r') as f:
        data = json.load(f)
    if len(data['qa']) == 8:
        checked_data.append(file)

In [8]:
len(checked_data)

67

In [14]:
file_path_0 = f'{root}/{files[0]}'
with open(f'{root}/{file}', 'r') as f:
    data = json.load(f)
print(data['qa']['What are the main objectives and focus areas of the meeting?'])

The main objectives and focus areas of the meeting can be summarized as follows:

1. **Product Development**: The meeting aimed to refine the specifications of the product, ensuring that it aligns with the initial requirements and expectations. This includes discussing the features and design elements that would appeal to the target market.

2. **Target Market Identification**: A significant focus was on identifying and agreeing upon the target market for the product. This is crucial as it influences the product's features, design, and marketing strategy.

3. **Evaluation of Product Appeal**: The meeting addressed the need for the final product to be visually appealing and marketable. Discussions highlighted the importance of aesthetics alongside functionality to ensure the product's success in the market.

4. **Budget Considerations**: Budget constraints were a recurring theme, with discussions on how to maintain cost-effectiveness while achieving the desired product specifications an

The main objectives and focus areas of the meeting can be summarized as follows:

1. **Product Development**: The meeting aimed to refine the specifications of the product, ensuring that it aligns with the initial requirements and expectations. This includes discussing the features and design elements that would appeal to the target market.

2. **Target Market Identification**: A significant focus was on identifying and agreeing upon the target market for the product. This is crucial as it influences the product's features, design, and marketing strategy.

3. **Evaluation of Product Appeal**: The meeting addressed the need for the final product to be visually appealing and marketable. Discussions highlighted the importance of aesthetics alongside functionality to ensure the product's success in the market.

4. **Budget Considerations**: Budget constraints were a recurring theme, with discussions on how to maintain cost-effectiveness while achieving the desired product specifications and features.

5. **Next Steps and Action Items**: The meeting concluded with outlining the next steps for each team member, focusing on gathering ideas and refining product characteristics based on the discussions held.

Overall, the meeting was centered on aligning the team’s vision for the product, addressing potential issues, and planning actionable steps to move forward effectively.

In [15]:
file_path_1 = f'{root}/{files[1]}'
with open(f'{root}/{file}', 'r') as f:
    data = json.load(f)
print(data['qa']['What are the main objectives and focus areas of the meeting?'])

The main objectives and focus areas of the meeting can be summarized as follows:

1. **Product Development**: The meeting aimed to refine the specifications of the product, ensuring that it aligns with the initial requirements and expectations. This includes discussing the features and design elements that would appeal to the target market.

2. **Target Market Identification**: A significant focus was on identifying and agreeing upon the target market for the product. This is crucial as it influences the product's features, design, and marketing strategy.

3. **Evaluation of Product Appeal**: The meeting addressed the need for the final product to be visually appealing and marketable. Discussions highlighted the importance of aesthetics alongside functionality to ensure the product's success in the market.

4. **Budget Considerations**: Budget constraints were a recurring theme, with discussions on how to maintain cost-effectiveness while achieving the desired product specifications an

The main objectives and focus areas of the meeting can be summarized as follows:

1. **Product Development**: The meeting aimed to refine the specifications of the product, ensuring that it aligns with the initial requirements and expectations. This includes discussing the features and design elements that would appeal to the target market.

2. **Target Market Identification**: A significant focus was on identifying and agreeing upon the target market for the product. This is crucial as it influences the product's features, design, and marketing strategy.

3. **Evaluation of Product Appeal**: The meeting addressed the need for the final product to be visually appealing and marketable. Discussions highlighted the importance of aesthetics alongside functionality to ensure the product's success in the market.

4. **Budget Considerations**: Budget constraints were a recurring theme, with discussions on how to maintain cost-effectiveness while achieving the desired product specifications and features.

5. **Next Steps and Action Items**: The meeting concluded with outlining the next steps for each team member, focusing on gathering ideas and refining product characteristics based on the discussions held.

Overall, the meeting was centered on aligning the team’s vision for the product, addressing potential issues, and planning actionable steps to move forward effectively.

# Based on qa_by_rag, and template, create the agenda

In [1]:
# load qa_by_rag
import json
def load_qa_from_file(file, root:str = "/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/qa_by_rag"):
    with open(f'{root}/{file}', 'r') as f:
        data = json.load(f)
    lines = [f'Question:{q}\nAnswer:{a}' for q,a in data['qa'].items()]
    qa_text = '\n'.join(lines)
    return qa_text


In [2]:
qa_text = load_qa_from_file('ES2002a.json')
print(qa_text)

Question:What are the main objectives and focus areas of the meeting?
Answer:The main objectives and focus areas of the meeting can be summarized as follows:

1. **Finalization of Design Prototype**: The meeting aimed to finalize the design prototype of the product, ensuring that all team members were aligned on the final design elements.

2. **Review of Cost Constraints**: A significant focus was placed on reviewing the financial constraints that impacted the design, including discussions on which features could be retained or had to be removed due to budget limitations.

3. **Evaluation of Project Success**: The meeting included an evaluation of the overall success of the project, assessing whether the objectives set at the beginning were met and how the team performed throughout the project lifecycle.

4. **Clarification of Roles and Responsibilities**: There was a need to clarify the roles and responsibilities of team members, as some members expressed uncertainty about their speci

In [3]:
from src.utils.generate import Generation 
from src.utils import utils
file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
prompt_config = utils.load_config(file_config["llm_env"]["prompting_file"])
generate = Generation(prompt_config)

In [4]:
from src.utils.llm_models import get_llm_model
import os
from dotenv import load_dotenv
load_dotenv()
config = {
    'openai_api_key': os.getenv("OPENAI_API_KEY"),
    'llm_choice': 'OpenAI', # Ollama
    'model_choice': 'gpt-4o-mini', # gpt-4o
    'parameters': {
        'temperature': 0.2,
        'top_p': 0.95,
        'max_retries': 2,
    },
}
llm = get_llm_model(
    chatmodel=config['llm_choice'], 
    model_name=config['model_choice'], 
    param=config['parameters'], 
)

In [7]:
from langchain_core.tracers.context import tracing_v2_enabled

with tracing_v2_enabled(project_name="generate_rag_multi_input_agenda"):
    agenda = generate.generate_rag_multi_input_agenda(llm=llm, qa_text=qa_text)
agenda 

{'text': '**Meeting Agenda: Product Development Update**  \n\n1. **Introductions and Meeting Overview (5 minutes)**  \n  + **Overview of Meeting Purpose (2 minutes)**  \n    + Finalization of design prototype and alignment on key features.  \n    + Review of cost constraints and implications for design decisions.  \n  + **Team Introductions (3 minutes)**  \n    + Brief introductions of team members and their roles.  \n    + Overview of project objectives and current status.  \n\n2. **Project Updates (10 minutes)**  \n  + **Status Reports (5 minutes)**  \n    + Team members provide updates on their assigned tasks and significant developments.  \n    + Highlight any challenges faced and solutions proposed.  \n  + **Review of Previous Action Items (5 minutes)**  \n    + Discuss pending action items and clarify roles and responsibilities.  \n    + Address any outstanding issues from the last meeting.  \n\n3. **Discussion Topics (10 minutes)**  \n  + **Key Design Decisions (5 minutes)**  \n

In [6]:
print(agenda['text'])

**Meeting Agenda: Product Development Update**  

1. **Introductions and Meeting Purpose (5 minutes)**  
  + **Overview of Meeting Objectives (2 minutes)**  
    + Finalization of design prototype and alignment on design elements.  
    + Review of cost constraints and implications for product features.  
  + **Team Introductions (3 minutes)**  
    + Each member introduces themselves and their role in the project.  

2. **Project Updates (10 minutes)**  
  + **Status Reports on Assigned Tasks (5 minutes)**  
    + Team members provide updates on their assigned tasks and any significant developments.  
    + Highlight challenges faced and solutions proposed.  
  + **Review of Previous Action Items (5 minutes)**  
    + Discuss the status of pending action items and clarify roles and responsibilities.  
    + Address any challenges encountered since the last meeting.  

3. **Discussion Topics (10 minutes)**  
  + **Product Design and Functionality (5 minutes)**  
    + Discuss the final

# Create 'Rag' Agenda from 67 meetings

In [None]:
from src.utils.generate import Generation 
from src.utils import utils
from src.utils.llm_models import get_llm_model
import os
from dotenv import load_dotenv
load_dotenv()
file_config = utils.load_config("src/config/file_config.yml")
model_config = utils.load_config(file_config["llm_env"]["model_config_file"])
prompt_config = utils.load_config(file_config["llm_env"]["prompting_file"])
generate = Generation(prompt_config)
config = {
    'openai_api_key': os.getenv("OPENAI_API_KEY"),
    'llm_choice': 'OpenAI', # Ollama
    'model_choice': 'gpt-4o-mini', # gpt-4o
    'parameters': {
        'temperature': 0.2,
        'top_p': 0.95,
        'max_retries': 2,
    },
}
llm = get_llm_model(
    chatmodel=config['llm_choice'], 
    model_name=config['model_choice'], 
    param=config['parameters'], 
)

In [None]:
# load qa_by_rag
import json
def load_qa_from_file(file, root:str = "/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/qa_by_rag"):
    with open(f'{root}/{file}', 'r') as f:
        data = json.load(f)
    lines = [f'Question:{q}\nAnswer:{a}' for q,a in data['qa'].items()]
    qa_text = '\n'.join(lines)
    return qa_text


In [13]:
from langchain_core.tracers.context import tracing_v2_enabled
from tqdm import tqdm
root = "/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/qa_by_rag"
for file in tqdm(os.listdir(root)):
    qa_text = load_qa_from_file(file)
    with tracing_v2_enabled(project_name="generate_rag_multi_input_agenda"):
        agenda = generate.generate_rag_multi_input_agenda(llm=llm, qa_text=qa_text)
    data = {
        'qa_text': qa_text,
        'agenda': agenda['text'],
    }
    with open(f'/datadrive/CuongHV/project/DATA/mm_agenda_generation_research_output/generate_rag_multi_input_agenda/{file}', 'a', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


100%|██████████| 67/67 [12:08<00:00, 10.88s/it]
