In [7]:
import json
import os
os.environ["OPENAI_API_KEY"] = 'Open-AI API KEY'

data_path = "train1.json"
with open(data_path, 'r') as f:
    data = json.load(f)

In [None]:
import pytorch

In [9]:
data['topic_list']

2

In [11]:
data['general_query_list']

[{'query': 'Summarize the whole meeting.',
  'answer': 'This was the kick-off meeting for the project. First of all, Project Manager led each group member to know each other and introduced the project which was aiming to design remote control. Next, they discussed their favourite animal characteristics. Lastly, Project Manager mentioned how they worked on each part individually.'}]

In [12]:
data['specific_query_list']

[{'query': "Summarize the groupmates' self-introduction and the project introduction.",
  'answer': 'There were four people in the project team and each one introduced to each other on the team role. Project Manager introduced the project was about designing a remote control. After that, Project Manager explained the work division for each person and how they would present in the coming meetings.',
  'relevant_text_span': [['0', '20']]},
 {'query': 'Summarize the job role for each groupmate.',
  'answer': 'The group was greeting each other at the first meeting. Laura was the Project Manager. David was Industrial Designer and Andrew was Marketing expert. And User Interface was named Craig.',
  'relevant_text_span': [['0', '9']]},
 {'query': 'What did the group discuss about the email they received on the project announcement?',
  'answer': 'Group mates all should have received an email introducing what was this project about and there would be three different stages to the design. The p

In [16]:
data['meeting_transcripts'][:5]

[{'speaker': 'Project Manager',
  'content': "Okay Right {vocalsound} Um well this is the kick-off meeting for our our project . Um {vocalsound} and um this is just what we're gonna be doing over the next twenty five minutes . Um so first of all , just to kind of make sure that we all know each other ,"},
 {'speaker': 'Marketing', 'content': 'Mm-hmm .'},
 {'speaker': 'Project Manager',
  'content': "I'm Laura and I'm the project manager . {vocalsound} Do you want to introduce yourself again ?"},
 {'speaker': 'Marketing', 'content': 'Great .'},
 {'speaker': 'Industrial Designer',
  'content': "Hi , I'm David and I'm supposed to be an industrial designer ."}]

In [2]:
def clean_data(text):
    text = text.replace('{vocalsound}', '')
    text = text.replace('{disfmarker} ', '')
    text = text.replace('a_m_i_', 'ami')
    text = text.replace('l_c_d_', 'lcd')
    text = text.replace('p_m_s', 'pms')
    text = text.replace('t_v_', 'tv')
    text = text.replace('{pause} ', '')
    text = text.replace('{nonvocalsound} ', '')
    text = text.replace('{gap} ', '')
    return text

def extract_text_from_json(json_data):
    data = []
    for turn, meeting in enumerate(json_data['meeting_transcripts']):
        if meeting:
            data.append({
                'turn': turn,
                'speaker': meeting['speaker'],
                'content': clean_data(meeting['content']),
            })
    return data

In [29]:
temp = extract_text_from_json(data)
with open("data.jsonl", "w") as f:
    for example in temp:
        f.write(f"{json.dumps(example)}\n")

In [30]:
import datasets
import sentence_transformers

def get_embeddings(batch, model):
    embeddings = model.encode(batch["text"])
    return {"embeddings": embeddings}

model = sentence_transformers.SentenceTransformer(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
)

dataset = datasets.load_dataset("json", data_files="data.jsonl", split="train")
dataset = dataset.map(get_embeddings, batched=True, batch_size=32, fn_kwargs={"model": model})
dataset = dataset.with_format(
    type="numpy", columns=["embeddings"], output_all_columns=True,
)
dataset.add_faiss_index("embeddings")
dataset.save_faiss_index("embeddings", "index.faiss")

ModuleNotFoundError: No module named 'datasets'

In [3]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.document_loaders import JSONLoader

In [4]:
loader = JSONLoader(
    file_path='data.json',
    jq_schema='.[].content',
)
document = loader.load()
print(f'documents:{len(document)}')

documents:287


In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 0
)
# remove empty documents
split_documents = text_splitter.split_documents(document)
print(f'documents:{len(split_documents)}')

documents:257


In [6]:
split_documents[:5]

[Document(page_content="Okay Right  Um well this is the kick-off meeting for our our project . Um  and um this is just what we're gonna be doing over the next twenty five minutes . Um so first of all , just to kind of make sure that we all know each other ,", metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 1}),
 Document(page_content='Mm-hmm .', metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 2}),
 Document(page_content="I'm Laura and I'm the project manager .  Do you want to introduce yourself again ?", metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 3}),
 Document(page_content='Great .', metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 4}),
 Document(page_content="Hi , I'm David and I'm supposed to be an industrial designer .", metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.j

In [8]:
llm = OpenAI(model_name="text-davinci-003", max_tokens=1500)

chain = load_summarize_chain(llm, chain_type="refine", verbose=True)

chain.run(split_documents)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"Okay Right  Um well this is the kick-off meeting for our our project . Um  and um this is just what we're gonna be doing over the next twenty five minutes . Um so first of all , just to kind of make sure that we all know each other ,"


CONCISE SUMMARY:[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYour job is to produce a final summary
We have provided an existing summary up to a certain point:  This is a meeting to introduce the project and make sure everyone is on the same page. It will last 25 minutes and will involve introducing everyone involved.
We have the opportunity to refine the existing summary(only if needed) with some more context below.
------------
Mm-hmm .
------------
Given the new context, refine the original summary
If the 

"\n\nThe team is committed to creating a remote control that is original, trendy, user friendly, affordable, durable, and has features that go beyond the television, with a unique selling point that will make it stand out from the competition. To ensure the success of the project, the team will continue to assess customer feedback, trends, limited resources, usability issues, pricing strategies, production cost, and ways to measure success to create a product that is both aesthetically pleasing and technologically advanced. The team discussed the need for a reliable source of power and the potential need for additional features, with the main factor of customer satisfaction in mind. They also concluded the meeting by tasking the industrial designer to work on the remote control's design before the next meeting in 30 minutes. The team is committed to creating a product that is both aesthetically pleasing and technologically advanced, with features that go beyond the television, a unique

In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI,VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA


In [20]:
embeddings = OpenAIEmbeddings()
#embedding and save locally
# docsearch = Chroma.from_documents(split_documents, embeddings, persist_directory='vector_store')
# docsearch.persist()

#load embedding
docsearch = Chroma(persist_directory="vector_store", embedding_function=embeddings)

In [15]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None)

In [19]:
import datasets
import sentence_transformers

def get_embeddings(batch, model):
    embeddings = model.encode(batch["text"])
    return {"embeddings": embeddings}

model = sentence_transformers.SentenceTransformer(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
)

dataset = datasets.load_dataset("json", data_files="data.jsonl", split="train")
dataset = dataset.map(get_embeddings, batched=True, batch_size=32, fn_kwargs={"model": model})
dataset = dataset.with_format(
    type="numpy", columns=["embeddings"], output_all_columns=True,
)
dataset.add_faiss_index("embeddings")
dataset.save_faiss_index("embeddings", "index.faiss")

ModuleNotFoundError: No module named 'sentence_transformers'

In [18]:
def get_embeddings(batch, model):
    embeddings = model.encode(batch["text"])
    return {"embeddings": embeddings}

dataset = datasets.load_dataset("json", data_files="data.jsonl", split="train")
dataset = dataset.map(get_embeddings, batched=True, batch_size=32, fn_kwargs={"model": embeddings})
dataset = dataset.with_format(
    type="numpy", columns=["embeddings"], output_all_columns=True,
)
dataset.add_faiss_index("embeddings")
dataset.save_faiss_index("embeddings", "index.faiss")

Found cached dataset json (/afs/inf.ed.ac.uk/user/s17/s1756255/.cache/huggingface/datasets/json/default-8c5965c0289d06f7/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
                                                   

AttributeError: 'OpenAIEmbeddings' object has no attribute 'encode'

In [21]:
query = "what is the project name?"
docs = docsearch.similarity_search(query, include_metadata=True)

In [22]:
docs

[Document(page_content='Um , I just got the project announcement about what the project is  .', metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 11}),
 Document(page_content="I'm Laura and I'm the project manager .  Do you want to introduce yourself again ?", metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 3}),
 Document(page_content="Okay Right  Um well this is the kick-off meeting for our our project . Um  and um this is just what we're gonna be doing over the next twenty five minutes . Um so first of all , just to kind of make sure that we all know each other ,", metadata={'source': '/afs/inf.ed.ac.uk/user/s17/s1756255/Attributed-QA/data.json', 'seq_num': 1}),
 Document(page_content="Okay .  Um what are we doing next ? Uh um . Okay , uh we now need to discuss the project finance . Um so according to the brief um we're gonna be selling this remote control for twenty five Euro , um and we're aim

In [None]:
llm = OpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
chain.run(input_documents=docs, question=query)

In [15]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

t5base = "google/flan-t5-base"

In [16]:
tokenizer = T5Tokenizer.from_pretrained(t5base, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(t5base)

In [12]:
import pandas as pd
df = pd.read_csv('data/doc2dial/TEST/new_test.csv')

In [22]:
from dataloader import doc2dialDataset
from torch.utils.data import DataLoader
import json
import ast

In [17]:
df.iloc[0]

question                                    Can I get new plates?
answer          Yes, bring a copy of MV-78B to apply for new p...
model_answer         Report or Replace Lost or Stolen Plates [1].
ref                        [{'sp_id': '44', 'label': 'solution'}]
retrived_doc    How can I report stolen plates? \nAsk the poli...
doc_id                          Stolen and recovered vehicles#3_0
dial_id                          774a40ce2a5edd3b402199f4fc5d52ca
Name: 0, dtype: object

In [37]:
idx = 3
questions = df.iloc[idx]['question']
ans = df.iloc[idx]['answer']
model_ans = df.iloc[idx]['model_answer']
doc_data = json.load(open('data/doc2dial/doc2dial_doc.json', 'r'))

refs_ID = [term['sp_id'] for term in ast.literal_eval(df.iloc[idx]['ref'])]

doc_file_span = doc_data['doc_data']['dmv'][df.iloc[idx]['doc_id']]['spans']
ll = [doc_file_span[i] for i in refs_ID]

true_ref_string = [term['text_sp'] for term in ll]
concatenated_string = ''.join(string for sublist in true_ref_string for string in sublist)
true_ref_string

['You must pay the driver responsibility assessment whether you have a driver license issued by New York State, another jurisdiction or if you do not have a driver license. ']

In [38]:
from evaluation import infer_autoais_batch

In [39]:
questions

"What if I don't have a driver's license?"

In [40]:
infer_autoais_batch([questions], [ans], true_ref_string, tokenizer, model)

['yes']