In [1]:
import pandas as pd
import ast
from embedding import embedding
from dataloader import OpenQADataset
from torch.utils.data import DataLoader
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the OpenQA table; later cells read its 'passage' and 'paragraphs' columns.
df = pd.read_csv('data/openqa/data.csv')

In [7]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,question,answer,passage,human_rating,paragraphs
0,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,Y,"[""Jason Iain Flemyng (born 25 September 1966) ..."
1,who signed the largest on the declaration of i...,John Hancock,Title: United States Declaration of Independen...,Y,"['The Declaration of Independence, headed The ..."
2,when was the last time the carolina hurricanes...,2009,Title: 2009 Stanley Cup playoffs\nSection: Con...,Y,['The 2009 Stanley Cup playoffs of the Nationa...
3,where was 2017 beauty and the beast filmed,England,Title: Beauty and the Beast (2017 film)\n\nA l...,Y,"[""Beauty and the Beast is a 2017 American musi..."
4,when does the next warrior book come out,2018,Title: Adonal Foyle\n\nAdonal David Foyle (bor...,N,"[""Adonal David Foyle (born March 9, 1975) is a..."


In [31]:
def get_title(passage):
    """Return the first line of ``passage`` (the text before the first newline).

    In this dataset that line looks like ``'Title: <name>'``; the ``'Title: '``
    prefix is kept as-is. A passage without any newline is returned whole.
    """
    first_line, _, _ = passage.partition('\n')
    return first_line

# Build one FAISS index per distinct passage title and save it under
# data/faiss_index/, recording in `dic` which row indices share each title.

# Rows with idx <= RESUME_AFTER_IDX are skipped. The value 688 was hard-coded in
# the loop before -- presumably to resume an interrupted run (TODO confirm).
# Set to -1 to process every row.
RESUME_AFTER_IDX = 688

# Reuse `dic` (title -> list of row indices) if an earlier run left it in the
# kernel; otherwise start empty so the cell also works after a kernel restart
# (previously the only definition was commented out, giving a NameError).
try:
    dic
except NameError:
    dic = {}

# Loop-invariant: load the embedding model once instead of once per row.
model = 'sentence-transformers/gtr-t5-base'
embeddings = HuggingFaceEmbeddings(model_name=model)

for idx, row in df.iterrows():
    if idx <= RESUME_AFTER_IDX:
        continue

    title = get_title(row['passage'])

    if title in dic:
        # An index for this title was already handled; just remember the row.
        dic[title].append(idx)
        print('same title {} -> {}'.format(title, idx))
        continue

    dic[title] = [idx]

    # 'paragraphs' holds a string-encoded Python list; decode it first.
    paragraph_list = ast.literal_eval(row['paragraphs'])
    docs = [
        Document(page_content=paragraph, metadata={'p_idx': i})
        for i, paragraph in enumerate(paragraph_list)
    ]

    # build index
    db = FAISS.from_documents(docs, embeddings)

    # Saving stays best-effort (the title is used verbatim as the file name),
    # but only ordinary errors are caught so Ctrl-C still interrupts the loop,
    # and the reason for the failure is reported.
    try:
        db.save_local("data/faiss_index", index_name=title)
    except Exception as exc:
        print('failed to save {} {}: {!r}'.format(title, idx, exc))



same title Title: Quantitative research -> 689
same title Title: Transit of Venus -> 690
same title Title: England at the Cricket World Cup -> 691
same title Title: Ek Anek Aur Ekta -> 692
same title Title: Sistine Chapel -> 693
same title Title: United States House of Representatives -> 694
same title Title: Berlin Blockade -> 695
same title Title: Drink -> 696
same title Title: Vietnam Chamber of Commerce and Industry -> 697
same title Title: Stephen Elop -> 698
same title Title: History of the Houston Astros -> 699
same title Title: Josefina López -> 700
same title Title: Culture of Jharkhand -> 701
same title Title: Four Big Pollution Diseases of Japan -> 702
same title Title: José Martí -> 703
same title Title: 2007 New England Patriots–New York Giants game -> 704
same title Title: Summer of '42 -> 705
same title Title: Phosphocreatine -> 706
same title Title: Triangle center -> 707
same title Title: User State Migration Tool -> 708
same title Title: Figure skating at the 1998 Win

In [32]:
# Number of distinct passage titles recorded as keys of `dic`.
len(dic)

985

In [13]:
# Inspect row 324; its passage title is the same as row 426's (shown in the next cell).
df.loc[324]

question                when was the first election held in india
answer                                                    1951–52
passage         Title: 1951–52 elections in India\n\nThis arti...
human_rating                                                    Y
paragraphs      ['Independent India held its first elections i...
Name: 324, dtype: object

In [9]:
# Inspect row 426: a different question whose passage carries the same title as row 324.
df.loc[426]

question        when were the first general elections held in ...
answer                                                       1951
passage         Title: 1951–52 elections in India\nSection: Ge...
human_rating                                                    Y
paragraphs      ['Independent India held its first elections i...
Name: 426, dtype: object

In [2]:
# Same CSV file that was read into `df`, now wrapped by the project's OpenQADataset.
path = 'data/openqa/data.csv'
dataset = OpenQADataset(path)

In [3]:
# Batches of two items, no shuffling, so iteration follows dataset order.
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)
# Grab only the first batch to experiment with below.
batch = next(iter(dataloader))

In [9]:
# Take the passage strings from the batch and show the title line of each one.
passages = batch['passage']
list(map(get_title, passages))

['Title: Jason Flemyng', 'Title: United States Declaration of Independence']

In [10]:
# Remaining batch fields used by the cells below (indexed per example).
questions = batch['question']
paragraphs = batch['paragraphs']

In [26]:
# Build and save the FAISS index for one example of the batch.
idx = 1

paras = paragraphs[idx]
question = questions[idx]

# The batch entry is a string-encoded Python list of paragraphs; decode it.
paragraph_list = ast.literal_eval(paras)

title = get_title(passages[idx])
print(title)

# One Document per paragraph, tagged with its position in the list.
docs = [
    Document(page_content=text, metadata={'p_idx': position})
    for position, text in enumerate(paragraph_list)
]

# build index or read index
model = 'sentence-transformers/gtr-t5-base'
embeddings = HuggingFaceEmbeddings(model_name=model)
db = FAISS.from_documents(docs, embeddings)
db.save_local("data/faiss_index", index_name=title)

Title: United States Declaration of Independence


[Document(page_content="John Trumbull's painting Declaration of Independence has played a significant role in popular conceptions of the Declaration of Independence. The painting is 12-by-18-foot (3.7 by 5.5\xa0m) in size and was commissioned by the United States Congress in 1817; it has hung in the United States Capitol Rotunda since 1826. It is sometimes described as the signing of the Declaration of Independence, but it actually shows the Committee of Five presenting their draft of the Declaration to the Second Continental Congress on June 28, 1776, and not the signing of the document, which took place later.\n", metadata={'p_idx': 109}),
 Document(page_content='The first and most famous signature on the engrossed copy was that of John Hancock, President of the Continental Congress. Two future presidents (Thomas Jefferson and John Adams) and a father and great-grandfather of two other presidents (Benjamin Harrison V) were among the signatories. Edward Rutledge (age 26) was the young

In [32]:
# Query the in-memory index with the example's question and display the returned Documents.
db.similarity_search(question)

[Document(page_content="John Trumbull's painting Declaration of Independence has played a significant role in popular conceptions of the Declaration of Independence. The painting is 12-by-18-foot (3.7 by 5.5\xa0m) in size and was commissioned by the United States Congress in 1817; it has hung in the United States Capitol Rotunda since 1826. It is sometimes described as the signing of the Declaration of Independence, but it actually shows the Committee of Five presenting their draft of the Declaration to the Second Continental Congress on June 28, 1776, and not the signing of the document, which took place later.\n", metadata={'p_idx': 109}),
 Document(page_content='The first and most famous signature on the engrossed copy was that of John Hancock, President of the Continental Congress. Two future presidents (Thomas Jefferson and John Adams) and a father and great-grandfather of two other presidents (Benjamin Harrison V) were among the signatories. Edward Rutledge (age 26) was the young

In [16]:
# Title line of the second passage in the batch; it is used as the index name on disk.
title = get_title(passages[1])
title

'Title: United States Declaration of Independence'

In [27]:
# Round-trip check: reload the index saved under this title and run the same query.
question = questions[1]
title = get_title(passages[1])
# Uses the same embedding model name as the cell that built the index.
model='sentence-transformers/gtr-t5-base'
embeddings = HuggingFaceEmbeddings(model_name = model)
# `index_name` must be the exact title string that was passed to save_local.
new_db = FAISS.load_local("data/faiss_index", embeddings= embeddings, index_name=title)
new_db.similarity_search(question)

[Document(page_content="John Trumbull's painting Declaration of Independence has played a significant role in popular conceptions of the Declaration of Independence. The painting is 12-by-18-foot (3.7 by 5.5\xa0m) in size and was commissioned by the United States Congress in 1817; it has hung in the United States Capitol Rotunda since 1826. It is sometimes described as the signing of the Declaration of Independence, but it actually shows the Committee of Five presenting their draft of the Declaration to the Second Continental Congress on June 28, 1776, and not the signing of the document, which took place later.\n", metadata={'p_idx': 109}),
 Document(page_content='The first and most famous signature on the engrossed copy was that of John Hancock, President of the Continental Congress. Two future presidents (Thomas Jefferson and John Adams) and a father and great-grandfather of two other presidents (Benjamin Harrison V) were among the signatories. Edward Rutledge (age 26) was the young