In [38]:
from core.dataloaders.focus.focus_dataloader import FoCusDatasetV1

dataset = FoCusDatasetV1(
    input_dataset_path="./datasets/FoCus/train_focus.json"
)

valid_dataset = FoCusDatasetV1(
    input_dataset_path="./datasets/FoCus/valid_focus.json",
)

In [2]:
from core.utils import TfIdf, FoCusTfIdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pprint import pprint
from nltk import word_tokenize


  from .autonotebook import tqdm as notebook_tqdm


## standart sklearn TF-IDF vectorizer

In [3]:
def top_similar(corpus, X, query, top_k=1):
    similarity = cosine_similarity(X, query)
    similarity = similarity.flatten()
    similarity = np.argsort(similarity)[::-1][:top_k]
    similarity = similarity.tolist()

    similar_samples = [corpus[i] for i in similarity]
    return similar_samples

In [22]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in dataset:
    query = item["dialog"][-2]
    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(dataset)}")
print(f"Top 3: {correct_top_3 / len(dataset)}")
print(f"Top 5: {correct_top_5 / len(dataset)}")


Top 1: 0.4497383836660411
Top 3: 0.6737758061764204
Top 5: 0.7954416197463459


# remove stop words

In [23]:
word_tokenize("Test sentence")

['Test', 'sentence']

In [28]:
import spacy
nlp = spacy.load('en_core_web_sm')

all_stopwords = nlp.Defaults.stop_words
def clean_sentence(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    # words = [word for word in words if not word in all_stopwords]
    sentence = " ".join(words)
    return sentence



text = "Nick likes to play football, however he is not too fond of tennis."
clean_sentence(text)

'Nick likes to play football however he is not too fond of tennis'

In [30]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in dataset:
    query = item["dialog"][-2]
    query = clean_sentence(query)

    corpus = item["knowledge_candidates"]
    corpus = [clean_sentence(sent) for sent in corpus]
    
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    # top_10 = top_similar(corpus, X, query_vector, top_k=10)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(true_candidate)
    # pprint(corpus)
    # pprint(true_candidate in top_10)
    # print(top_10)
    # break

print(f"Top 1: {correct_top_1 / len(dataset)}")
print(f"Top 3: {correct_top_3 / len(dataset)}")
print(f"Top 5: {correct_top_5 / len(dataset)}")

Top 1: 0.4494255815276119
Top 3: 0.6742450093840642
Top 5: 0.7950435079337997


## query with persona

In [36]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(persona)
    # print(query)
    # break

print(f"Top 1: {correct_top_1 / len(dataset)}")
print(f"Top 3: {correct_top_3 / len(dataset)}")
print(f"Top 5: {correct_top_5 / len(dataset)}")


Top 1: 0.6005374509469373
Top 3: 0.8148922254450321
Top 5: 0.9023346414150031


In [48]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(persona)
    # print(query)
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.6079092037595318
Top 3: 0.8251463025359106
Top 5: 0.9109771236034758


In [51]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query
    query = clean_sentence(query)

    corpus = item["knowledge_candidates"]
    corpus = [clean_sentence(sent) for sent in corpus]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(persona)
    # print(query)
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.6123426139386416
Top 3: 0.8233729384642667
Top 5: 0.9092037595318319


In [52]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query
    query = clean_sentence(query)

    corpus = item["knowledge_candidates"]
    corpus = [clean_sentence(sent) for sent in corpus]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    top_1 = top_similar(corpus, X, query_vector, top_k=1)
    top_3 = top_similar(corpus, X, query_vector, top_k=3)
    top_5 = top_similar(corpus, X, query_vector, top_k=5)
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(persona)
    # print(query)
    # break

print(f"Top 1: {correct_top_1 / len(dataset)}")
print(f"Top 3: {correct_top_3 / len(dataset)}")
print(f"Top 5: {correct_top_5 / len(dataset)}")

Top 1: 0.5998976283910595
Top 3: 0.8158732866973781
Top 5: 0.901936529602457


# TF-IDF with transformer tokenizer

In [1]:
from transformers import BartTokenizer
from core.utils import FoCusTfIdf

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
original_texts = [
    "The fort was built in response to the growth of overland emigration to Oregon after 1845. The fort was built in response to the growth of overland emigration to Oregon after 1845. Livestock could be traded for fresh stock and letters sent back to the States.",
    "The Truman Galusha House, also called the Truman Galusha Mansion and \"Fairview\" in various historical documents and maps, is a Federal-style house in Jericho, Vermont, United States. It was listed on the National Register of Historic Places in 1978 as the Galusha House, qualifying for designation based on its \"architectural excellence\" and the association of its early owners with important early governors and other key leaders involved with the creation of the state of Vermont. It was built in 1790, and is named for the son of an early Vermont governor, Jonas Galusha.",
    "The Machzike Hadath Synagogue moved to Golders Green in the 1970s, opening its present building in 1983. A significant moment in Temple Fortune's development into a suburban area occurred in 1907, when transport links were vastly improved by the opening of Golders Green Underground station. [citation needed] Golders Green is home to a growing Japanese and East Asian community with many families living in the district being catered for a notable number of restaurants and shops specialising in Japanese and other East Asian food, such as the Seoul Plaza supermarket.",
    "An alternative interpretation of the three-bodied goddess in Gangadharamurti panel here and elsewhere is that it represents the regenerative powers of rivers in the form of Mandakini, Suradhani and Bhagavati. The relief is much ruined below the waist, is 3.5\u00a0m (11\u00a0ft) high and posed in action. On the east side of the main hall is a separate shrine.",
    "With the market for expensive luxury cars severely undercut by the Great Depression, Duesenberg folded in 1937. It was so reputed and imposing that many Hollywood stars, such as James Cagney, posed next to the car to promote their careers. It was also both the fastest and most expensive American automobile in the market.",
]
query = "Wow, this is amazing! What is this?"
corpus = tokenizer.batch_encode_plus(
    original_texts, 
    padding=False, 
    truncation=True,
    add_special_tokens=False,
)

query = tokenizer.batch_encode_plus(
    [query], 
    padding=False, 
    truncation=True,
    add_special_tokens=False,
)

tf_idf = FoCusTfIdf(corpus=corpus['input_ids'])

tf_idf.top_similar(
    query=query['input_ids'], 
    top_k=1,
    return_indices=True,
)

  from .autonotebook import tqdm as notebook_tqdm


[3]

In [3]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query
    encoded_query = tokenizer.batch_encode_plus(
        [query],
        padding=False, 
        truncation=False,
        add_special_tokens=False,
    )

    corpus = item["knowledge_candidates"]
    encoded_corpus = tokenizer.batch_encode_plus(
        corpus,
        padding=False, 
        truncation=False,
        add_special_tokens=False,
        # max_length=100,
    )
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    # print(true_candidate)
    tf_idf = FoCusTfIdf(corpus=encoded_corpus['input_ids'])

    top_1 = tf_idf.top_similar(query=encoded_query['input_ids'], top_k=1,return_indices=True)
    # print(top_1)
    top_1 = [corpus[i] for i in top_1]
    top_3 = tf_idf.top_similar(query=encoded_query['input_ids'], top_k=3,return_indices=True)
    top_3 = [corpus[i] for i in top_3]
    top_5 = tf_idf.top_similar(query=encoded_query['input_ids'], top_k=5,return_indices=True)
    top_5 = [corpus[i] for i in top_5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # print(persona)
    # print(query)
    break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1072 > 1024). Running this sequence through the model will result in indexing errors


Top 1: 0.5114381982621032
Top 3: 0.5114381982621032
Top 5: 0.5114381982621032


### use sentence transformers

In [3]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two lists of sentences
sentences1 = [
    'The new movie is awesome',
    'A man is playing guitar',
    'The cat sits outside',
]

sentences2 = ['The dog plays in the garden',]

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)
cosine_scores
# #Output the pairs with their score
# for i in range(len(sentences1)):
#     print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

tensor([[0.0543],
        [0.2277],
        [0.2838]], device='cuda:0')

In [17]:
cosine_scores.topk(2, dim=0)

torch.return_types.topk(
values=tensor([[0.2838],
        [0.2277]], device='cuda:0'),
indices=tensor([[2],
        [1]], device='cuda:0'))

In [None]:
def top_similar_sentences(X, query, top_k=1):
    

In [25]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    query_emb = model.encode([query], convert_to_tensor=True)
    corpus_emb = model.encode(corpus, convert_to_tensor=True)

    cosine_scores = util.cos_sim(corpus_emb, query_emb)
    top_indices = cosine_scores.topk(5, dim=0).indices.flatten().tolist()
    # print(top_indices)
    top_sentences = [corpus[i] for i in top_indices]
    top_1 = top_sentences[:1]
    top_3 = top_sentences[:3]
    top_5 = top_sentences[:5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.9343855293491754
Top 3: 0.9863450966483419
Top 5: 0.9964532718567122


In [27]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = persona + " " + query

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    query_emb = model.encode([query], convert_to_tensor=True)
    corpus_emb = model.encode(corpus, convert_to_tensor=True)

    cosine_scores = util.cos_sim(corpus_emb, query_emb)
    top_indices = cosine_scores.topk(5, dim=0).indices.flatten().tolist()
    # print(top_indices)
    top_sentences = [corpus[i] for i in top_indices]
    top_1 = top_sentences[:1]
    top_3 = top_sentences[:3]
    top_5 = top_sentences[:5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(dataset)}")
print(f"Top 3: {correct_top_3 / len(dataset)}")
print(f"Top 5: {correct_top_5 / len(dataset)}")

Top 1: 0.9348518455326167
Top 3: 0.9883978843200819
Top 5: 0.9964454302451231


In [28]:
correct_top_1 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    query_emb = model.encode([query], convert_to_tensor=True)
    corpus_emb = model.encode(corpus, convert_to_tensor=True)

    cosine_scores = util.cos_sim(corpus_emb, query_emb)
    top_indices = cosine_scores.topk(5, dim=0).indices.flatten().tolist()
    # print(top_indices)
    top_sentences = [corpus[i] for i in top_indices]
    top_1 = top_sentences[:1]
    top_3 = top_sentences[:3]
    top_5 = top_sentences[:5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.6586274162085476
Top 3: 0.8315304131938287
Top 5: 0.9207306259975173


In [34]:
correct_top_1 = 0
correct_top_2 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = query + " " + persona

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    query_emb = model.encode([query], convert_to_tensor=True)
    corpus_emb = model.encode(corpus, convert_to_tensor=True)

    cosine_scores = util.cos_sim(corpus_emb, query_emb)
    top_indices = cosine_scores.topk(5, dim=0).indices.flatten().tolist()
    # print(top_indices)
    top_sentences = [corpus[i] for i in top_indices]
    top_1 = top_sentences[:1]
    top_2 = top_sentences[:2]
    top_3 = top_sentences[:3]
    top_5 = top_sentences[:5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_2:
        correct_top_2 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 2: {correct_top_2 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.9347402021635042
Top 2: 0.9746408937754921
Top 3: 0.9863450966483419
Top 5: 0.9962759354495478


In [31]:
best_model = SentenceTransformer('all-mpnet-base-v2')

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 385kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 50.9kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 2.84MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 170kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 35.0kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 263kB/s] 
Downloading: 100%|██████████| 438M/438M [01:26<00:00, 5.08MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 16.7kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 76.3kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 805kB/s] 
Downloading: 100%|██████████| 363/363 [00:00<00:00, 94.3kB/s]
Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 3.66MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 400kB/s]  
Downloading: 100%|██████████| 349/349 [00:00<00:00, 115kB/s]


In [33]:
correct_top_1 = 0
correct_top_2 = 0
correct_top_3 = 0
correct_top_5 = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona = " ".join(persona)
    query = query + " " + persona

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]

    query_emb = best_model.encode([query], convert_to_tensor=True)
    corpus_emb = best_model.encode(corpus, convert_to_tensor=True)

    cosine_scores = util.cos_sim(corpus_emb, query_emb)
    top_indices = cosine_scores.topk(5, dim=0).indices.flatten().tolist()
    # print(top_indices)
    top_sentences = [corpus[i] for i in top_indices]
    top_1 = top_sentences[:1]
    top_3 = top_sentences[:3]
    top_2 = top_sentences[:2]
    top_5 = top_sentences[:5]
    
    if true_candidate in top_1:
        correct_top_1 += 1
    if true_candidate in top_2:
        correct_top_2 += 1
    if true_candidate in top_3:
        correct_top_3 += 1
    if true_candidate in top_5:
        correct_top_5 += 1
    # break

print(f"Top 1: {correct_top_1 / len(valid_dataset)}")
print(f"Top 2: {correct_top_2 / len(valid_dataset)}")
print(f"Top 3: {correct_top_3 / len(valid_dataset)}")
print(f"Top 5: {correct_top_5 / len(valid_dataset)}")

Top 1: 0.9413016492285866
Top 2: 0.9785422947331087
Top 3: 0.9907785068274517
Top 5: 0.9975172902996985


## persona classification

In [5]:
# valid_dataset[0]

In [37]:
import torch
persona_correct = 0

for item in valid_dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona_grounding = item["persona_grounding"]
    true_persona = [pers for pers, ground in zip(persona, persona_grounding) if ground == 1]
    # persona = " ".join(persona)
    # query = query + " " + persona

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    query = query + " " + true_candidate

    # query_emb = model.encode([query], convert_to_tensor=True)
    # corpus_emb = model.encode(persona, convert_to_tensor=True)

    # cosine_scores = util.cos_sim(corpus_emb, query_emb)

    # print( == [0, 0, 0, 1, 0])
    # print(persona_grounding)
    # predicts = (cosine_scores > 0.9).flatten().int()
    predicts = torch.tensor([0, 0, 0, 0, 0]).to(predicts.device)
    persona_grounding = torch.tensor(persona_grounding).to(predicts.device)
    local_acc = (predicts == persona_grounding).sum().item() / 5
    # if predicts == persona_grounding:
    #     persona_correct += 1
    persona_correct += local_acc
    
    # break

print(f"Persona accuracy: {persona_correct / len(valid_dataset)}")

Persona accuracy: 0.8669976946267617


In [39]:
import torch
persona_correct = 0

for item in dataset:
    query = item["dialog"][-2]
    persona = item["persona"]
    persona_grounding = item["persona_grounding"]
    true_persona = [pers for pers, ground in zip(persona, persona_grounding) if ground == 1]

    corpus = item["knowledge_candidates"]
    candidate_index = item["knowledge_answer_index"]
    true_candidate = corpus[candidate_index]
    query = query + " " + true_candidate
    predicts = torch.tensor([0, 0, 0, 0, 0]).to(predicts.device)
    persona_grounding = torch.tensor(persona_grounding).to(predicts.device)
    local_acc = (predicts == persona_grounding).sum().item() / 5
    persona_correct += local_acc

print(f"Persona accuracy: {persona_correct / len(dataset)}")

Persona accuracy: 0.8654581129506077


### explore knowledge dataleak

In [41]:
valid_dataset[0]

{'persona': ['I would like to visit the Nazareth House again.',
  'I love Benevolent institutions.',
  'I am interested in History.',
  'I have curiosity about the Description of this place.',
  'I would like to know when it was Built.'],
 'knowledge_candidates': ['Nazareth House is a heritage-listed benevolent institution at 272 Wynnum North Road, Wynnum, City of Brisbane, Queensland, Australia.',
  'However, in many cases, a hearing is not held.',
  'The church and school buildings are listed together as a Cleveland Designated Landmark.',
  "Until the reorganisation of London's local government in 1965, Muswell Hill formed part of the Borough of Hornsey within the administrative county of Middlesex.",
  'This operation enabled the Canadian Sulpicians to expand their primary work, the education of priests.',
  "Bosworth's design was heavily Greek-influenced: though the facade is made of white Vermont granite, it features layers of gray granite columns in Doric and Ionic styles, as wel

In [53]:
knowledge_correct = 0

for item in valid_dataset:
    knowledge_candidates = item["knowledge_candidates"]
    knowledge_index = item["knowledge_answer_index"]
    knowledge = item["knowledge"]
    predict_index = 0
    stop = False

    for i, candidate in enumerate(knowledge_candidates):
        for knowledge_item in knowledge:
            if candidate in knowledge_item:
                predict_index = i
                stop = True
                break
        if stop:
            break

    if predict_index == knowledge_index:
        knowledge_correct += 1
  
    # break
print("Knowledge accuracy: ", knowledge_correct / len(valid_dataset))

Knowledge accuracy:  0.7226458591948927
