In [55]:
from torch.utils.data import Dataset
from typing import List, Dict
import json
from transformers import AutoTokenizer, BartTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from itertools import chain

class TF_IDF:
	"""TF-IDF index over a corpus of already tokenized sentences (lists of token ids)."""

	def __init__(self, 
			corpus: List[List[int]],
		) -> None:
		"""
		Fit a TF-IDF model on `corpus` and keep it for similarity queries.

		Example usage:
			corpus = [
				[1, 2, 3, 4],
				[1, 2, 3],
				[1, 2],
			]

			tf_idf = TF_IDF(corpus)
			
			similar_sentences = tf_idf.top_similar([[1, 2, 3]], top_k=3)
			>>> similar_sentences
			[
				[1, 2, 3],
				[1, 2, 3, 4],
				[1, 2]
			]

		Args:
			corpus (List[List[int]]): tokenized corpus, one list of token ids per sentence
		"""
		self.vectorizer = TfidfVectorizer(
			# Every run of digits is a token. The default scikit-learn pattern only
			# keeps tokens of 2+ characters and would drop single-digit ids.
			token_pattern=r"(?u)\b\d+\b", 
		)
		new_corpus = self.__encode_sentences(corpus)

		# TF-IDF matrix with one row per corpus sentence.
		self.X = self.vectorizer.fit_transform(new_corpus)
		self.corpus = corpus
	
	def __encode_sentence(self, sentence: List[int]) -> str:
		"""Render a list of token ids as a space separated string of numbers."""
		return " ".join(map(str, sentence))

	def __encode_sentences(self, sentences: List[List[int]]) -> List[str]:
		"""Apply `__encode_sentence` to every sentence."""
		return [self.__encode_sentence(sentence) for sentence in sentences]
	
	def top_similar(self, 
			query: List[List[int]] = None,
			top_k: int = 1,
		) -> List[List[int]]:
		"""
		Return the `top_k` corpus sentences most similar to the query sentences.

		Each corpus sentence is scored by its highest cosine similarity to any of
		the query sentences, so a sentence is ranked high when it matches at least
		one of them. For a single query sentence this is the plain cosine ranking.

		Args:
			query (List[List[int]]): tokenized query sentences; `None` or an empty
				list yields an empty result
			top_k (int): maximum number of corpus sentences to return

		Returns:
			List[List[int]]: corpus sentences ordered from most to least similar
		"""
		if not query:
			return []

		encoded_query = self.__encode_sentences(query)
		query_matrix = self.vectorizer.transform(encoded_query)
		
		# Shape (n_corpus, n_query). Flattening this matrix would yield indices
		# beyond the corpus as soon as there is more than one query sentence,
		# so the query axis is reduced instead.
		similarity = cosine_similarity(self.X, query_matrix)
		scores = np.asarray(similarity).max(axis=1)

		ranking = np.argsort(scores)[::-1][:top_k].tolist()
		return [ self.corpus[i] for i in ranking ]


class FoCusDatasetSampleV1:
	"""
	Container for one FoCus sample.

	The fields are stored as given, without validation or copying; `__slots__`
	restricts instances to exactly these six attributes.
	"""
	__slots__ = (
		'persona',
		'knowledge_candidates',
		'persona_grounding',
		'dialog',
		'knowledge_answer_index',
		'knowledge',
	)

	def __init__(self, 
			persona: List[str],
			knowledge_candidates: List[str],
			persona_grounding: List[int],
			dialog: List[str],
			knowledge: List[str],
			knowledge_answer_index: int,
		) -> None:
		"""
		Args:
			persona (List[str]): persona sentences
			knowledge_candidates (List[str]): candidate knowledge sentences
			persona_grounding (List[int]): one integer flag per persona sentence
			dialog (List[str]): dialog utterances
			knowledge (List[str]): knowledge texts attached to the dialog
			knowledge_answer_index (int): index into `knowledge_candidates`
		"""
		self.persona, self.knowledge_candidates = persona, knowledge_candidates
		self.persona_grounding, self.dialog = persona_grounding, dialog
		self.knowledge_answer_index, self.knowledge = knowledge_answer_index, knowledge
	
	def get_dict(self) -> dict:
		"""Return the sample as a plain dict keyed by field name, in `__slots__` order."""
		return {
			field: getattr(self, field)
			for field in FoCusDatasetSampleV1.__slots__
		}

class FoCusDatasetV1:
	"""
	FoCus dataset loaded from a JSON file and flattened to one sample per utterance.

	Expected JSON layout (as read by this class): a top-level 'data' list whose
	entries carry 'persona', 'knowledge' and 'utterance'; every utterance carries
	'persona_grounding', 'knowledge_candidates', 'knowledge_answer_index' and one
	key containing the substring 'dialog'.

	Samples are stored in `self.dataset` as plain dicts (the output of
	`FoCusDatasetSampleV1.get_dict`). The object also supports `len()` and
	indexing so that it can be used directly as a sequence of samples.
	"""
	def __init__(self,
		input_dataset_path: str = None,
		) -> None:
		"""
		Args:
			input_dataset_path (str): path to the FoCus JSON file; required
		"""
		assert input_dataset_path is not None, 'input_dataset_path is None'

		self.input_dataset_path = input_dataset_path
		self.dataset: List[dict] = []

		self.__build_dataset()
	
	def __len__(self) -> int:
		"""Number of samples (one per utterance)."""
		return len(self.dataset)
	
	def __getitem__(self, index: int) -> dict:
		"""Return the sample dict stored at `index`."""
		return self.dataset[index]
	
	def __build_dataset(self) -> None:
		"""Read the JSON file and populate `self.dataset`."""
		initial_train_dataset = self.__read_dataset(self.input_dataset_path)
		self.dataset = self.__create_initial_dataset(initial_train_dataset)
	
	def __create_initial_dataset(self, initial_dataset: Dict) -> List[dict]:
		"""
		Flatten the raw JSON structure into a list of sample dicts.

		Persona and knowledge are shared by all utterances of one dialog set and
		are repeated in every sample built from it.
		"""
		dataset = []
		
		for dialog_set in initial_dataset['data']:
			persona = dialog_set['persona']
			knowledge = dialog_set['knowledge']
			
			for utterance in dialog_set['utterance']:
				# The dialog is stored under a key that merely contains 'dialog';
				# the first such key is used (IndexError if there is none).
				dialog_index_key = [item for item in utterance.keys() if 'dialog' in item][0]
				
				data_sample = FoCusDatasetSampleV1(
					persona=persona,
					knowledge_candidates=utterance['knowledge_candidates'],
					persona_grounding=list(map(int, utterance['persona_grounding'])),
					dialog=utterance[dialog_index_key],
					knowledge_answer_index=utterance['knowledge_answer_index'],
					knowledge=knowledge,
				)
				dataset.append(data_sample.get_dict())
		
		return dataset
	
	def __read_dataset(self, input_path: str) -> dict:
		"""Load the JSON document at `input_path`."""
		# JSON is UTF-8 by specification; do not depend on the platform default encoding.
		with open(input_path, 'r', encoding='utf-8') as f:
			return json.load(f)

class PytorchFoCusDatasetV1(Dataset):
	"""
	torch `Dataset` view over the samples of a `FoCusDatasetV1`.

	Indexing wraps the stored sample in a `BartFoCusDatasetSampleV1` together
	with the shared tokenizer and hyperparameters.
	"""
	def __init__(self, 
		dataset: 'FoCusDatasetV1',
		tokenizer: 'BartTokenizer' = None,
		hyperparameters: 'BartFoCusDatasetSampleHyperparameters' = None,
		) -> None:
		"""
		Args:
			dataset (FoCusDatasetV1): loaded dataset; a plain sequence of samples
				is accepted as well
			tokenizer (BartTokenizer): tokenizer shared by all samples; when omitted
				the 'facebook/bart-large' tokenizer is loaded
			hyperparameters (BartFoCusDatasetSampleHyperparameters): sample
				construction settings; defaults are used when omitted
		"""
		self.dataset = dataset
		# FoCusDatasetV1 keeps its samples in the `.dataset` list; go through that
		# list so that len() and indexing do not depend on the wrapper object.
		self._samples = getattr(dataset, 'dataset', dataset)

		if tokenizer is None:
			tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
		if hyperparameters is None:
			hyperparameters = BartFoCusDatasetSampleHyperparameters()
		self.bart_tokenizer = tokenizer
		self.bart_hyperparameters = hyperparameters
	
	def __len__(self) -> int:
		"""Number of samples."""
		return len(self._samples)
	
	def __getitem__(self, index: int) -> 'BartFoCusDatasetSampleV1':
		"""Return the sample at `index` wrapped in a `BartFoCusDatasetSampleV1`."""
		dataset_sample = self._samples[index]
		train_sample = BartFoCusDatasetSampleV1(
			focus_dataset_sample=dataset_sample,
			tokenizer=self.bart_tokenizer,
			hyperparameters=self.bart_hyperparameters,
		)
		return train_sample

In [None]:
# Illustrative sample in the dict layout returned by FoCusDatasetSampleV1.get_dict().
# The 'knowledge' key that get_dict() also emits is not included in this example.
# 'knowledge_answer_index' is an index into 'knowledge_candidates'.
{'persona': [
	'I would like to visit the Nazareth House again.',
	'I love Benevolent institutions.',
	'I am interested in History.',
	'I have curiosity about the Description of this place.',
	'I would like to know when it was Built.'
],
 'knowledge_candidates': [
	'Nazareth House is a heritage-listed benevolent institution at 272 Wynnum North Road, Wynnum, City of Brisbane, Queensland, Australia.',
  	'However, in many cases, a hearing is not held.',
  	'The church and school buildings are listed together as a Cleveland Designated Landmark.',
  	"Until the reorganisation of London's local government in 1965, Muswell Hill formed part of the Borough of Hornsey within the administrative county of Middlesex.",
  	'This operation enabled the Canadian Sulpicians to expand their primary work, the education of priests.',
  	"Bosworth's design was heavily Greek-influenced: though the facade is made of white Vermont granite, it features layers of gray granite columns in Doric and Ionic styles, as well as various Greek-inspired ornamentation.",
  	'The Insurance Hall is designated as a Grade II listed building, in part due to these murals.',
  	'It has been pointed out that this need could have been met with the man-made Stagnum (lake) of Agrippa or, more likely, the Euripus (canal) which allowed for runoff from the Stagnum to flow into the Tiber (please see below for more information on both the Stagnum and the Euripus).',
  	'By 1217, documents show that the castle at Almeida is one of several strong points that guard the border between Spain and Portugal.',
  	'The Riverwalk runs along much of the Brisbane River foreshore throughout the inner-city area, with the longest span running between Newstead and Toowong.'],
 'persona_grounding': [1, 0, 0, 0, 0],
 'dialog': [
	"I think I've been there before but I don't remember the name of this place.",
  	'This place is the Nazareth House, which you would like to visit again.'],
 'knowledge_answer_index': 0
 }

In [51]:
# Load the validation split and keep only the sample at index 100 for inspection.
temp = FoCusDatasetV1(
	input_dataset_path='./datasets/FoCus/valid_focus.json',
).dataset[100]

What is Trasimeno's castles?
There are castles all around Trasimeno, many in the center of small towns while others are isolated and in ruins. Castiglione del Lago, Passignano, Magione, Maggiore, and Polvese islands all have castles, while Zocco castle, Montali castle, and others are on hilltops.


In [45]:


# Smoke test for TF_IDF: rank a three-sentence toy corpus against a single
# query sentence and show the two best matches.
corpus = [[1, 2, 3, 4], [1, 2, 3], [1, 2]]
feature = [[1, 2, 3]]

tf_idf = TF_IDF(corpus=corpus)
tf_idf.top_similar(query=feature, top_k=2)


[[1, 2, 3], [1, 2, 3, 4]]

In [None]:
class BartFoCusDatasetSampleHyperparameters:
	"""Settings that control how a `BartFoCusDatasetSampleV1` is assembled."""

	def __init__(self,
			dialog_history_length: int = 1,
			context_length: int = 1,
			knowledge_length: int = 1,
		) -> None:
		"""
		Args:
			dialog_history_length (int): number of dialog pairs (looking back) that
				are used for generating the response
			context_length (int): number of dialog sentences against which similar
				sentences are selected from the knowledge field
			knowledge_length (int): number of knowledge sentences that are fed to
				the model
		"""
		# Store the values that were actually passed in; previously every
		# attribute was hard-coded to 1 and the arguments were ignored.
		self.dialog_history_length = dialog_history_length
		self.context_length = context_length
		self.knowledge_length = knowledge_length

class BartFoCusDatasetSampleV1:
	"""
	Plain language-modelling sample: persona and knowledge-base information are
	inserted in front of the dialog itself.
	- knowledge consists of the knowledge candidates selected with tf-idf
	  (the ones most similar to the latest dialog sentences)
	- NOTE(review): the original plan was to insert only the persona sentences
	  that were used for the answer; the code inserts every persona sentence.
	  Filtering by `persona_grounding` is still TODO.
	Layout of the input sequence:
		[BOS] [persona] [SEP] [knowledge] [SEP] [dialog] [SEP] 
	"""
	def __init__(self, 
			focus_dataset_sample: 'FoCusDatasetSampleV1',
			tokenizer: 'BartTokenizer',
			hyperparameters: 'BartFoCusDatasetSampleHyperparameters',
		) -> None:
		"""
		Args:
			focus_dataset_sample: the sample to encode; either a
				`FoCusDatasetSampleV1` or the dict produced by its `get_dict`
			tokenizer: tokenizer providing `batch_encode_plus` and the special
				token ids
			hyperparameters: history / context / knowledge lengths
		"""
		self.focus_dataset_sample = focus_dataset_sample
		self.tokenizer = tokenizer
		self.hyperparameters = hyperparameters

		# Integer ids, not token strings: these values are spliced into `input_ids`.
		self.bos_token_id = self.tokenizer.bos_token_id
		self.pad_token_id = self.tokenizer.pad_token_id
		self.unk_token_id = self.tokenizer.unk_token_id
		self.sep_token_id = self.tokenizer.sep_token_id
		self.cls_token_id = self.tokenizer.cls_token_id
	
	def __flat_list(self, list_of_lists: List[List]) -> List:
		"""Concatenate a list of lists into one flat list (one level only)."""
		return list(chain.from_iterable(list_of_lists))

	def __encode(self, sentences: List[str]) -> List[List[int]]:
		"""Tokenize sentences without special tokens; an empty input yields []."""
		if not sentences:
			return []
		encoded = self.tokenizer.batch_encode_plus(sentences, add_special_tokens=False)
		return encoded['input_ids']

	def get_dict(self) -> dict:
		"""
		Build the token-id sequences for this sample.

		Returns:
			dict: 'input_ids' holds [BOS] persona [SEP] knowledge [SEP] dialog [SEP];
				'labels' holds the token ids of the last dialog utterance (the
				response), without special tokens.
		"""
		# Use the sample this object was created with, never a notebook global.
		sample = self.focus_dataset_sample
		if not isinstance(sample, dict):
			sample = sample.get_dict()

		dialog_history_length = self.hyperparameters.dialog_history_length
		context_length = self.hyperparameters.context_length
		knowledge_length = self.hyperparameters.knowledge_length

		persona_ids = self.__encode(sample['persona'])

		# One dialog pair is two utterances; the last utterance is the target,
		# everything before it within the window is the model input.
		dialog_history = sample['dialog'][-2*dialog_history_length:]
		history_ids = self.__encode(dialog_history[:-1])
		target_ids = self.__encode(dialog_history[-1:])

		candidate_ids = self.__encode(sample['knowledge_candidates'])
		query_context = history_ids[-context_length:]

		# TF-IDF cannot be fitted on an empty corpus or queried with nothing.
		if candidate_ids and query_context:
			tf_idf = TF_IDF(corpus=candidate_ids)
			most_similar_knowledge = tf_idf.top_similar(
				query=query_context,
				top_k=knowledge_length,
			)
		else:
			most_similar_knowledge = []
		
		# [BOS] [persona] [SEP] [knowledge] [SEP] [dialog] [SEP]
		input_sequence = [
			self.bos_token_id,
			*self.__flat_list(persona_ids),
			self.sep_token_id,
			*self.__flat_list(most_similar_knowledge),
			self.sep_token_id,
			*self.__flat_list(history_ids),
			self.sep_token_id,
		]
		# TODO: figure out how next-token generation works and decide how the
		# target has to be wrapped (BOS/EOS, shifting) before training.
		return {
			'input_ids': input_sequence,
			'labels': self.__flat_list(target_ids),
		}

		

In [57]:
# Scratch check: chain.from_iterable concatenates the inner lists into one flat sequence.
list(chain.from_iterable([ [1,2,3], [4,5,6] ]))

[1, 2, 3, 4, 5, 6]

In [53]:
# Scratch check: star-unpacking a list inside a list literal splices its elements in place.
[*[1, 2]]

[1, 2]