### explore data

In [7]:
import json

# read the json file 
with open('./datasets/persona_chat.json') as f:
    data = json.load(f)
    print(data.keys())
    for key in data.keys():
        print(key, len(data[key]))
        print(data[key][0].keys())

dict_keys(['train', 'valid'])
train 17878
dict_keys(['personality', 'utterances'])
valid 1000
dict_keys(['personality', 'utterances'])


In [17]:
data['train'][12]['personality']

['i watch basketball .',
 'i go to a local college .',
 'i work at a smoothie shop .',
 'i listen to classic rock .']

In [1]:
from dimweb_persona_bot.datasets_transformers.persona_chat_dataset_transformer import persona_chat_dataset_tranformer_v1

persona_chat_dataset_tranformer_v1(
    initial_dataset_path="./datasets/persona_chat/persona_chat.json",
    output_folder="./datasets/persona_chat",
)

Dataset lengths: train 17878, valid 500, test 500
Datasets saved.


In [27]:
# —á–µ—Ç–Ω—ã–π –æ—Ç–≤–µ—Ç –≤ history —Å–æ–¥–µ—Ä–∂–∏—Ç –ø–µ—Ä—Å–æ–Ω—É
data['train'][12]['utterances'][-1]['history']

['hey how are you today ?',
 'great ! just go off work at the smoothie shop . you ?',
 'i have been eating tacos and getting ready to move to school .',
 'are you going to college ? i go to a local one .',
 'yes , i am going to university of michigan . what year are you ?',
 'first yr ! do you have any hobbies ?',
 'i love doing anything outdoors . especially in summer . you ?',
 'i love watching college basketball and rocking out to classic rock .',
 'fun . have you decided on your major for school ?',
 'not yet . have you decided ?',
 "pre med . i'd love to be a doctor"]

### default dataset

In [32]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1
dataset = PersonaChatDatasetV1(
    input_dataset_path="./datasets/persona_chat/train.json",
)
dataset[12]

{'persona': ['my mom is my best friend .',
  'i have four sisters .',
  'i believe that mermaids are real .',
  'i love iced tea .'],
 'history': ['hi , how are you doing today ?',
  'i am spending time with my 4 sisters what are you up to',
  'wow , four sisters . just watching game of thrones .',
  'that is a good show i watch that while drinking iced tea',
  'i agree . what do you do for a living ?',
  "i'm a researcher i'm researching the fact that mermaids are real",
  "interesting . i'm a website designer . pretty much spend all my time on the computer .",
  "that's cool my mom does the same thing",
  "that's awesome . i have always had a love for technology .",
  'tell me more about yourself',
  'i really enjoy free diving , how about you , have any hobbies ?']}

### Causal datasets

In [4]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1
from dimweb_persona_bot.dataloaders.causal_samplers import CausalTrainPersonaSampleV1, CausalValidPersonaSampleV1
from dimweb_persona_bot.dataloaders.lighting import LightningDataModuleV1
from dimweb_persona_bot.hyperparameters.causal_modeling_hyperparameters import (
    PersonaChatHyperparametersV1,
)

from transformers import AutoTokenizer

hyperparameters = PersonaChatHyperparametersV1()
tokenizer = AutoTokenizer.from_pretrained(hyperparameters.model_name)

lighting_data = LightningDataModuleV1(
	train_path_dataset="./datasets/persona_chat/train.json",
	valid_path_dataset="./datasets/persona_chat/valid.json",
	hyperparameters=hyperparameters,
	tokenizer=tokenizer,
	base_train_dataset_class=PersonaChatDatasetV1,
	base_valid_dataset_class=PersonaChatDatasetV1,
	base_train_sample_class=CausalTrainPersonaSampleV1,
	base_valid_sample_class=CausalValidPersonaSampleV1,
)
lighting_data.setup()
next(iter(lighting_data.train_dataloader()))

{'input_ids': tensor([[50256, 15332,   287,  ...,  -100,  -100,  -100],
         [50256,  1820,  4004,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100],
         ...,
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100]]),
 'labels': tensor([[50256, 15332,   287,  ...,  -100,  -100,  -100],
         [50256,  1820,  4004,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100],
         ...,
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0,

In [2]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1


train_dataset = PersonaChatDatasetV1(
    input_dataset_path="./datasets/persona_chat/train.json",
)

In [6]:
train_dataset[96]

{'persona': ['i like to snowboard .',
  'my favorite food is popcorn .',
  'i like to ride horses .',
  'i live in rural wisconsin .'],
 'history': ['i am frank . nice to meet you . what is your name ?',
  'my name is gary . great to meet you too .',
  'i work as a general manager at a grocery store . what about you ?',
  "i'm an insurance salesman"],
 'sample_id': '15_2'}

In [7]:
valid_dataset = PersonaChatDatasetV1(
    input_dataset_path="./datasets/persona_chat/valid.json",
)

In [8]:
valid_dataset[0]

{'persona': ['i read twenty books a year .',
  "i'm a stunt double as my second job .",
  'i only eat kosher .',
  'i was raised in a single parent household .'],
 'history': ['hello what are doing today ?',
  'i am good , i just got off work and tired , i have two jobs .'],
 'sample_id': '0_1'}

In [12]:
valid_dataset[2]['history']

['hello what are doing today ?',
 'i am good , i just got off work and tired , i have two jobs .',
 'i just got done watching a horror movie',
 "i rather read , i've read about 20 books this year .",
 'wow ! i do love a good horror movie . loving this cooler weather',
 'but a good movie is always good .']

In [17]:
valid_dataset[2]['history'][-2:]

['wow ! i do love a good horror movie . loving this cooler weather',
 'but a good movie is always good .']

## ru persona chat

In [4]:
import pandas as pd

dataset = pd.read_csv("./datasets/ru_persona_chat/profiles.tsv", delimiter="\t")
dataset.head()

Unnamed: 0,characteristic_1,characteristic_2,characteristic_3,characteristic_4,characteristic_5
–£ –º–µ–Ω—è –ª—é–±–∏–º–∞—è —Ä–∞–±–æ—Ç–∞.,–Ø —É–≤–∞–∂–∞—é –ª—é–¥–µ–π.,–£ –º–µ–Ω—è –µ—Å—Ç—å –∂–∏–≤–æ—Ç–Ω–æ–µ.,–£ –º–µ–Ω—è —Ö–æ—Ä–æ—à–∏–π –¥—Ä—É–≥.,–Ø –ª—é–±–ª—é –∫–æ—Ñ–µ.,
–Ø —Ä–∞–±–æ—Ç–∞—é —É—á–∏—Ç–µ–ª–µ–º,–£ –º–µ–Ω—è –µ—Å—Ç—å —Å–æ–±–∞–∫–∞,–Ø –ª—é–±–ª—é –ø–µ—Ç—å,–Ø –∂–∏–≤—É —Å–∞–º–∞,–Ø –ª—é–±–ª—é —Ü–≤–µ—Ç—ã,
–Ø –∫—É–ø–∏–ª–∞ –¥–æ–º,–Ø –±–µ–≥–∞—é –ø–æ —É—Ç—Ä–∞–º,–Ø —Ä–∞–±–æ—Ç–∞—é –Ω–∞ —Ä–∞–±–æ—Ç–µ,–Ø –ø–æ–µ–¥—É –≤ –æ—Ç–ø—É—Å–∫,–Ø –ª—é–±–ª—é –∞—Ä–±—É–∑,
—è –≤—Ä–∞—á –∏ –∂–µ–Ω–∞—Ç,—É –º–µ–Ω—è —Ç—Ä–æ–µ –¥–µ—Ç–µ–π,–Ω–µ –ª—é–±–ª—é —Å–≤–æ—é —Ä–∞–±–æ—Ç—É,–Ω—Ä–∞–≤–∏—Ç—å—Å—è –µ–∑–¥–∏—Ç—å –Ω–∞ –≤–µ–ª–æ—Å–∏–ø–µ–¥–µ,–ª—é–±–ª—é –ø–∏–≤–æ,
–Ø —à–∫–æ–ª—å–Ω–∏—Ü–∞.,–Ø –µ—â—ë —É—á—É—Å—å.,–ù–æ —è –º–µ—á—Ç–∞—é —Ä–∞–±–æ—Ç–∞—Ç—å.,–Ø –æ–±–æ–∂–∞—é —Ä–æ–¥–∏—Ç–µ–ª–µ–π.,–ò –Ω–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è.,


In [5]:
dataset = pd.read_csv("./datasets/ru_persona_chat/dialogues.tsv", delimiter="\t")
dataset.head()

Unnamed: 0,persona_1_profile,persona_2_profile,dialogue
0,<span class=participant_1>–£ –º–µ–Ω—è –ª—é–±–∏–º–∞—è —Ä–∞–±–æ—Ç...,<span class=participant_2>–ò—â—É –ø—Ä–∏–Ω—Ü–∞.<br />–í–µ–¥...,<span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ü—Ä–∏–≤...
1,<span class=participant_1>–Ø —Ä–∞–±–æ—Ç–∞—é —É—á–∏—Ç–µ–ª–µ–º<b...,<span class=participant_2>–Ø –±–∏–∑–Ω–µ—Å–º–µ–Ω<br />–£ –º...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...
2,<span class=participant_1>–Ø –∫—É–ø–∏–ª–∞ –¥–æ–º<br />–Ø ...,<span class=participant_2>–Ø –ø–æ—é –≤ –∫–∞—Ä–∞–æ–∫–µ<br /...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...
3,<span class=participant_1>—è –≤—Ä–∞—á –∏ –∂–µ–Ω–∞—Ç<br />...,<span class=participant_2>–Ø –º–∞–ª—å—á–∏–∫<br />–Ø —É—á—É...,<span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ó–¥—Ä–∞...
4,<span class=participant_1>–Ø —à–∫–æ–ª—å–Ω–∏—Ü–∞.<br />–Ø ...,<span class=participant_2>–Ø –ø—Ä–æ—Å—Ç–æ–≤–∞—Ç.<br />–õ—é...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...


In [6]:
dataset.iloc[4]['persona_1_profile']

'<span class=participant_1>–Ø —à–∫–æ–ª—å–Ω–∏—Ü–∞.<br />–Ø –µ—â—ë —É—á—É—Å—å.<br />–ù–æ —è –º–µ—á—Ç–∞—é —Ä–∞–±–æ—Ç–∞—Ç—å.<br />–Ø –æ–±–æ–∂–∞—é —Ä–æ–¥–∏—Ç–µ–ª–µ–π.<br />–ò –Ω–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è.<br /></span>'

In [7]:
dataset.iloc[4]['persona_2_profile']

'<span class=participant_2>–Ø –ø—Ä–æ—Å—Ç–æ–≤–∞—Ç.<br />–õ—é–¥–∏ –∏–∑–±–µ–≥–∞—é—Ç –º–µ–Ω—è.<br />–Ø –±—ã—Å—Ç—Ä–æ –±–µ–≥–∞—é.<br />–ú–æ–∏ —É–≤–ª–µ—á–µ–Ω–∏—è –Ω–µ–æ—Ä–¥–∏–Ω–∞—Ä–Ω—ã.<br />–Ø —Ä–∞–±–æ—Ç–∞—é –ø–æ –ø—Ä–∏–∑–≤–∞–Ω–∏—é.<br /></span>'

In [8]:
dataset.iloc[4]['dialogue']


'<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤–µ—Ç!</span><br /><span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ü—Ä–∏–≤–µ—Ç!</span><br /><span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ö–∞–∫ —Ç–≤–æ–∏ –¥–µ–ª–∞?</span><br /><span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ù–æ—Ä–º–∞–ª—å–Ω–æ, –≥–æ—Ç–æ–≤–ª—é—Å—å –∫–æ —Å–Ω—É. –ó–∞–≤—Ç—Ä–∞ —Å–Ω–æ–≤–∞ –≤ —à–∫–æ–ª—É<br />. –ù–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è.</span><br /><span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ê —Ç–≤–æ–∏ –∫–∞–∫?</span><br /><span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –í—Å—ë —Ö–æ—Ä–æ—à–æ,—Å–ø–∞—Ç—å –Ω–µ —Ö–æ—á–µ—Ç—Å—è,–¥—É–º–∞—é —Ñ–∏–ª—å–º –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å</span><br /><span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ö–∞–∫–æ–π —Ñ–∏–ª—å–º?</span><br /><span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ï—â—ë –Ω–µ —Ä–µ—à–∏–ª–∞, –º–æ–∂–µ—Ç –±—ã—Ç—å –¥–µ—Ç–µ–∫—Ç–∏–≤ –∫–∞–∫–æ–π –Ω–∏–±—É–¥—å. –ê<br />–∫–∞–∫ –≤ —à–∫–æ–ª–µ —É —Ç–µ–±—è?</span><br /><span class=participa

In [9]:
dialogue = dataset.iloc[4]['dialogue']

In [10]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(dialogue)
print(soup.prettify())

<span class="participant_1">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤–µ—Ç!
</span>
<br/>
<span class="participant_2">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ü—Ä–∏–≤–µ—Ç!
</span>
<br/>
<span class="participant_2">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ö–∞–∫ —Ç–≤–æ–∏ –¥–µ–ª–∞?
</span>
<br/>
<span class="participant_1">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ù–æ—Ä–º–∞–ª—å–Ω–æ, –≥–æ—Ç–æ–≤–ª—é—Å—å –∫–æ —Å–Ω—É. –ó–∞–≤—Ç—Ä–∞ —Å–Ω–æ–≤–∞ –≤ —à–∫–æ–ª—É
 <br/>
 . –ù–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è.
</span>
<br/>
<span class="participant_1">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ê —Ç–≤–æ–∏ –∫–∞–∫?
</span>
<br/>
<span class="participant_2">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –í—Å—ë —Ö–æ—Ä–æ—à–æ,—Å–ø–∞—Ç—å –Ω–µ —Ö–æ—á–µ—Ç—Å—è,–¥—É–º–∞—é —Ñ–∏–ª—å–º –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å
</span>
<br/>
<span class="participant_1">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ö–∞–∫–æ–π —Ñ–∏–ª—å–º?
</span>
<br/>
<span class="participant_2">
 –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ï—â—ë –Ω–µ —Ä–µ—à–∏–ª–∞, –º–æ–∂–µ—Ç –±—ã—Ç—å –¥–µ—Ç–µ–∫—Ç–∏–≤ –∫–∞–∫–æ–π –Ω–∏–±—É–¥—å. –ê
 <br/>
 –∫–∞–∫ –≤ —à–∫–æ

In [74]:
profile = dataset.iloc[0]['persona_2_profile']
soup2 = BeautifulSoup(profile)
[item + "." for item in soup2.text.split(".") if item]

['–ò—â—É –ø—Ä–∏–Ω—Ü–∞.',
 '–í–µ–¥—É –∞–∫—Ç–∏–≤–Ω—ã–π –æ–±—Ä–∞–∑ –∂–∏–∑–Ω–∏.',
 '–õ—é–±–ª—é —á–∏—Ç–∞—Ç—å –∫–ª–∞—Å—Å–∏–∫—É.',
 '–í—ã—Ä–∞—â–∏–≤–∞—é —Ñ–∏–∞–ª–∫–∏.',
 '–õ—é–±–ª—é –æ–±—â–µ–Ω–∏–µ.']

In [22]:
len(soup.find_all('span'))

11

In [17]:
soup.find_all('span')[0].text

'–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ü—Ä–∏–≤–µ—Ç) —Ä–∞—Å—Å–∫–∞–∂–∏ –æ —Å–µ–±–µ'

In [21]:
soup.find_all('span')[0].get('class')

['participant_2']

In [21]:
"".join([str(item) for item in soup.find_all('span')[3].contents])

'–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ù–æ—Ä–º–∞–ª—å–Ω–æ, –≥–æ—Ç–æ–≤–ª—é—Å—å –∫–æ —Å–Ω—É. –ó–∞–≤—Ç—Ä–∞ —Å–Ω–æ–≤–∞ –≤ —à–∫–æ–ª—É<br/>. –ù–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è.'

In [11]:
from typing import TypedDict

class Replica(TypedDict):
    text: str
    persona_class: str

dialogue = []
replicas = soup.find_all('span')
current_class = replicas[0].get('class')[0]

def simple_filter(text: str) -> str:
    text = text.replace("–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1:", "")
    text = text.replace("–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2:", "")
    return text

current_text = ""
for replica in replicas:
    if replica.get('class')[0] == current_class:
        current_text += simple_filter(replica.text)
    else:
        replica_obj = Replica(text=current_text, persona_class=current_class)
        dialogue.append(replica_obj)
        current_class = replica.get('class')[0]
        current_text = simple_filter(replica.text)
        

for item in dialogue:
    print(item['persona_class'], item['text'])

participant_1  –ü—Ä–∏–≤–µ—Ç!
participant_2  –ü—Ä–∏–≤–µ—Ç! –ö–∞–∫ —Ç–≤–æ–∏ –¥–µ–ª–∞?
participant_1  –ù–æ—Ä–º–∞–ª—å–Ω–æ, –≥–æ—Ç–æ–≤–ª—é—Å—å –∫–æ —Å–Ω—É. –ó–∞–≤—Ç—Ä–∞ —Å–Ω–æ–≤–∞ –≤ —à–∫–æ–ª—É. –ù–µ –ª—é–±–ª—é —É—á–∏—Ç—å—Å—è. –ê —Ç–≤–æ–∏ –∫–∞–∫?
participant_2  –í—Å—ë —Ö–æ—Ä–æ—à–æ,—Å–ø–∞—Ç—å –Ω–µ —Ö–æ—á–µ—Ç—Å—è,–¥—É–º–∞—é —Ñ–∏–ª—å–º –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å
participant_1  –ö–∞–∫–æ–π —Ñ–∏–ª—å–º?
participant_2  –ï—â—ë –Ω–µ —Ä–µ—à–∏–ª–∞, –º–æ–∂–µ—Ç –±—ã—Ç—å –¥–µ—Ç–µ–∫—Ç–∏–≤ –∫–∞–∫–æ–π –Ω–∏–±—É–¥—å. –ê–∫–∞–∫ –≤ —à–∫–æ–ª–µ —É —Ç–µ–±—è?
participant_1  –•–æ—Ä–æ—à–æ, –µ—â—ë —É—á—É—Å—å, –Ω–æ —Å–∫–æ—Ä–æ –∑–∞–∫–æ–Ω—á—É. –£–∂–µ –º–µ—á—Ç–∞—é —Ä–∞–±–æ—Ç–∞—Ç—å, –∞ –Ω–µ —Å–∏–¥–µ—Ç—å –∑–∞ —É—á–µ–±–Ω–∏–∫–∞–º–∏. –ê —Ç—ã —Ä–∞–±–æ—Ç–∞–µ—à—å –∏–ª–∏ —É—á–∏—à—å—Å—è–µ—â—ë?
participant_2  –ê —è —Ä–∞–±–æ—Ç–∞—é, –º–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è –º–æ—è —Ä–∞–±–æ—Ç–∞, –∫–µ–º –ø–ª–∞–Ω–∏—Ä—É–µ—à—å—Ä–∞–±–æ—Ç–∞—Ç—å?
participant_1  –•–æ—á—É –±—ã—Ç—å –ø—Å–∏—Ö–æ–ª–æ–≥–æ–º. –ê –∫–µ–º —Ç—ã —Ä–∞–±–æ—Ç–∞–µ—à—å?
participant_2  –ù–µ –ø–æ–≤–µ—Ä–∏—

In [31]:
len(dialogue)

7

In [40]:
dialogue_samples = []

dialogue_len = len(dialogue) // 2 
for i in range(dialogue_len):
	sample = dialogue[:i*2]

In [38]:
[1, 2, 3, 4, 5][:4]

[1, 2, 3, 4]

In [58]:
pd.DataFrame(dialogue).to_dict('r')

  pd.DataFrame(dialogue).to_dict('r')


[{'text': ' –ü—Ä–∏–≤–µ—Ç) —Ä–∞—Å—Å–∫–∞–∂–∏ –æ —Å–µ–±–µ', 'persona_class': 'participant_2'},
 {'text': ' –ü—Ä–∏–≤–µ—Ç) –ø–æ–¥ –≤–∫—É—Å–Ω—ã–π –∫–æ—Ñ–µ–µ–∫ –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ –ø–æ–±–æ–ª—Ç–∞—Ç—å –ø–æ—è–≤–∏–ª–æ—Å—å)',
  'persona_class': 'participant_1'},
 {'text': ' –ß—Ç–æ —á–∏—Ç–∞–µ—à—å? –ú–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è –∫–ª–∞—Å—Å–∏–∫–∞ –Ø —Ç–æ–∂–µ –ª—é–±–ª—é –ø–æ–æ–±—â–∞—Ç—å—Å—è',
  'persona_class': 'participant_2'},
 {'text': ' –õ—é–±–ª—é –∂–∏–≤–æ—Ç–Ω—ã—Ö, –ø—Ä–æ—Å—Ç–æ –æ–±–æ–∂–∞—é, –∫–∞–∫ –∏ —Å–≤–æ—é —Ä–∞–±–æ—Ç—É) –Ø —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫—É –ª—é–±–ª—é',
  'persona_class': 'participant_1'},
 {'text': ' –ê —è –≤—ã—Ä–∞—â–∏–≤–∞—é —Ñ–∏–∞–ª–∫–∏ –ò –≤–µ–¥—É –∑–¥–æ—Ä–æ–≤—ã–π –∏ –∞–∫—Ç–∏–≤–Ω—ã–π –æ–±—Ä–∞–∑ –∂–∏–∑–Ω–∏!',
  'persona_class': 'participant_2'},
 {'text': ' –£—Ö —Ç—ã, –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ.', 'persona_class': 'participant_1'},
 {'text': ' –¢—ã —Å–ª—É—á–∞–π–Ω–æ –Ω–µ –ø—Ä–∏–Ω—Ü –Ω–∞ –±–µ–ª–æ–º –∫–æ–Ω–µ? –Ø –µ–≥–æ –æ—á–µ–Ω—å –∂–¥—É..',
  'persona_class': 'participant_2'}]

In [2]:
from dimweb_persona_bot.dataloaders.ru_persona_chat_dataloaders import RUPersonaChatDatasetV1
from dimweb_persona_bot.datasets_transformers.ru_persona_chat_dataset_transformer import ru_persona_chat_dataset_tranformer_v1

# ru_persona_chat_dataset_tranformer_v1(
# 	initial_dataset_path="./datasets/ru_persona_chat/dialogues.tsv",
# 	output_folder="./datasets/ru_persona_chat",
# )

train_dataset = RUPersonaChatDatasetV1(
    input_dataset_path="./datasets/ru_persona_chat/valid.csv",
)
train_dataset[140]

{'persona': ['–Ø —é—Ä–∏—Å—Ç.',
  '–ù–µ –∑–∞–º—É–∂–µ–º.',
  '–õ—é–±–ª—é —Ç–∞–Ω—Ü–µ–≤–∞—Ç—å, –ø–µ—Ç—å.',
  '–ú–æ–µ —Ö–æ–±–±–∏ –∫—É–ª–∏–Ω–∞—Ä–∏—è.',
  '–Ø –ª—é–±–ª—é –ª–µ—Ç–æ, –º–æ—Ä–µ, —Å–æ–ª–Ω—Ü–µ –∏ –ø–µ—Å–æ–∫.'],
 'history': ['–ü—Ä–∏–≤–µ—Ç.',
  '–ø—Ä–∏–≤–µ—Ç!',
  '–ö–∞–∫ —Ç–µ–±—è –∑–æ–≤—É—Ç? –ò —á–µ–º —Ç—ã –∑–∞–Ω–∏–º–∞–µ—à—å—Å—è?',
  '–∞–Ω–Ω–∞ —è —é—Ä–∏—Å—Ç. —è –ª—é–±–ª—é —Ç–æ–Ω—Ü–µ–≤–∞—Ç—å –∏ –ø–µ—Ç—å. –∞ —Ç—ã —á–µ–º –ª—é–±–∏—à—å –∑–∞–Ω–∏–º–∞—Ç—å—Å—è ?',
  '–Ø –û–ª—å–≥–∞! –î–æ–º–æ—Ö–æ–∑—è–π–∫–∞ –∏ –∫–æ—Å–º–µ—Ç–æ–ª–æ–≥ –ø–æ —Å–æ–≤–º–µ—Å—Ç–∏—Ç–µ–ª—å—Å—Ç–≤—É üòÑ –ü–µ—á—å –ø–∏—Ä–æ–≥–∏ –∏ –≤—Å—ë —Ç–∞–∫–æ–µ) –¢—ã –æ—Ç –∫—É–¥–∞? –õ—é–±–∏—à—å –∂–∏–≤–æ—Ç–Ω—ã—Ö?',
  '–±—Ä—è–Ω—Å–∫. –∞ —Ç—ã? –º–æ—ë —Ö–æ–±–±–∏ –∫—É–ª–∏–Ω–æ—Ä–∏—è. –¥–∞ –∫–æ–Ω–µ—à–Ω–æ. –º—ã –¥–∞–∂–µ —Å –º—É–∂–∞–º —Ä–∞–∑–≤–µ–ª–∏—Å—å –∏–∑ –∑–∞ –∫–æ—à–∫–∏ ) —Ç–µ—Ä–µ—Ä—å —è –Ω–µ –∑–∞–º—É–∂–µ–º)'],
 'sample_id': '23_3'}

In [4]:
train_dataset[160]

{'persona': ['–Ø –∂–µ–Ω–∞—Ç.',
  '–Ø —Ä–∞–±–æ—Ç–∞—é –≤ –∞–≤—Ç–æ—Å–∞–ª–æ–Ω–µ.',
  '–£ –º–µ–Ω—è –µ—Å—Ç—å –±–æ–ª—å—à–æ–π –¥–æ–º.',
  '–Ø –º–µ—á—Ç–∞—é –æ –¥–µ—Ç—è—Ö.',
  '–£ –º–µ–Ω—è –µ—Å—Ç—å —Å–≤–æ—è —Ñ–µ—Ä–º–∞.'],
 'history': ['–ü—Ä–∏–≤–µ—Ç!', '–ü—Ä–∏–≤–µ—Ç.'],
 'sample_id': '27_1'}

In [4]:
from bs4 import BeautifulSoup
import re
persona = '<span class="participant_1">—Ö–æ—á—É –∫–æ—à–∫—É<br/>–º–µ—á—Ç–∞—é –ø—Ä—ã–≥–Ω—É—Ç—å —Å –ø–∞—Ä–∞—à—é—Ç–æ–º<br/>–Ω–∞—É—á–∏–ª —Ä–∞–∑–≥–æ–≤–∞—Ä–∏–≤–∞—Ç—å –ø–æ–ø—É–≥–∞—è<br/>–ª—é–±–ª—é –ø—Ä–∏—Ä–æ–¥—É<br/>–æ–±–æ–∂–∞—é –±–∞–Ω—é<br/></span> '
persona = '<span class="participant_2">–Ø –ø–µ—Ä–µ–≤–æ–¥—á–∏–∫.<br/>–Ø —Ä–∞–∑–≤–µ–¥–µ–Ω.<br/>–£ –º–µ–Ω—è –∫–∞—Ä–∏–µ –≥–ª–∞–∑–∞.<br/>–Ø –∏–≥—Ä–∞—é –Ω–∞ –±–∞—è–Ω–µ.<br/>–£ –º–µ–Ω—è –µ—Å—Ç—å –¥–∞—á–∞.<br/></span> '
# persona = persona.replace("<br/>", ". ")

# soup = BeautifulSoup(
# 	persona,
# 	features="html.parser",
# )
# soup.text
re.sub(r"<span.*\">|</span>", "", persona)

'–Ø –ø–µ—Ä–µ–≤–æ–¥—á–∏–∫.<br/>–Ø —Ä–∞–∑–≤–µ–¥–µ–Ω.<br/>–£ –º–µ–Ω—è –∫–∞—Ä–∏–µ –≥–ª–∞–∑–∞.<br/>–Ø –∏–≥—Ä–∞—é –Ω–∞ –±–∞—è–Ω–µ.<br/>–£ –º–µ–Ω—è –µ—Å—Ç—å –¥–∞—á–∞.<br/> '

In [2]:
train_dataset[54]

{'persona': ['<span class="participant_1">—Ö–æ—á—É –∫–æ—à–∫—É<br/>–º–µ—á—Ç–∞—é –ø—Ä—ã–≥–Ω—É—Ç—å —Å –ø–∞—Ä–∞—à—é—Ç–æ–º<br/>–Ω–∞—É—á–∏–ª —Ä–∞–∑–≥–æ–≤–∞—Ä–∏–≤–∞—Ç—å –ø–æ–ø—É–≥–∞—è<br/>–ª—é–±–ª—é –ø—Ä–∏—Ä–æ–¥—É<br/>–æ–±–æ–∂–∞—é –±–∞–Ω—é<br/></span> '],
 'history': ['–ü—Ä–∏–≤–µ—Ç.',
  '–ü—Ä–∏–≤–µ—Ç.',
  '–ö–∞–∫ —Ç–µ–±—è –∑–æ–≤—É—Ç? –ß–µ–º –∑–∞–Ω–∏–º–∞–µ—à—å—Å—è –ø–æ –∂–∏–∑–Ω–∏?',
  '–ú–∞—Ä–∏—è. –ñ–∏–≤—É –≤ –¥–µ—Ä–µ–≤–Ω–µ, –¥–µ—Ä–∂—É –ø–æ–ø—É–≥–∞—è –∏ —Ö–æ—á—É –∫–æ—à–∫—É.',
  '–ö–∞–∫ –∑–¥–æ—Ä–æ–≤–æ, —è –±–µ–∑—É–º–Ω–æ –ª—é–±–ª—é –∂–∏–≤–æ—Ç–Ω—ã—Ö. –£ –º–µ–Ω—è –µ—Å—Ç—å —Å–æ–±–∞–∫–∏ –∏ –ø–æ–ø—É–≥–∞–∏.',
  '–ö–∞–∫–∏–µ —Å–æ–±–∞–∫–∏ –∏ —Å–∫–æ–ª—å–∫–æ?',
  '–Ø –ª—é–±–ª—é –∏ –∫–æ—Ç–∏–∫–æ–≤,–Ω–æ –ø–æ–∫–∞ –Ω–µ –∑–∞–≤–µ–ª–∞. –¢—Ä–∏ —Å–æ–±–∞–∫–∏,–¥–≤–∞ –ª–∞–±—Ä–∞–¥–æ—Ä–∞ –∏ —á–∞—É-—á–∞—É. –õ—é–±–ª—é –ø—É—à–∏—Å—Ç–∏–∫–æ–≤)',
  '–Ø –ª—é–±–ª—é –ø—Ä–∏—Ä–æ–¥—É –∏ –∂–∏–≤–æ—Ç–Ω—ã—Ö.',
  '–Ø —Ç–æ–∂–µ.',
  '–õ–∞–±—Ä–∞–¥–æ—Ä—ã –æ—á–µ–Ω—å –¥–æ–±—Ä—ã–µ —Å–æ–±–∞–∫–∏.'],
 'sample_id': '10_5'}

In [15]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50", )

len(tokenizer.encode(' '.join(train_dataset[14]['persona'])))

34

In [13]:
train_dataset[67]

{'persona': ['<span class="participant_2">–Ø –ø–µ—Ä–µ–≤–æ–¥—á–∏–∫.<br/>–Ø —Ä–∞–∑–≤–µ–¥–µ–Ω.<br/>–£ –º–µ–Ω—è –∫–∞—Ä–∏–µ –≥–ª–∞–∑–∞.<br/>–Ø –∏–≥—Ä–∞—é –Ω–∞ –±–∞—è–Ω–µ.<br/>–£ –º–µ–Ω—è –µ—Å—Ç—å –¥–∞—á–∞.<br/></span> '],
 'history': ['–ü—Ä–∏–≤–µ—Ç.', '–ü—Ä–∏–≤–µ—Ç —Ç—ã –∫—Ç–æ?'],
 'sample_id': '12_1'}

In [4]:
from dimweb_persona_bot.utils import TextEvaluator

t_eval = TextEvaluator()

t_eval.evaluate(
	[""],
	[""],
)

EOFError: No valid references for a sentence!