In [1]:
import torch
import wandb
from datetime import datetime
import shutil
import importlib
import os
from importlib.machinery import SourceFileLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import random
from torch.nn.utils.rnn import pad_sequence
from torch import nn

from GPUtil import showUtilization as gpu_usage
from numba import cuda

In [2]:
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    AutoModel,
    GPT2LMHeadModel,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
)

In [3]:
SPECIAL_TOKENS = { 
    "<sp_1>": "<sp_1>",
    "</sp_1>": "</sp_1>",
    "<sp_2>": "<sp_2>",
    "</sp_2>": "</sp_2>",
    "<persona>": "<persona>",
    "</persona>": "</persona>",
}
config = AutoConfig.from_pretrained("microsoft/DialoGPT-medium")
# config.n_positions = 512 
# config.n_embd = 1024
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
tokenizer.add_tokens(list(SPECIAL_TOKENS.values()), special_tokens=True)

6

In [4]:
class BaseExperiment:
    def __init__(self, 
        model=None, 
        tokenizer=None, 
        dataloader_train=None,
        dataloader_valid=None,
        dataloader_test=None,
        loss_func_class=None,
        estimate_func_class=None,
        experiment_config=None,
        optimizer_class=None,
        sheduler_class=None,
        project_name=None,
        notebook_name=None,
        name_run="",
        model_description="",
        do_unit_tests=True,
        pretrained_model_name=None
        ): 
        assert notebook_name != None, f"notebook_name should be valid filename, but get {notebook_name}"

        # datasets
        self.dataloader_train = dataloader_train
        self.dataloader_valid = dataloader_valid
        self.dataloader_test = dataloader_test
        
        # wandb
        self.notebook_name = notebook_name
        self.project_name = project_name 
        self.experiment_config = experiment_config
        self.wandb_run = None
        self.name_run = name_run
        self.model_description = model_description
        self.model_name = "pytorch_model"
        self.pure_model_name = "pytorch_model"
        self.model_artifact = None
        self.pretrained_model_name = pretrained_model_name

        self.optimizer_class = optimizer_class
        self.sheduler_class = sheduler_class
        self.loss_func_class = loss_func_class
        self.estimate_func_class = estimate_func_class

        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = None
        self.sheduler = None
        self.loss_func = None
        self.estimate_func = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # self.device = torch.device('cpu')
        print(f"Using device {self.device}")

        # prepare for experiment
        self.setup()
        if do_unit_tests:
            self.unit_tests()

    def setup(self):
        self.model.to(self.device)
        # - Freeze selective layers:
        # - Freeze all layers except last n:
        if self.experiment_config['freeze_layers'] > 0:
            for parameter in self.model.parameters():
                parameter.requires_grad = False

            for i, m in enumerate(self.model.transformer.h):        
                #Only un-freeze the last n transformer blocks
                if i+1 > 12 - self.experiment_config['freeze_layers']:
                    for parameter in m.parameters():
                        parameter.requires_grad = True 

            for parameter in self.model.transformer.ln_f.parameters():        
                parameter.requires_grad = True

            for parameter in self.model.lm_head.parameters():        
                parameter.requires_grad = True
        if self.experiment_config['do_weight_decay']:
            # Prepare optimizer and schedule (linear warmup and decay)
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": self.experiment_config['weight_decay'],
                },
                {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
            ]
            self.optimizer = self.optimizer_class(optimizer_grouped_parameters, **self.experiment_config['optimizer'])
        else:
            self.optimizer = self.optimizer_class(self.model.parameters(), **self.experiment_config['optimizer'])

        if self.sheduler_class != None:
            # num_training_steps = len(self.dataloader_train) // self.experiment_config["sheduler"] * self.experiment_config['epochs']
            self.sheduler = self.sheduler_class(
                self.optimizer, 
                **self.experiment_config['sheduler']
                )

        self.loss_func = self.loss_func_class()
        self.estimate_func = self.estimate_func_class()

        # setup wandb
        # save model structure and weights to wandb
        self.model_artifact = wandb.Artifact(
            self.name_run, type="model",
            description=self.model_description,
            metadata=self.experiment_config)


    def get_date(self):
        now = datetime.now()
        date_time = now.strftime("%m_%d_%Y__%H:%M:%S")
        return date_time

    def unit_tests(self):
        # test training
        X, y = next(iter(self.dataloader_train))
        X, y = X.to(self.device), y.to(self.device)

        loss = self.model(X, labels=X).loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # test valid
        X, y = next(iter(self.dataloader_valid))
        X, y = X.to(self.device), y.to(self.device)
        test_loss = self.model(X, labels=X).loss

        # initial validation
        self.model.eval()
        test_loss, correct = 0, 0
        num_batches = len(self.dataloader_valid)
        size = len(self.dataloader_valid.dataset)

        with torch.no_grad():
            for X, y in self.dataloader_valid:
                X, y = X.to(self.device), y.to(self.device)
                valid_loss = self.model(X, labels=X).loss
                test_loss += valid_loss
                break

        test_loss /= num_batches

        print("tests ok")


    def train(self):
        with wandb.init(project=self.project_name, entity="dimweb",
                        settings=wandb.Settings(
                            ),
                        name=self.name_run,
                        config=self.experiment_config,
                        ) as run:

            self.run = run

            # start train
            epochs = self.experiment_config['epochs']
            for i in range(epochs):
                print(f"Epoch: {i}")
                self.train_steps()
                self.valid_steps()
            
            # sync model
            self.wandb_save_model()
            
            print(f"train end")
    
    def save_model_class(self):
        # save class
        model_class_name = self.experiment_config['model_class_name']
        class_script_path_dest = f"{os.path.join(wandb.run.dir, model_class_name)}.py"
        class_script_path_src = f"./models/{model_class_name}.py"
        shutil.copy2(class_script_path_src, class_script_path_dest)
        self.model_artifact.add_file(class_script_path_dest)
        wandb.save(class_script_path_dest)

    def wandb_save_model(self):

        saved_path = str(wandb.run.dir).replace("/files", "_local")
        self.model.save_pretrained(saved_path)
        self.tokenizer.save_pretrained(saved_path)

        self.experiment_config['saved_path'] = saved_path 

        # save notebook
        notebook_path = os.path.join(wandb.run.dir, self.notebook_name)
        shutil.copy2(self.notebook_name, notebook_path)
        self.model_artifact.add_file(notebook_path)
        wandb.save(notebook_path)

        wandb.log_artifact(self.model_artifact)
    
    def train_steps(self):
        raise NotImplementedError("You need specify training steps")

    def valid_steps(self):
        raise NotImplementedError("You need specify valid steps")
    
    def load_model(self, artifact_name=""):
        with wandb.init(project="gpt_persona_bot", job_type="inference"):
            model_artifact = wandb.use_artifact(artifact_name)
            model_config = model_artifact.metadata
            model_folder = model_config['saved_path'] 
            self.model = AutoModelForCausalLM.from_pretrained(model_folder)
            self.tokenizer = AutoTokenizer.from_pretrained(model_folder)
            self.model.to(self.device)
            self.free_gpu_cache()
    
    def free_gpu_cache(self):
        print("Initial GPU Usage")
        gpu_usage()                             

        torch.cuda.empty_cache()

        cuda.select_device(0)
        cuda.close()
        cuda.select_device(0)

        print("GPU Usage after emptying the cache")
        gpu_usage()

    def test(self, artifact_name=""):
        raise NotImplementedError("You need specify test steps")


class Experiment(BaseExperiment):
    def __init__(self, **kwargs): 
        super(Experiment, self).__init__(**kwargs)
    
    def train_steps(self):
        self.model.train()
        interval = self.experiment_config['check_interval']
        
        for batch, (X, y) in enumerate(self.dataloader_train):
            # Send data to training device
            X, y = X.to(self.device), y.to(self.device)
            
            # Compute prediction error
            loss = self.model(X, labels=X).loss
            perplexity = torch.exp(torch.tensor(loss.item()))
            # Backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            if self.sheduler != None:
                self.sheduler.step()
            
            # Progress output
            if batch % interval == 0:
                wandb.log({"train_loss": loss.item()})
                wandb.log({"train_perplexity": perplexity})

    def valid_steps(self):
        self.model.eval()
        test_loss, correct = 0, 0
        num_batches = len(self.dataloader_valid)
        size = len(self.dataloader_valid.dataset)

        with torch.no_grad():
            for X, y in self.dataloader_valid:
                X, y = X.to(self.device), y.to(self.device)
                loss = self.model(X, labels=y).loss
                test_loss += loss.item()

        test_loss /= num_batches
        # correct /= size
        perplexity = torch.exp(torch.tensor(test_loss))
        
        wandb.log({"val_loss": test_loss})
        wandb.log({"valid_perplexity": perplexity})
    
    @staticmethod
    def last_index(array, elem):
        return len(array) - 1 - array[::-1].index(elem)

    @staticmethod
    def test(artifact_name="", persona="", user_inputs=None, interact=False, cuda=False):
        with wandb.init(project="gpt_persona_bot", job_type="inference"):
            model_folder = ""
            if ':' in artifact_name:
                model_artifact = wandb.use_artifact(artifact_name)
                model_dir = model_artifact.download()
                model_config = model_artifact.metadata
                model_folder = model_config['saved_path'] 
            else:
                model_folder = artifact_name
            print(model_folder)
            model = AutoModelForCausalLM.from_pretrained(model_folder)
            model.eval()
            tokenizer = AutoTokenizer.from_pretrained(model_folder)
            device = torch.device("cuda") if cuda else torch.device("cpu")
            model = model.to(device)
            print("Start conversation")
            print(f"Persona: {persona}")
            persona = f"{SPECIAL_TOKENS['<persona>']}{persona}{SPECIAL_TOKENS['</persona>']}"
            # persona = f"{SPECIAL_TOKENS['<persona>']}{persona}"
            persona_ids = tokenizer.encode(persona, return_tensors='pt')
            persona_ids = persona_ids.to(device)
            VOCAB_TOKENS = tokenizer.get_added_vocab()

            last_index = 0
            steps = len(user_inputs)
            history = []
            
            if interact:
                steps = 15
            # global_step
            for step in range(steps):
                # print("-"*40,step)
                if interact:
                    user_input = input()
                else:
                    user_input = user_inputs[step]
                print(f"User: {user_input}")

                user_input = f"{SPECIAL_TOKENS['<sp_1>']}{user_input}{SPECIAL_TOKENS['</sp_1>']}{SPECIAL_TOKENS['<sp_2>']}"
                history.append(user_input)
                new_user_input_ids = tokenizer.encode(user_input, return_tensors='pt')
                new_user_input_ids = new_user_input_ids.to(device)
                history_chat = "".join(history[-3:])
                
                history_ids = tokenizer.encode(history_chat, return_tensors='pt')
                history_ids = history_ids.to(device)
                bot_input_ids = torch.cat([persona_ids, history_ids], dim=-1)

                # generated a response while limiting the total chat history to 1000 tokens, 
                model_response = model.generate(
                    bot_input_ids, 
                    max_length=350,
                    pad_token_id=tokenizer.eos_token_id,  
                    do_sample=True, 
                    num_beams=2, 
                    temperature = 0.95,
                    top_k=100, 
                    top_p=0.95,
                )

                model_response = model_response.to(device)
                model_response_list = list(model_response[0])
                
                end_speaker_index = Experiment.last_index(model_response_list, VOCAB_TOKENS['</sp_2>'])
                model_response = model_response[:, :end_speaker_index+1]

                chat_history_ids = model_response
                bot_response_decode = tokenizer.decode(chat_history_ids[0][len(bot_input_ids[0])-1:], skip_special_tokens=True) 
                last_history = history[-1]
                last_history = f"{last_history}{bot_response_decode}{SPECIAL_TOKENS['</sp_2>']}"
                
                history[-1] = last_history
                print(f"Bot: {bot_response_decode}")
                # print(history)
    @staticmethod
    def test_with_ranking(
            artifact_name="", 
            persona="", 
            user_inputs=None, 
            interact=False, 
            cuda=False,
            sentence_ranker=None,
            threshhold=0.35,
            persona_amount_sentences=3,
            beam=2
        ):
        
        model_folder = artifact_name
        print(model_folder)
        model = AutoModelForCausalLM.from_pretrained(model_folder)
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(model_folder)
        device = torch.device("cuda") if cuda else torch.device("cpu")
        model = model.to(device)
        print("Start conversation")
        VOCAB_TOKENS = tokenizer.get_added_vocab()

        steps = len(user_inputs)
        history = []
        
        if interact:
            steps = 15
        for step in range(steps):
            if interact:
                user_input = input()
            else:
                user_input = user_inputs[step]
            # get more relevant persona pieces
            persona, max_prob = sentence_ranker.rank_sentences([user_input], k=persona_amount_sentences)
            persona = " ".join(persona)

            print(f"Persona: {persona}")
            print(f"Dreaming: {True if max_prob < threshhold else False} - {max_prob} ")
            persona = f"{SPECIAL_TOKENS['<persona>']}{persona}{SPECIAL_TOKENS['</persona>']}"

            persona_ids = tokenizer.encode(persona, return_tensors='pt')
            persona_ids = persona_ids.to(device)
            
            print(f"User: {user_input}")

            user_input = f"{SPECIAL_TOKENS['<sp_1>']}{user_input}{SPECIAL_TOKENS['</sp_1>']}{SPECIAL_TOKENS['<sp_2>']}"
            history.append(user_input)

            new_user_input_ids = tokenizer.encode(user_input, return_tensors='pt')
            new_user_input_ids = new_user_input_ids.to(device)

            history_chat = "".join(history[-3:])
            print()
            history_ids = tokenizer.encode(history_chat, return_tensors='pt')
            history_ids = history_ids.to(device)

            bot_input_ids = torch.cat([persona_ids, history_ids], dim=-1)
            if max_prob > threshhold:
                model_response = model.generate(
                    bot_input_ids, 
                    max_length=250,
                    pad_token_id=tokenizer.eos_token_id,  
                    do_sample=True, 
                    num_beams=beam, 
                    temperature = 0.95,
                    top_k=100, 
                    top_p=0.95,
                )
            else:
                model_response = model.generate(
                    bot_input_ids, 
                    max_length=250,
                    pad_token_id=tokenizer.eos_token_id,  
                    do_sample=True, 
                    # num_beams=3, 
                    temperature = 0.95,
                    top_k=100, 
                    top_p=0.95,
                )

            model_response = model_response.to(device)
            model_response_list = list(model_response[0])
            end_speaker_index = len(bot_input_ids[0]) + model_response_list[len(bot_input_ids[0])+1:].index(VOCAB_TOKENS['</sp_2>'])

            bot_response_decode = tokenizer.decode(model_response[0][len(bot_input_ids[0])-1:end_speaker_index], skip_special_tokens=True) 
            last_history = history[-1]
            last_history = f"{last_history}{bot_response_decode}{SPECIAL_TOKENS['</sp_2>']}"
            
            history[-1] = last_history
            print(f"Bot: {bot_response_decode}")
            print()


## Создаем датасет

In [5]:
persona_chat_original = pd.read_csv("./persona_chat.csv")
# persona_chat_original = persona_chat_original
# persona_chat_original = persona_chat_original[:3000]
# persona_chat_original.head(3)

In [6]:
class PersonaChatGenerator:
	def __init__(self, 
		initial_dataset=None,
		tokenizer=None
	):
		self.initial_dataset = initial_dataset
		self.processed_dataset = []
		self.tokenizer = tokenizer
		self.process_dataset()

	def process_dataset(self):
		processed_dataset = {
			"persona": [],
			"history": [],
			# "target": []
		}

		sp_1_start = SPECIAL_TOKENS['<sp_1>']
		sp_1_end = SPECIAL_TOKENS['</sp_1>']
		sp_2_start = SPECIAL_TOKENS['<sp_2>']
		sp_2_end = SPECIAL_TOKENS['</sp_2>']
		persona_start = SPECIAL_TOKENS['<persona>']
		persona_end = SPECIAL_TOKENS['</persona>']
		relu = lambda x: x if x > 0 else 0 
		for i in range(len(self.initial_dataset)):
			persona = self.initial_dataset['Persona'].iloc[i]
			chat = self.initial_dataset['chat'].iloc[i].split("\n")
			chat = chat[:-1]
			history = []
			for j in range(len(chat)):
				reply = chat[j]
				if (j+1) % 2 == 0:
					reply = f"{sp_2_start}{reply}{sp_2_end}"
					history.append(reply)
					temp_history = history[relu(j-4):j+1]
					temp_history = "".join(temp_history)
					processed_dataset['persona'].append(persona)
					processed_dataset['history'].append(temp_history)

				else:
					reply = f"{sp_1_start}{reply}{sp_1_end}"
					history.append(reply)

		dataset = pd.DataFrame(data=processed_dataset)
		return dataset

train_dataset_csv, valid_dataset_csv = train_test_split(persona_chat_original, test_size=0.01)
train_dataset_csv, valid_dataset_csv = train_dataset_csv.reset_index(), valid_dataset_csv.reset_index()
train_dataset_generator = PersonaChatGenerator(
	initial_dataset=train_dataset_csv,
	tokenizer=tokenizer
)

valid_dataset_generator = PersonaChatGenerator(
	initial_dataset=valid_dataset_csv,
	tokenizer=tokenizer
)

In [7]:
class PersonaChatDataset(Dataset):
	def __init__(self, 
		initial_dataset=None,
		tokenizer=None,
		is_validation=False
	):
		self.initial_dataset = initial_dataset
		self.tokenizer = tokenizer
		self.is_validation = is_validation
	
	def __len__(self):
		return len(self.initial_dataset)
	
	def __getitem__(self, idx):
		row = self.initial_dataset.iloc[idx]
		persona = [item.strip() for item in row['persona'].split(".") if len(item) > 0 ]
		if not self.is_validation:
			random.shuffle(persona)
		persona = [item+". " for item in persona]
		persona[-1] = persona[-1][:-1]
		persona = [SPECIAL_TOKENS['<persona>']] + persona + [SPECIAL_TOKENS['</persona>']]
		
		persona = [torch.tensor(self.tokenizer.encode(item)).flatten() for item in persona]
		persona = torch.cat([*persona])

		history = row['history']
		
		history = self.tokenizer.encode(history)
		history = torch.tensor(history).flatten()
	
		feature = torch.cat([persona, history])

		return {
			"feature": feature,
			"target": feature 
		}

train_dataset = PersonaChatDataset(
	initial_dataset=train_dataset_generator.process_dataset(),
	tokenizer=tokenizer
)

valid_dataset = PersonaChatDataset(
	initial_dataset=valid_dataset_generator.process_dataset(),
	tokenizer=tokenizer,
	is_validation=True
)

def collate(examples):
	features = [item['feature'] for item in examples]
	features = pad_sequence(features, batch_first=True)
	
	target = [item['target'] for item in examples]
	target = pad_sequence(features, batch_first=True)
	return features.to(torch.long), target.to(torch.long) 

train_dataloader = DataLoader(
    train_dataset, 
	batch_size=4, 
	collate_fn=collate, 
	drop_last = True,
	shuffle=True
)

valid_dataloader = DataLoader(
    valid_dataset, 
	batch_size=8, 
	collate_fn=collate, 
	drop_last=False,
	shuffle=False
)

In [8]:
tokenizer.decode(train_dataset[4]['feature'])

'<persona>i am bald with a thick beard. i enjoy comedies. i exercise often and have nice muscles. i work as an attorney. i wear nice clothes.</persona><sp_2>do you have a lot of them?</sp_2><sp_1>i do actually! i have 7. i am only 19 years old.</sp_1><sp_2>that is a lot. i do not have any.</sp_2><sp_1>do you like junk food? i like pizza puffs.</sp_1><sp_2>i do not. i am on a diet, gotta maintain my muscles.</sp_2>'

## тренируем модель

In [9]:
exp_config = {
    "batch_size": 4,
    "check_interval": 100,
    "epochs": 1,
    "optimizer": {
        "lr": 5e-5
    },
    "model_name": "pytorch_model",
    "saved_path": "",
    "do_weight_decay": False,
    "weight_decay": 0.0,
    "freeze_layers": 2
}

exp_config["sheduler"] = {
    # "max_lr": 0.01, 
    # "steps_per_epoch": len(train_dataloader), 
    # "epochs": exp_config["epochs"]
    # "step_size": 25
    "num_warmup_steps": 1000,
    "num_training_steps": len(train_dataloader)

}
model =  AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium", config=config, ignore_mismatched_sizes=True)
# model =  AutoModelForCausalLM.from_pretrained("/home/dimweb/sandbox/persona_bot/gpt_persona_v1/wandb/run-20220708_230130-aurwatvq_local")
model.resize_token_embeddings(len(tokenizer))
# model = GPT2LMModel(**exp_config['model_args'])

# не хочу создавать глобальные переменные 
exp_params = {
    "model": model, 
    "tokenizer": tokenizer,
    "dataloader_train": train_dataloader,
    "dataloader_valid": valid_dataloader,
    "dataloader_test": valid_dataloader,
    "loss_func_class": nn.CrossEntropyLoss,
    "estimate_func_class": nn.CrossEntropyLoss,
    "experiment_config": exp_config,
    "optimizer_class": torch.optim.Adam,
    "sheduler_class": None,
    "notebook_name": "gpt_persona_v1.ipynb",
    "project_name": "gpt_persona_bot",
    "name_run": "persona_gpt",
    "model_description": "Уменьшил до 2 блоков",
    "do_unit_tests": True,
}

experiment_test = Experiment(**exp_params)

Using device cuda
tests ok


In [10]:
len(train_dataloader)

16264

In [11]:
experiment_test.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdimweb[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch: 0




train end


VBox(children=(Label(value='0.048 MB of 0.048 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_loss,█▅▇▅▅▅▆▆▃▄▅▅▆▆▆▅▅▄▅▄▆▅▆▄▃▄▄▄▂▃▄▅▅▄▁▅▄▂▄▃
train_perplexity,█▄▆▃▄▄▅▅▂▃▄▄▄▅▅▄▄▃▄▃▅▃▄▃▂▃▂▃▂▂▃▄▃▃▁▄▃▁▃▂
val_loss,▁
valid_perplexity,▁

0,1
train_loss,1.53061
train_perplexity,4.621
val_loss,1.38318
valid_perplexity,3.98755


In [10]:
print(valid_dataset_csv.iloc[0]['chat'])

what are you up to this evening ?
air in my area is fresh that is why i play sports
did you just come in from playing sports ?
i am working out now , and drinking water
i just got done sewing a new shirt
no , i am working out now , and watching tv
i have no time for tv being an art teacher
that is nice of you , i am getting a degree at school now
oh really what are you getting your degree in
it is good to be busy sometimes , keeps you focus
it really does which is why i love to sew my own clothing
i am working on psychology because i had 2 car crashes in past
oh wow do you like to visit thrift shops ?
it affects my driving now , because of fear .



In [10]:
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('nli-distilroberta-base-v2')

persona = open("./persona_sentences_2.txt").read()

class SentenceRanker:
	def __init__(self, persona_sentences=None, sentence_model=None):
		self.persona_sentences = persona_sentences
		self.sentence_model = sentence_model
		self.sentence_embeddings = self.sentence_model.encode(
			persona_sentences, 
			convert_to_tensor=True
		)
		self.ranked_sentences = {}
	
	def rank_sentences(self, query, k=5):
		key = f"{query}_{k}"
		if self.ranked_sentences.get(key, False):
			return self.ranked_sentences[key]
		user_sentence_embeddings = sentence_model.encode(query, convert_to_tensor=True)

		cos_sim_ranks = self.cos_sim(
			user_sentence_embeddings,
			self.sentence_embeddings
		)
		
		top_indices = torch.argsort(cos_sim_ranks, descending=True)
		max_prob = cos_sim_ranks[top_indices][0]
		top_indices = list(top_indices[:k].cpu().numpy())
		similar_sentences = [self.persona_sentences[idx] for idx in top_indices]
		self.ranked_sentences[key] = similar_sentences 
		return similar_sentences, max_prob
	
	def cos_sim(self, a, b):
		a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
		b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
		return torch.sum(a_norm * b_norm, dim=1)


persona_sentences = persona.split("\n")
persona_sentences = [item.strip() for item in persona_sentences if len(item) > 0]
sentence_ranker = SentenceRanker(
	persona_sentences=persona_sentences,
	sentence_model=sentence_model
)


number_example = 16
persona = valid_dataset_csv['Persona'][number_example]

user_inputs = [
	"What do you think about dogs?",
    "Hi. What is your name?",
    "What do you like?",
    "What is your job?",
	"Tell me about yourself please.",
	"Where is your dad?",
]

# user_inputs = [item for i, item in enumerate(valid_dataset_csv.iloc[number_example]['chat'].split("\n")) if len(item) and (i+1) % 2 != 0]

Experiment.test_with_ranking(
	# artifact_name="/data/home/dimweb/sandbox/persona_bot/gpt_persona_v1/wandb/run-20220716_201842-1ezhlysz_local",
	artifact_name="/data/home/dimweb/sandbox/persona_bot/gpt_persona_v1/wandb/run-20220710_230742-8ryub57u_local",
	# artifact_name="/data/home/dimweb/sandbox/persona_bot/gpt_persona_v1/wandb/run-20220711_171905-u49i12fh_local",
	persona=persona,
	user_inputs=user_inputs,
	interact=False,
	cuda=True,
	sentence_ranker=sentence_ranker,
	threshhold=0.5,
	persona_amount_sentences=3,
	beam=2
)

/data/home/dimweb/sandbox/persona_bot/gpt_persona_v1/wandb/run-20220710_230742-8ryub57u_local
Start conversation
Persona: I would love to have a dog in future. I love dogs so much. Dogs are the best friends. I do not have a dog yet.
Dreaming: True - 0.4889436960220337 
User: What do you think about dogs?


ValueError: 50260 is not in list

## Ranking paragraphs

In [5]:
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('nli-distilroberta-base-v2')

persona = open("./persona_sentences.txt").read()

class SentenceRanker:
	def __init__(self, persona_sentences=None, sentence_model=None):
		self.persona_sentences = persona_sentences
		self.sentence_model = sentence_model
		self.sentence_embeddings = self.sentence_model.encode(
			persona_sentences, 
			convert_to_tensor=True
		)
		self.ranked_sentences = {}
	
	def rank_sentences(self, query, k=5):
		key = f"{query}_{k}"
		if self.ranked_sentences.get(key, False):
			return self.ranked_sentences[key]
		user_sentence_embeddings = sentence_model.encode(query, convert_to_tensor=True)

		cos_sim_ranks = self.cos_sim(
			user_sentence_embeddings,
			self.sentence_embeddings
		)
		
		top_indices = torch.argsort(cos_sim_ranks, descending=True)
		max_prob = cos_sim_ranks[top_indices]
		top_indices = list(top_indices[:k].cpu().numpy())
		similar_sentences = [self.persona_sentences[idx] for idx in top_indices]
		self.ranked_sentences[key] = similar_sentences 
		return similar_sentences, max_prob
	
	def cos_sim(self, a, b):
		print(a.shape, b.shape)
		a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
		b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
		return torch.sum(a_norm * b_norm, dim=1)


persona_sentences = persona.split("\n")
persona_sentences = [item for item in persona_sentences if len(item) > 0]
sentence_ranker = SentenceRanker(
	persona_sentences=persona_sentences,
	sentence_model=sentence_model
)

user_sentence = [
	"Tell me about yourself please."
]

sentence_ranker.rank_sentences(
	user_sentence, 5
)


(['If I could have any superpower, it would be to speak any language so I could connect with anyone in the world.',
  'One of my hidden talents is negotiating with others, which I believe is what makes me a strong sales representative.',
  'My favourite subject in school was English, which is why I decided to become a writer.',
  "When I was younger, I wanted to be a pilot, which led me to later earn my pilot's license.",
  'My favourite part about my job is pitching unique advertising campaign ideas that help small businesses stand out from competitors.'],
 tensor([ 0.3229,  0.2916,  0.2554,  0.2393,  0.2238,  0.2046,  0.1786,  0.1779,
          0.1743,  0.1632,  0.1603,  0.1571,  0.1568,  0.1431,  0.1418,  0.1247,
          0.1121,  0.1099,  0.0970,  0.0930,  0.0860,  0.0735,  0.0626,  0.0546,
          0.0409,  0.0323,  0.0272, -0.0027, -0.0079, -0.0209], device='cuda:0'))