In [1]:
from datasets import load_dataset
from collections import Counter
import numpy as np
from data_collection import training_set
from transformers import GPT2Tokenizer
import pickle
import torch

In [2]:
romantic_data = load_dataset("AlekseyKorshuk/synthetic-romantic-characters")["train"]
friendly_data = load_dataset("AlekseyKorshuk/synthetic-friendly-characters")["train"]
fight_data = load_dataset("AlekseyKorshuk/synthetic-fight-characters")["train"]

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
train_data = training_set(["romantic"], tokenizer)
# print(train_data)

In [8]:
tokenizer.batch_decode(train_data[1].unsqueeze(0))[0]

'social justice,romance,inspiration,romantic<|endoftext|>compassionate,socially aware,passionate about human rights,always wears a bracelet with an equality symbol,quotes influential activists<|endoftext|>*Santiago walks up to you, wearing a shirt with an activist quote on it.* Hey there, I couldn’t help but notice you sitting here alone. Mind if I join you? *He smiles warmly and sits down.* I’m Santiago, by the way. What’s your name?\n*smile back* Hi Santiago, nice to meet you. My name is [Your Name].\n*Santiago nods in understanding* I see. Well, it’s a pleasure to meet you, [Your Name]. I’m an activist myself, fighting for social justice and equality. I believe that every person deserves the same opportunities in life, regardless of their race, gender, or background. It’s something that truly resonates with me, and I’m always eager to share my passion with others.\n*nodding* That’s amazing, it’s great to see people fighting for a cause they believe in.\n*Santiago smiles* Thank you, 

In [19]:
dialogue_dataset = []
for dataset in [romantic_data, friendly_data, fight_data]:
    for conversation in dataset:
        dialogue_dataset.append(conversation)
    
print(list(dialogue_dataset[0].keys()))

['name', 'categories', 'personalities', 'description', 'conversation']


In [20]:
def count_data(name, dataset):
    category_counter = Counter()
    personality_counter = Counter()
    for conversation in dataset:
        category_counter.update([c.lower() for c in conversation["categories"]])
        personality_counter.update([p.lower() for p in filter(lambda p: " " not in p, conversation["personalities"])])

    print(f"In {name}, there are {len(dataset)} conversations, {len(category_counter)} conversation categories, and {len(personality_counter)} types of personalities.")
    print("5 most common categories:")
    for name, count in category_counter.most_common(5):
        print("\t" + str(count) + "\t" + str(name))
    print("5 most common personalities:")
    for name, count in personality_counter.most_common(5):
        print("\t" + str(count) + "\t" + str(name))
    print()
    

for name, dataset in zip(["total", "romantic_data", "friendly_data", "fight_data"], [dialogue_dataset, romantic_data, friendly_data, fight_data]):
    count_data(name, dataset)

In total, there are 17668 conversations, 1565 conversation categories, and 777 types of personalities.
5 most common categories:
	3796	romance
	2574	fantasy
	1621	adventure
	1578	comedy
	1337	entertainment
5 most common personalities:
	1582	adventurous
	1211	empathetic
	970	mysterious
	956	calm
	805	creative

In romantic_data, there are 5744 conversations, 582 conversation categories, and 244 types of personalities.
5 most common categories:
	3645	romance
	754	fantasy
	752	travel
	740	music
	591	art
5 most common personalities:
	774	adventurous
	558	creative
	507	charming
	475	imaginative
	470	charismatic

In friendly_data, there are 3871 conversations, 531 conversation categories, and 266 types of personalities.
5 most common categories:
	689	support
	671	entertainment
	495	comedy
	440	education
	352	wellness
5 most common personalities:
	682	empathetic
	369	compassionate
	363	adventurous
	359	calm
	339	curious

In fight_data, there are 8053 conversations, 1169 conversation categories

In [30]:
print("Personality statistics:")
personality_counts = [len(item["personalities"]) for item in dialogue_dataset]
print(np.mean(personality_counts), np.std(personality_counts), min(personality_counts), max(personality_counts))

print("\nCategory statistics:")
category_counts = [len(item["categories"]) for item in dialogue_dataset]
print(np.mean(category_counts), np.std(category_counts), min(category_counts), max(category_counts))

print("\nConversation statistics:")
line_counts = [len(item["conversation"]) for item in dialogue_dataset]
print(min(line_counts), max(line_counts))
line_lengths = [len(line) for item in dialogue_dataset for line in item]
print(np.mean(line_lengths), np.std(line_lengths), min(line_lengths), max(line_lengths))
conversation_lengths = [sum(len(line) for line in item) for item in dialogue_dataset]
print(np.mean(conversation_lengths), np.std(conversation_lengths), min(conversation_lengths), max(conversation_lengths))

print("\nTotal length statistics:")
char_lengths = []
for item in dialogue_dataset:
    temp = sum(len(line["content"]) for line in item["conversation"])
    temp += sum(len(pers) for pers in item["personalities"])
    temp += sum(len(cat) for cat in item["categories"])
    char_lengths.append(temp)
print(np.mean(char_lengths), np.std(char_lengths), min(char_lengths), max(char_lengths))

token_lengths = []
for item in dialogue_dataset:
    temp = sum(len(line["content"].split(" ")) for line in item["conversation"])
    temp += sum(len(pers.split(" ")) for pers in item["personalities"])
    temp += sum(len(cat.split(" ")) for cat in item["categories"])
    token_lengths.append(temp)
print(np.mean(token_lengths), np.std(token_lengths), min(token_lengths), max(token_lengths))

Personality statistics:
4.364161195381481 0.591856041784797 2 10

Category statistics:
2.91283676703645 0.2858611286202068 2 4

Conversation statistics:
7 7
10.0 3.1622776601683795 4 13
50.0 0.0 50 50

Total length statistics:
1715.6083314466832 466.0260407307866 473 4622
312.28656327824314 84.09137336614894 82 792


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
test = training_set(["romantic", "friendly", "fight"], tokenizer)

Found cached dataset parquet (/home/linulun/.cache/huggingface/datasets/AlekseyKorshuk___parquet/AlekseyKorshuk--synthetic-romantic-characters-3b16d8e672467bfe/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/home/linulun/.cache/huggingface/datasets/AlekseyKorshuk___parquet/AlekseyKorshuk--synthetic-friendly-characters-8195740b6ede92c1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/home/linulun/.cache/huggingface/datasets/AlekseyKorshuk___parquet/AlekseyKorshuk--synthetic-fight-characters-dbee9baf48903647/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
pickle.dump(test, open("dataset.p", "wb"))

In [8]:
print(pickle.load(open("dataset.p", "rb")))

<data_collection.DialogueDataset object at 0x728e22b6b290>


In [22]:
import torch
print(torch.tensor([1]).squeeze().dim())
print(torch.tensor([1, 1]).squeeze().dim())

0
1
