In [68]:
import re
import pandas as pd

from tqdm.auto import tqdm

In [69]:
SEED = 42

In [70]:
prompt_dataset = pd.read_csv('1M_midjourney_prompts.csv')

prompt_dataset.rename(columns={
    'Mind Manifesting poseidon in ocean, holding trident, sharks, rpg style, ': 'prompt',
    'Random, Pose, Old, Style, Mind Manifesting': 'keyword'
    },inplace=True)

In [71]:
prompt_dataset.head()

Unnamed: 0,prompt,keyword
0,Diagram Make The Starling Night paintings real...,"Random, 4K, Reality, Paint, Painting, Cinemati..."
1,"Magical Girl cute kitten wearing suit, teachin...","Random, Cartoon, Ground, Art, Bright, Style, C..."
2,"Frank Martin sleeve tattoo , wave, style of Ho...","Random, Tattoo, Sleeve, Style, Dragon, Frank M..."
3,Close Face /imagine futuristic magazine ad for...,"Random, Zine, Futuristic, Pan, Close Face"
4,Otaku wide shot of a huge red giant star in sp...,"Random, In The Distance, Silhouette, Spaceship..."


In [72]:
len(prompt_dataset)

999999

In [73]:
def get_keywords(keyword):
    return [[k.strip().lower() for k in keyword.split(', ') if k != 'Random' and k != '']]

In [74]:
prompt_dataset['option'] = prompt_dataset['keyword'].apply(lambda x: get_keywords(x))
prompt_dataset.drop(['keyword'], axis=1, inplace=True)

In [75]:
prompt_dataset.head()

Unnamed: 0,prompt,option
0,Diagram Make The Starling Night paintings real...,"[[4k, reality, paint, painting, cinematic, woo..."
1,"Magical Girl cute kitten wearing suit, teachin...","[[cartoon, ground, art, bright, style, cute, c..."
2,"Frank Martin sleeve tattoo , wave, style of Ho...","[[tattoo, sleeve, style, dragon, frank martin]]"
3,Close Face /imagine futuristic magazine ad for...,"[[zine, futuristic, pan, close face]]"
4,Otaku wide shot of a huge red giant star in sp...,"[[in the distance, silhouette, spaceship, wide..."


In [76]:
for i, row in prompt_dataset.iterrows():
    if i > 3: break

    print('Prompt:', row['prompt'])
    print('Option:', row['option'])
    
    print('\n')

Prompt: Diagram Make The Starling Night paintings reality, in the woods, 4k , cinematic, 
Option: [['4k', 'reality', 'paint', 'painting', 'cinematic', 'wood', 'diagram']]


Prompt: Magical Girl cute kitten wearing suit, teaching in class room, bright light, clear background, cartoon style, v5, 
Option: [['cartoon', 'ground', 'art', 'bright', 'style', 'cute', 'cut', 'background', 'magical girl']]


Prompt: Frank Martin sleeve tattoo , wave, style of Hokusai, cherry blossoms, setting sun, D&d dragon, Caffeine molecular compound, 
Option: [['tattoo', 'sleeve', 'style', 'dragon', 'frank martin']]


Prompt: Close Face /imagine futuristic magazine ad for a tech company called Moon, 
Option: [['zine', 'futuristic', 'pan', 'close face']]




In [77]:
def get_option_dist(df):
    option_dict = {}

    for i, row in tqdm(df.iterrows(), total=len(df)):
        for k in row['option']:
            if k in option_dict:
                option_dict[k] += 1
                
            else:
                option_dict[k] = 1
    
    
    sorted_option = sorted(option_dict.items(), key=lambda item: item[1], reverse=True)
    
    print('Top 5 options:', sorted_option[:5])
    print('len:', len(sorted_option))
    print('\n')

In [78]:
from datasets import Dataset
prompt_dataset = Dataset.from_pandas(prompt_dataset)

prompt_dataset = prompt_dataset.train_test_split(test_size=0.1, seed=SEED)

# Split the train set further into train and validation sets (90% train, 10% validation)
prompt_dataset['train'] = prompt_dataset['train'].train_test_split(test_size=0.1, seed=SEED)

# Access the resulting datasets
train_dataset = prompt_dataset['train']['train'].to_pandas()
valid_dataset = prompt_dataset['train']['test'].to_pandas()
test_dataset = prompt_dataset['test'].to_pandas()

In [79]:
train_dataset.to_json('train.json', indent=4, force_ascii=False, orient='records')
valid_dataset.to_json('valid.json', indent=4, force_ascii=False, orient='records')
test_dataset.to_json('test.json', indent=4, force_ascii=False, orient='records')

In [80]:
print(len(train_dataset), len(valid_dataset), len(test_dataset))

809999 90000 100000


In [81]:
# get_option_dist(train_dataset)
# get_option_dist(valid_dataset)
# get_option_dist(test_dataset)

In [82]:
train_dataset.head()

Unnamed: 0,prompt,option
0,"Jasper Johns In the Astral Library, I fell dow...","[[ground, fur, dark, jasper johns]]"
1,Sharpness buffalo having romantic adventure wi...,"[[photo, ray, noise, drama, coarse, neon, dram..."
2,"Mystical Wes Anderson style, portrait photogra...","[[photo, 8k, photography, style, portrait phot..."
3,"Alternate Reality small animals with lion,","[[lion, alternate reality]]"
4,"Comic Book Art tall lantern in the park by day,",[[comic book art]]
