# LIMIT Dataset Generation

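This notebook provides code to generate a LIMIT-style information-retrieval dataset: a corpus of people and the items assigned to them, a set of queries, and a query-document relevance (qrels) matrix.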

## Requirements

In [51]:
import numpy as np
import pandas as pd
import random
import tqdm
import math
import json
import itertools
import requests

random.seed(42)

## Download Names from open source lists (edit this as needed)



In [52]:


# URL of the CSV file
csv_url = "https://gist.githubusercontent.com/craigh411/19a4479b289ae6c3f6edb95152214efc/raw/d25a1afd3de42f10abdea7740ed098d41de3c330/List%2520of%2520the%25201,000%2520Most%2520Common%2520Last%2520Names%2520(USA)"

# Read the CSV file into a pandas DataFrame. Column names are supplied explicitly,
# so the first line of the file is read as data; only "Surname" is used below.
common_surnames_df = pd.read_csv(csv_url, names=["Surname", "None"])
surname_list = common_surnames_df["Surname"].tolist()
print(f"First five last names: {surname_list[:5]}")


# URL of the Python file with names
python_url = "https://gist.githubusercontent.com/ruanbekker/a1506f06aa1df06c5a9501cb393626ea/raw/cef847b6402da0fe00977e7349a4dc3fbeb4df54/array-names.py"

# Download the Python file content. The timeout keeps the cell from hanging
# forever if the host is unreachable.
response = requests.get(python_url, timeout=30)
response.raise_for_status() # Raise an exception for bad status codes
python_content = response.text

# SECURITY NOTE(review): exec() runs whatever code the downloaded file contains.
# Only point python_url at a source you trust (the URL above appears to include a
# revision hash -- confirm it really pins the content). The code is executed in its
# own namespace so it cannot overwrite notebook variables, and so that a stale
# `names` left over from an earlier run cannot hide a failed load.
remote_namespace = {}
exec(python_content, remote_namespace)

# Pull out only the 'names' list defined by the downloaded file
names = remote_namespace.get("names")
if names is None:
    raise Exception("\nCould not find 'names' variable in the downloaded Python content.")
name_list = names
print("First 5 first names:", name_list[:5])

# sorted() rather than list(): the iteration order of a set of strings changes
# between interpreter sessions (string hash randomization), so anything that later
# samples from these lists with a fixed random seed would otherwise not reproduce.
unique_names = sorted(set(name_list))
unique_surnames = sorted(set(surname_list))
print(f"Found {len(unique_names)} unique names and {len(unique_surnames)} unique surnames.")

First five last names: ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones']
First 5 first names: ['Aaran', 'Aaren', 'Aarez', 'Aarman', 'Aaron']
Found 2738 unique names and 1000 unique surnames.


## Items to Like

In [53]:
items_to_like = """
Apricots
Avocados
Blackberries
Blueberries
Cantaloupes
Cherries
Coconut Flour
Cranberries
Dragon Fruits
Grapefruits
Guavas
Honeydew Melons
Kiwis
Lemons
Limes
Lychees
Mangoes
Nectarines
Papayas
Passion Fruits
Peaches
Pears
Persimmons
Plums
Pomegranates
Raspberries
Star Fruits
Strawberries
Watermelons
Artichokes
Arugula
Asparagus
Beets
Broccoli
Cabbages
Cauliflower
Poblano Peppers
Ears of Corn
Cucumbers
Eggplants
Garlic
Green Beans
Green Bell Peppers
Habaneros
Jalapenos
Kale
Leeks
Iceberg Lettuce
Romaine Lettuce
Okra
Red Onions
Sweet Onions
Peas
Radishes
Red Bell Peppers
Shallots
Spinach
Butternut Squash
Spaghetti Squash
Sweet Potatoes
Zucchini
Begonias
Black-eyed Susans
Bleeding Hearts
Carnations
Chrysanthemums
Columbines
Coneflowers
Daffodils
Dahlias
Foxgloves
Gardenias
Geraniums
Gladiolus
Hibiscus
Hydrangeas
Irises
Jasmine
Lantanas
Lavender
Lilies
Marigolds
Orchids
Pansies
Peonies
Petunias
Poppies
Roses
Shasta Daisies
Snapdragons
Sunflowers
Tulips
Violets
Zinnias
Acacia Trees
Ash Trees
Aspen Trees
Banyan Trees
Baobab Trees
Birch Trees
Cedar Trees
Cherry Blossom Trees
Cypress Trees
Dogwood Trees
Elm Trees
Fir Trees
Ginkgo Trees
Hemlock Trees
Holly Trees
Jacaranda Trees
Joshua Trees
Juniper Trees
Magnolia Trees
Maple Trees
Oak Trees
Palm Trees
Pine Trees
Poplar Trees
Redwood Trees
Spruce Trees
Sycamore Trees
Willow Trees
Tillandsia
Aloe Vera
Bamboo
Basil
Bonsai Trees
Cacti
Catnip
Chives
Cilantro
Clover
Dandelions
Dill
Ferns
Fiddle Leaf Figs
Ivy
Monstera
Moss
Oregano
Parsley
Pitcher Plants
Pothos
Rosemary
Sage
Snake Plants
Spider Plants
Thyme
Venus Flytraps
Axolotls
Betta Fish
Canaries
Chinchillas
Cockatiels
Doves
Ferrets
Finches
Gerbils
Guinea Pigs
Guppies
Hamsters
Hedgehogs
Hermit Crabs
Bearded Dragon Lizards
Gecko Lizards
Macaws
Mice
Parakeets
Grey Parrots
Holland Lops
Flemish Giants
Pygmy Rabbits
Rex Rabbit
Netherland Dwarf Rabbits
Black Scoters
Artic Loons
Mallards
Blue Swedish Ducks
Fire Ants
Carpenter Ants
Black Garden Ants
Pharaoh Ants
Argentine Ants
Pavements Ants
Ghost Ants
Grease Ants
Bulldog Bats
Horseshoe Bats
Old World Fruit Bats
Big-eared Wooly Bats
Honduran White Bats
Carpenter Bees
Honey Bees
Bumblebees
Rats
Ball Python Snakes
Corn Snakes
Sugar Gliders
Tarantulas
Tortoises
Alpacas
Bison
Donkeys
Emus
Goats
Geese
Horses
Llamas
Mules
Ostriches
Santa's Reindeer
Sheep
Turkeys
Water Buffalo
Yaks
Alligators
Anacondas
Anteaters
Armadillos
Badgers
Baleen
Black Bears
Grizzly Bears
Panda Bears
Polar Bears
Beavers
Stag Beetles
Binturongs
Boa Constrictors
Bongos
Monarch Butterflies
Capybaras
Caracals
Northern Cardinals
Centipedes
Chameleons
Cheetahs
Chimpanzees
Chipmunks
Cicadas
Clams
Cobras
Crayfish
Crickets
Crocodiles
Crows
Cuttlefish
Damselflies
White-tailed Deer
Dolphins
Dragonflies
Eagles
Earthworms
Echidnas
Elephants
Elk
Fennec Foxes
Fireflies
Flamingos
Arctic Foxes
Red Foxes
Poison Dart Frogs
Gerenuks
Giraffes
Gorillas
Grasshoppers
Hawks
Hippopotamuses
Hummingbirds
Iguanas
Jackfish
Kangaroos
Koalas
Komodo Dragons
Ladybugs
Lemurs
Lobsters
Manatees
Maned Wolves
Markhors
Meerkats
Millipedes
Moose
Carpet Moth
Mussels
Nassarius
Newts
Ocelots
Blue-ringed Octopus
Okapi
Orangutans
River Otters
Owls
Oysters
Pangolins
Peacocks
Pelicans
Penguins
Pipefish
Platypuses
Porcupines
Praying Mantises
Puffins
Timor Pythons
Quokkas
Raccoons
Rattlesnakes
Ravens
Red Pandas
Rhinoceroses
Robins
Saiga Antelopes
Salamanders
Sand Dollars
Scallops
Scorpions
Sea Anemones
Sea Lions
Sea Otters
Sea Urchins
Seahorses
Seals
Servals
Great White Sharks
Hammerhead Sharks
Whale Sharks
Giant Tiger Prawns
Skunks
Sloths
Slugs
Snails
Snow Leopards
Sparrows
Bigfin Reef Squids
Squirrels
Starfish
Stick Insects
Swans
Tapirs
Tasmanian Devils
Toads
Wallabies
Walruses
Potter Wasps
Blue Whales
Humpback Whales
Orca Whales
Artic Wolves
Wolverines
Red Wolves
Wombats
Woodpeckers
Zebras
Anglerfish
Beluga Whales
Dumbo Octopuses
Frilled Sharks
Giant Isopods
Goblin Sharks
Horseshoe Crabs
Manta Rays
Mantis Shrimp
Narwhals
Nudibranchs
Pistol Shrimp
Sea Cucumbers
Sea Pens
Sperm Whales
Stingrays
Ocean Sunfish
Vampire Squid
Voluta
Yeti Crabs
Albatrosses
Atlas Moths
Bee-eaters
Glasswing Butterflies
Goliath Beetles
Hercules Beetles
Hoopoes
Hornbills
Jewel Wasps
Kingfishers
Luna Moths
Lyrebirds
Orchid Mantises
Quetzals
Rosy Maple Moths
Secretarybirds
Toucans
Walking Leaves
Bat Flowers
Bird of Paradise Flowers
Dragon's Blood Trees
Lotus Flowers
Passion Flowers
Protea Flowers
Rafflesia Flowers
Rainbow Eucalyptus Trees
Titan Arums
Welwitschia Mirabilis
Beaches
Caves
Clouds
Deserts
Forests
Islands
Lakes
Moon
Mountains
Rain
Rivers
Snow
Sun
Sunrise
Sunset
X-Rays
Volcanoes
Waterfalls
Wind
Amaranth
Bagels
Baguettes
Barley
Buckwheat
Ciabatta
Cornmeal
Couscous
Crepes
Croissants
Millet
English Muffins
Oats
Quinoa
Brown Rice
Turmeric
White Rice
Wild Rice
Sorghum
Spelt
Teff
Naan Bread
Pita Bread
Rye Bread
Sourdough Bread
Corn Tortillas
Flour Tortillas
Fusilli
Penne
Egg Noodles
Rice Noodles
Soba Noodles
Udon Noodles
Cheerios
Corn Flakes
Anchovies
Bacon
Calamari
Catfish
Cod
Ground Beef
Haddock
Halibut
Ham
Lamb
Mackerel
Pheasant
Pork Chops
Pulled Pork
Quail Meat
Beef Ribs
Pork Ribs
Roast Beef
Salmon
Sardines
Sausages
Sea Bass
Steak
Swordfish
Tilapia
Trout
Venison
Baked Potatoes
Bibimbap
Burritos
Ceviche
Chili
Coleslaw
Curries
Dumplings
Eggs Benedict
Falafel
Fettuccine Alfredo
Fish and Chips
French Fries
French Toast
Fried Chicken
Fried Eggs
Goulash
Gravy
Grilled Cheese Sandwiches
Guacamole
Hamburgers
Hot Dogs
Hummus
Kabobs
Kimchi
Lasagna
Macaroni and Cheese
Mashed Potatoes
Meatloaf
Moussaka
Omelettes
Onion Rings
Pad Thai
Paella
Pancakes
Pesto
Pho
Pepperoni Pizzas
Hawaiian Pizzas
Vegan Cheese Pizza
Meat Lovers Pizza
Deep Dish Pizza
Poached Eggs
Poke Bowls
Pot Pies
Quiches
Ramen
Risotto
Roasted Vegetables
Salsa Dip
Samosas
Scrambled Eggs
Shepherd's Pies
Spaghetti Bolognese
Spring Rolls
Steamed Vegetables
Stir-fries
Sushi
Tacos
Tandoori Chicken
Waffles
Chicken Salads
Egg Salads
Potato Salads
Tuna Salads
Americanos
Apple Cider
Bubble Tea
Cappuccinos
Eggnog
Espresso
Horchata
Hot Chocolate
Iced Coffee
Kombucha
Macchiatos
Matcha Lattes
Mochas
Smoothies
Sparkling Water
Tonic Water
Almond Milk
Coconut Milk
Oat Milk
Soy Milk
Apple Juice
Lemonade
Limeade
Orange Juice
Pineapple Juice
Tomato Juice
Cola Soda
Ginger Ale Soda
Grape Soda
Orange Soda
Root Beer Soda
Black Tea
Chai Tea
Green Tea
Herbal Tea
Iced Tea
Oolong Tea
White Tea
Agave Nectar
Allspice
Baking Powder
Baking Soda
Barbecue Sauce
Bay Leaves
Black Beans
Butter
Cardamom
Cashews
Cayenne Pepper
Cocoa Powder
Coriander
Cornstarch
Cumin
Food Coloring
All-Purpose Flour
Garbanzo Beans
Gelatin
Ghee
Honey
Hot Sauce
Ketchup
Kidney Beans
Lard
Lentils
Macadamia Nuts
Maple Syrup
Margarine
Mayonnaise
Prepared Mustard
Nutmeg
Pecans
Black Pepper
Pinto Beans
Akbari Pistachios
Salt
Seitan
Soy Sauce
Star Anise
Brown Sugar
Powdered Sugar
White Sugar
Tempeh
Tofu
Walnuts
Worcestershire Sauce
Yeast
Canola Oil
Olive Oil
Sesame Oil
Vegetable Oil
Flax Seeds
Pumpkin Seeds
Sesame Seeds
Apple Cider Vinegar
Balsamic Vinegar
Red Wine Vinegar
White Vinegar
Cloves
Hazelnut Flavor
Vanilla
Affogatos
Alfajores
Ambrosia Salads
Animal Crackers
Apple Slices
Baklava
Banana Chips
Banana Splits
Beef Jerky
Beignets
Biscotti
White Chocolate Brownies
Cannoli
Caramel deLites
Celery with peanut butter
Cheesecake
Chips Ahoy!
Churros
Cinnamon Rolls
Clafoutis
Cobblers
Cosmic Brownies
Cottage Cheese
Club Crackers
Cream Puffs
Creme Brulee
Crumbles
Custard
Deviled Eggs
Ding Dongs
Dippin' Dots
Donuts
Eclairs
Edamame
Energy Balls
Financiers
Flan
Root Beer Floats
Frosting
Fruit Snacks
Funnel Cakes
Gelato
Gingerbread
Goldfish Crackers
Graham Crackers
Granola Bars
Halva
Ho Hos
Hot Fudge
Ice Cream Cones
Ice Cream Sandwiches
Ice Cream Sundaes
Italian Ice
Jello
Jelly
Linzer Tortes
Coconut Macaroons
Macarons
Madeleines
Malts
Marmalade
Marzipan
Milanos
Milkshakes
Mochi
Moon Pies
Mousse
Sweet Muffins
Nougat
Nutty Buddies
Nutter Butters
Olives
Oreos
Panna Cotta
Parfaits
Pickles
Pita Chips
Popcorn
Popsicles
Potato Chips
Pretzels
Protein Bars
Rice Cakes
Rice Krispie Treats
Scones
Seaweed Snacks
Shaved Ice
Shortbread
Slushies
Sorbet
Souffles
Sprinkles
Strudels
Swiss Rolls
Tagalongs
Tarts
Thin Mints
Tiramisu
Tortilla Chips
Trail Mix
Trifles
Turkish Delight
Turnovers
Twinkies
Veggie Straws
Whipped Cream
Whoopie Pies
Coconut Yogurt
Zebra Cakes
Angel Food Cakes
Black Forest Cakes
Bundt Cakes
Carrot Cakes
Coffee Cakes
Fruitcakes
Icebox Cakes
Layer Cakes
Pound Cakes
Red Velvet Cakes
Sheet Cakes
Sponge Cakes
Upside-Down Cakes
Candy Canes
Caramel Candies
Cotton Candy
Jelly Beans
Licorice
Lollipops
Marshmallows
Toffee
Truffles
Dark Chocolate
Milk Chocolate
Chewing Gum
Apple Pies
Key Lime Pies
Oatmeal Creme Pies
Pecan Pies
Pumpkin Pies
Bread Puddings
Chia Seed Puddings
Rice Puddings
Tapioca Puddings
Caramel Sauce
Chocolate Chip Cookies
Oatmeal Raisin Cookies
Sugar Cookies
Frozen Yogurt
Asafoetida
Caraway
Celery Seed
Chervil
Chinese Five Spice
Fennel
Fenugreek
Garam Masala
Herbes de Provence
Juniper Berries
Mace
Marjoram
Mustard Seed
Green Peppercorns
Pink Peppercorns
Sichuan Peppercorns
White Pepper
Poppy Seed
Saffron
Smoked Paprika
Sumac
Za'atar
Arepas
Bammy
Blue Cheese
Brie
Brioche
Camembert
Challah
Cornbread
Feta Cheese
Focaccia
Goat Cheese
Gouda
Halloumi
Havarti
Injera
Lavash
Manchego
Mascarpone
Muenster
Multigrain Bread
Paneer
Potato Bread
Provolone
Ricotta
Acrobatics
Aerobics
Aerial Silks
Archery
Backpacking
Badminton
Barre
Basketball
Billiards
BMX Biking
Bocce Ball
Bodyboarding
Bowling
Boxing
Calisthenics
Canoeing
Cheerleading
Cricket
Croquet
CrossFit
Curling
Darts
Disc Golf
Equestrianism
Fencing
Fishing
Freediving
Ultimate Frisbee
Handball
Field Hockey
Ice Hockey
Jet Skiing
Judo
Karate
Kayaking
Kickboxing
Kitesurfing
Lacrosse
Marathon Running
Obstacle Course Racing
Parkour
Pilates
Power Walking
Rock Climbing
Roller Skating
Rowing
Rugby
Sailing
Scootering
Scuba Diving
Marksmanship
Shuffleboard
Skimboarding
Snorkeling
Snowboarding
Snowshoeing
Soccer
Softball
Spinning
Stair Climbing
Stand-up Paddleboarding
Lap Swimming
Synchronized Swimming
Table Tennis
Tae Kwon Do
Tai Chi
Tobogganing
Trail Running
Trampolining
Triathlons
Tumbling
Volleyball
Wakeboarding
Water Polo
Water Skiing
Weightlifting
Windsurfing
Wrestling
Yoga
Zumba
Ballroom Dancing
Hip Hop Dancing
Pole Dancing
Salsa Dancing
Swing Dancing
Tango Dancing
Tap Dancing
Figure Skating
Speed Skating
Mountain Biking
Road Cycling
Track Cycling
Cross-country Skiing
Inline Skating
Longboarding
Apples to Apples
Arcade Games
Backgammon
Bananagrams
Battleship
Bingo
Bridge
Canasta
Carcassonne
Cards Against Humanity
Catan
Charades
Checkers
Chess
Clue
Codenames
Connect Four
Cribbage
Dominoes
Dungeons & Dragons
Escape Rooms
Euchre
Exploding Kittens
The Game of Life
Gin Rummy
Go
Hearts
Jenga
Juggling
Kite Flying
LARPing
Lotteries
Magic: The Gathering
Mahjong
Mazes
Monopoly
Pandemic
Phase 10
Pictionary
Pinball
Poker
Riddles
Risk
Rubik's Cubes
Scrabble
Secret Hitler
Skip-Bo
Solitaire
Spades
Telestrations
Ticket to Ride
Trivia
Twister
Uno
Warhammer 40
Werewolf
Word Searches
Yahtzee
Yo-yos
Crossword Puzzles
Jigsaw Puzzles
Sudoku Puzzles
Action Video Games
Adventure Video Games
Platformer Video Games
Puzzle Video Games
Racing Video Games
Role-Playing Video Games
Sandbox Video Games
Simulation Video Games
Sports Video Games
Strategy Video Games
Antiquing
Architecture
Astrophotography
Batik
Beatboxing
Birdwatching
Block Printing
Blogging
Bookbinding
Candle Making
Card Making
Collage
Composing Music
Cosplay
Crocheting
Cross-stitching
Digital Art
DJing
Embroidery
Etching
Fashion Design
Filmmaking
Flower Arranging
Game Design
Genealogy
Geocaching
Glassblowing
Graffiti Art
Graphic Design
Homebrewing
Improvisational Theater
Interior Design
Knitting
Landscaping
Learning a New Language
Leatherworking
Lithography
Macrame
Magic Tricks
Mentoring
Metalworking
Mixology
Model Building
Mosaic
Mural Painting
Music Production
Nail Art
Origami
Acrylic Painting
Oil Painting
Watercolor Painting
Papier-mache
Playwriting
Poetry Slams
Podcasting
Pottery
Puppetry
Reading
Screen Printing
Screenwriting
Scrapbooking
Sculpting
Sewing
Singing
Sketching
Soap Making
Songwriting
Stained Glass
Stand-up Comedy
Stargazing
Storytelling
Streaming
3D Modeling
Thrifting
Tie-dye
Traveling
Turntablism
Tutoring
Urban Exploration
Ventriloquism
Videography
Vlogging
Volunteering
Weaving
Wood Carving
Wood Burning
Poetry Writing
Architectural Photography
Fashion Photography
Food Photography
Landscape Photography
Macro Photography
Photojournalism
Portrait Photography
Street Photography
East Asian Calligraphy
Western Calligraphy
Beachcombing
Mason Bees
Building Sandcastles
Building Snowmen
Bushcraft
Cloud Watching
Composting
Flying Drones
Foraging
Fossil Hunting
Having a snowball fight
Herbalism
Homesteading
Knot Tying
Leaf Peeping
Magnet Fishing
Making Snow Angels
Metal Detecting
Mushroom Hunting
Nature Journaling
Picnicking
Rock Balancing
Rock Collecting
Sunbathing
Visiting Aquariums
Visiting Botanical Gardens
Visiting National Parks
Visiting Zoos
Whittling
Agriculture Science
Anthropology
Archaeology
Art History
Astronomy
Botany
Cartography
Chemistry
Classics
Cognitive Science
Computer Science
Criminology
Economics
Education
Environmental Science
Ethics
Forestry
Genetics
Geography
Geology
Immunology
International Relations
Law
Marine Biology
Mathematics
Meteorology
Microbiology
Music Theory
Neuroscience
Oceanography
Paleontology
Performing Arts Studies
Pharmacology
Philosophy
Physics
Political Science
Psychology
Public Health
Robotics
Social Work
Sociology
Statistics
Theology
Toxicology
Urban Studies
Zoology
Aerospace Engineering
Biomedical Engineering
Chemical Engineering
Civil Engineering
Electrical Engineering
Environmental Engineering
Mechanical Engineering
Action Movies
Adventure Movies
Animated Movies
Autobiographies
Biographies
Children's Literature
Comedy Literature
Comedy Movies
Cooking Shows
Documentary Films
Documentary Series
Drama Literature
Drama Movies
Dystopian Fiction
Essays
Fables
Fairy Tales
Family Movies
Fantasy Literature
Fantasy Movies
Folklore
Game Shows
Historical Fiction
Horror Literature
Horror Movies
Legal Dramas
Legends
Medical Dramas
Memoirs
Musicals
Mystery Literature
Mythology
Police Procedurals
Reality Competitions
Romance Literature
Satire
Sci-Fi Movies
Science Fiction Literature
Sitcoms
Talk Shows
Thriller Literature
Thriller Movies
Tragedy Literature
Travelogues
True Crime
Western Movies
Young Adult Fiction
A cappella music
Abstract Expressionism
Alternative Rock
Ambient Music
Art Deco
Art Nouveau
Barbershop quartets
Baroque Art
Baroque Music
Bluegrass Music
Blues Music
Byzantine Art
Conceptual Art
Concertos
Country Music
Cubism
Dadaism
Disco Music
Electronic Music
Fauvism
Film scores
Folk Music
Fugues
Funk Music
Futurism
Gospel Music
Gothic Art
Gregorian chants
Heavy Metal Music
Hip Hop Music
House Music
Impressionism
Indie Music
Industrial music
Lo-fi music
Madrigals
Minimalism
Minimalist music
Neoclassicism
New-age music
Opera
Photorealism
Pointillism
Pop Art
Pop Music
Post-Impressionism
Punk Rock
R&B Music
Reggae Music
Renaissance Art
Rococo Art
Romantic Music
Romanticism
Sea shanties
Sonatas
Soul Music
Surrealism
Symphonies
Techno Music
Vaporwave
Video game music
World music
The Age of Discovery
The Age of Enlightenment
Ancient Egypt
Ancient Greece
The Aztec Empire
The Belle Epoque
The Bronze Age
The Counterculture of the 1960s
The Digital Age
Early Modern Period
The Elizabethan Era
Feudal Japan
The Gilded Age
The Golden Age of Piracy
The Hanseatic League
The Inca Empire
The Industrial Revolution
The Information Age
The Iron Age
Late Antiquity
The Mayan Civilization
The Ming Dynasty
The Ottoman Empire
Post-War Boom
Prehistory
The Progressive Era
The Regency Era
The Roaring Twenties
The Roman Empire
The Silk Road
The Space Age
The Stone Age
The Victorian Era
The Viking Age
The Wild West
Balconies
Bath Mats
Batteries
Beds
Bed Sheets
Binders
Blankets
Blinds
Bookshelves
Brooms
Buckets
Candle Holders
Chairs
Clocks
Coasters
Coffee Tables
Bedroom Curtains
Desks
Dishwashers
Doors
Dressers
Dryers
Dustpans
Duvets
Elevators
Envelopes
Extension Cords
Fireplaces
Folders
Hangers
Highlighters
Clothing Irons
Ironing Boards
Keys
Lamps
Laundry Baskets
Lightbulbs
Microwaves
Mirrors
Mops
Napkins
Nightstands
Notebooks
Ovens
Paper
Paper Clips
Patios
Pencils
Pens
Picture Frames
Pillows
Placemats
Post-it Notes
Printers
Quilts
Refrigerators
Remote Controls
Routers
Rugs
Scissors
Shower Curtains
Smartphones
Soap Dishes
Speakers
Sponges
Stairs
Staplers
Stoves
Tablecloths
Dining Tables
Tablets
Tape Dispensers
Televisions
Toothbrushes
Towels
Trash Cans
Vacuum Cleaners
Vases
Wardrobes
Washing Machines
Windows
Backpacks
Beanies
Belts
Boots
Bow Ties
Bracelets
Button-down Shirts
Caps
Dresses
Earrings
Eyeglasses
Handbags
Hoodies
Jackets
Jeans
Jumpsuits
Leggings
Mittens
Necklaces
Overalls
Pajamas
Pocket Watches
Polo Shirts
Raincoats
Rings
Robes
Sandals
Scarves
Shorts
Skirts
Slippers
Sneakers
Socks
Suits
Sunglasses
Sweaters
Swimsuits
T-shirts
Ties
Tights
Tuxedos
Umbrellas
Vests
Wallets
Watches
Abacuses
Air Compressors
Axes
Binoculars
Caulking Guns
Chainsaws
Clamps
Compasses
Drills
Duct Tape
Fire Extinguishers
Flashlights
Garden Hoses
Hammers
Ladders
Lawn Mowers
Leaf Blowers
Levels
Magnifying Glasses
Mallets
Microscopes
Multimeters
Paint Rollers
Paintbrushes
Pliers
Pressure Washers
Pruning Shears
Putty Knives
Rakes
Safety Goggles
Sanders
Hack Saws
Screwdrivers
Shovels
Slide Rules
Smoke Detectors
Soldering Irons
Stud Finders
Tape Measures
Telescopes
Toolboxes
Trowels
Utility Knives
Vises
Watering Cans
Wheelbarrows
Work Gloves
Workbenches
Wrenches
Zip Ties
Accordions
Acoustic Guitars
Amplifiers
Bagpipes
Banjos
Bass Guitars
Bassoons
Bongo Drums
CD Players
Cassette Players
Cellos
Clarinets
Conga Drums
Cowbells
Cymbals
Didgeridoos
Djembes
Double Basses
Drum Kits
Electric Guitars
Electronic Keyboards
Flutes
French Horns
Glockenspiels
Harmonicas
Harps
MP3 Players
Mandolins
Maracas
Metronomes
Oboes
Pianos
Pipe Organs
Radios
Recorders
Record Players
Saxophones
Sitars
Synthesizers
Tambourines
Trombones
Trumpets
Tubas
Tuning Forks
Triangles
Ukuleles
Vibraphones
Violas
Violins
Xylophones
Affection
Amusement
Autonomy
Belonging
Calmness
Closure
Collaboration
Community
Compassion
Confidence
Contentment
Courage
Creativity
Curiosity
Delight
Empathy
Enthusiasm
Euphoria
Excitement
Fika
Forgiveness
Friendship
Frisson
Generosity
Gratitude
Growth
Honesty
Hope
Humility
Humor
Hygge
Ikigai
Inspiration
Integrity
Interest
Kindness
Legacy
Love
Loyalty
Mastery
Optimism
Patience
Perseverance
Playfulness
Pride
Purpose
Relief
Respect
Responsibility
Reverence
Satisfaction
Spontaneity
Teamwork
Triumph
Vindication
Wabi-sabi
Wisdom
Wonder
Asymmetry
Catharsis
Cool Breezes
Daydreaming
Déjà vu
Epiphanies
Flow State
Fluffy Clouds
Focus
Glistening Dewdrops
Imagination
Intuition
Lucid Dreaming
Meditation
Mental Clarity
Mindfulness
Nostalgia
Organization
Patterns
Rainbows
Relaxation
Serendipity
Solitude
Symmetry
Synchronicity
The Feeling of a Fluffy Blanket
The Feeling of a Smooth Stone
The Feeling of Cool Water
The Feeling of Soft Grass
The Feeling of Warm Sand
The Quiet of a Forest
The Scent of a Campfire
The Scent of Baking Bread
The Scent of Flowers
The Scent of Pine Needles
The Smell of Fresh-Cut Grass
The Smell of Rain
The Sound of a Babbling Brook
The Sound of a Crackling Fire
The Sound of Birdsong
The Sound of Crickets Chirping
The Sound of Ocean Waves
The Sound of Wind in the Trees
The Andromeda Galaxy
Asteroids
Auroras
The Big Dipper
Black Holes
Comets
Earth
Black Eye Galaxy
Jupiter
Lunar Eclipses
Mars
Mercury
Meteors
The Milky Way Galaxy
Nebulae
Neptune
Planetary Rings
Pluto
Pulsars
Quasars
Mimas
Solar Eclipses
Star Clusters
Supernovae
Uranus
Venus
Atolls
Basins
Buttes
Cliffs
Continental Shelves
Coral Reefs
Dunes
Fjords
Fossils
Geysers
Glaciers
Gorges
Hills
Hot Springs
Icebergs
Isthmuses
Lagoons
Mangrove Forests
Marshes
Mesas
Natural Arches
Oceanic Trenches
Peninsulas
Plains
Plateaus
Ravines
River Deltas
Savannas
Stalagmites
Stalactites
Steppes
Swamps
Taiga
Tectonic Plates
Tundra
Valleys
Agate
Amethyst
Aquamarine
Citrine
Copper
Diamonds
Emeralds
Garnets
Gold
Hematite
Iron
Jade
Jasper
Lapis Lazuli
Malachite
Moonstone
Obsidian
Opals
Peridot
Pyrite
Steel
Milky Quartz
Rose Quartz
Rubies
Sapphires
Silver
Sunstone
Tiger's Eye
Topaz
Tourmaline
Turquoise
Accountants
Actors
Architects
Astronauts
Authors
Bakers
Barbers
Bartenders
Blacksmiths
Butchers
Carpenters
Chefs
Chemists
Cleaners
Coaches
Dancers
Dentists
Detectives
Doctors
Drivers
Economists
Editors
Electricians
Farmers
Firefighters
Fishers
Florists
Geologists
Hairdressers
Historians
Janitors
Journalists
Judges
Lawyers
Librarians
Lifeguards
Linguists
Machinists
Mail Carriers
Mathematicians
Mechanics
Musicians
Nurses
Painters
Pilots
Plumbers
Paramedics
Tailors
Teachers
Software Developers
Actuaries
Paleontologists
Welders
Paralegals
Animators
the Arizona Diamondbacks
the Atlanta Braves
the Baltimore Orioles
the Boston Red Sox
the Chicago Cubs
the Chicago White Sox
the Cincinnati Reds
the Cleveland Guardians
the Colorado Rockies
the Detroit Tigers
the Houston Astros
the Kansas City Royals
the Los Angeles Angels
the Los Angeles Dodgers
the Miami Marlins
the Milwaukee Brewers
the Minnesota Twins
the New York Mets
the New York Yankees
the Oakland Athletics
the Philadelphia Phillies
the Pittsburgh Pirates
the San Diego Padres
the San Francisco Giants
the Seattle Mariners
the St. Louis Cardinals
the Tampa Bay Rays
the Texas Rangers
the Toronto Blue Jays
the Washington Nationals
the Arizona Cardinals
the Atlanta Falcons
the Carolina Panthers
the Chicago Bears
the Dallas Cowboys
the Green Bay Packers
the Detroit Lions
the Los Angeles Rams
the New York Jets
the Washington Commanders
the Cleveland Browns
the Kansas City Chiefs
"""


In [54]:
# Turn the newline-separated block of text into a list of items (blank lines are
# dropped, item text is kept exactly as written) and shuffle it with a fixed seed.
#
# The isinstance guard keeps this cell re-runnable: after the first execution
# `items_to_like` is already the shuffled list, and calling .split() on it again
# would raise AttributeError. `random` is imported in the notebook's import cell.
if isinstance(items_to_like, str):
    items_to_like = [item for item in items_to_like.split("\n") if item.strip()]
    random.seed(42)
    random.shuffle(items_to_like)

## Setup matrix


In [55]:
def generate_random_qrel_matrix(num_docs, num_queries, k_per_query):
    """
    Build a qrel matrix in which every query gets k randomly chosen relevant docs.

    Documents are drawn without replacement, per query, using the module-level
    `random` generator (so results depend on its current seed/state).

    Args:
        num_docs (int): The total number of documents (columns).
        num_queries (int): The total number of queries (rows).
        k_per_query (int): The number of relevant documents for each query.
            Values <= 0 yield an all-zero matrix.

    Returns:
        np.ndarray: A (num_queries x num_docs) binary matrix, or None (after
        printing an error) when k documents cannot be drawn from num_docs.
    """
    wants_relevant = k_per_query > 0
    if wants_relevant and num_queries > 0 and num_docs < k_per_query:
        print(f"Error: num_docs ({num_docs}) must be >= k_per_query ({k_per_query})")
        return None

    matrix = np.zeros((num_queries, num_docs), dtype=int)
    if not wants_relevant:
        # Nothing to mark; every query has zero relevant documents.
        return matrix

    doc_ids = range(num_docs)
    for row in matrix:
        # Each row is a view into `matrix`, so this writes through to it.
        row[random.sample(doc_ids, k_per_query)] = 1
    return matrix

def generate_cycle_qrel_matrix(num_docs, num_queries, k_per_query):
    """
    Build a qrel matrix whose relevant documents form one sliding, wrapping window.

    Query i is relevant to the k consecutive documents starting at index i;
    indices that run past the last document wrap around to the first.

    Example with num_docs=3, num_queries=3, k=2:
    Q1 -> D1, D2  [1, 1, 0]
    Q2 -> D2, D3  [0, 1, 1]
    Q3 -> D3, D1  [1, 0, 1]

    Args:
        num_docs (int): The total number of documents.
        num_queries (int): The total number of queries.
        k_per_query (int): The number of relevant documents for each query.

    Returns:
        np.ndarray: A (num_queries x num_docs) binary matrix.
    """
    if num_queries > num_docs:
        # Only a warning: the matrix is still produced, but windows repeat.
        print(f"Warning: For the cycle pattern, it's recommended that num_queries ({num_queries}) <= num_docs ({num_docs}).")

    matrix = np.zeros((num_queries, num_docs), dtype=int)
    for query_idx, row in enumerate(matrix):
        for offset in range(k_per_query):
            # Modulo wraps the window back to document 0.
            row[(query_idx + offset) % num_docs] = 1
    return matrix

def generate_disjoint_qrel_matrix(num_docs, num_queries, k_per_query):
    """
    Build a qrel matrix in which no two queries share a relevant document.

    Query i owns the contiguous block of k documents starting at index i * k.

    Example with k=2:
    Q1 -> D1, D2    [1, 1, 0, 0, 0, 0]
    Q2 -> D3, D4    [0, 0, 1, 1, 0, 0]
    Q3 -> D5, D6    [0, 0, 0, 0, 1, 1]

    Args:
        num_docs (int): The total number of documents.
        num_queries (int): The total number of queries.
        k_per_query (int): The number of relevant documents for each query.

    Returns:
        np.ndarray: A (num_queries x num_docs) binary matrix, or None (after
        printing an error) when num_queries * k_per_query exceeds num_docs.
    """
    required_docs = k_per_query * num_queries
    if required_docs > num_docs:
        print(f"Error: Not enough documents ({num_docs}) for disjoint sets. "
              f"Need at least {required_docs} for {num_queries} queries with k={k_per_query}.")
        return None

    matrix = np.zeros((num_queries, num_docs), dtype=int)
    for query_idx, row in enumerate(matrix):
        block_start = query_idx * k_per_query
        # `row` is a view, so the slice assignment updates `matrix` in place.
        row[block_start:block_start + k_per_query] = 1
    return matrix


def generate_dense_qrel_matrix(num_docs, num_queries, k_per_query):
    """
    Generates a qrel matrix from a dense subset of documents.

    It finds the smallest number of documents 'n' such that nCk >= num_queries.
    Query i is then made relevant to the i-th k-combination (in
    itertools.combinations order) of the first 'n' documents. All other
    documents (from n to num_docs) are not part of any relevance judgments.

    Args:
        num_docs (int): The total number of documents.
        num_queries (int): The number of queries to generate.
        k_per_query (int): The number of relevant items per query (the 'k' in nCk).
            Values <= 0 yield an all-zero matrix.

    Returns:
        np.ndarray: A (num_queries x num_docs) binary matrix, or None (after
        printing an error) when num_docs is too small to supply num_queries
        distinct combinations.
    """
    if k_per_query <= 0:
        return np.zeros((num_queries, num_docs), dtype=int)

    # 1. Find the smallest n such that nCk >= num_queries.
    #    n starts at k >= 1, so math.comb never sees a negative argument and
    #    cannot raise ValueError here (for k > n it would return 0, not raise).
    n = k_per_query
    while math.comb(n, k_per_query) < num_queries:
        n += 1
        if n > num_docs * 2: # Heuristic break so the search always terminates
            print("Error: Could not find a suitable 'n' for the dense matrix.")
            return None

    if n > num_docs:
        print(f"Error: Not enough documents ({num_docs}) to generate dense set. "
              f"Need at least {n} documents to create {num_queries} queries with k={k_per_query}.")
        return None

    print(f"Using the first {n} documents to generate dense combinations.")

    qrels_matrix = np.zeros((num_queries, num_docs), dtype=int)

    # 2. Lazily take only the first num_queries combinations of k indices from
    #    [0, n-1]; nCk >= num_queries is guaranteed by step 1, so there are
    #    always enough, and the full list of combinations is never built.
    first_combinations = itertools.islice(
        itertools.combinations(range(n), k_per_query), num_queries
    )

    # 3. Mark each query's combination; each row is a view into qrels_matrix.
    for row, combination in zip(qrels_matrix, first_combinations):
        row[list(combination)] = 1

    return qrels_matrix


def calculate_min_num_queries(num_queries, num_docs, k_per_query):
    """
    Find the smallest n >= k_per_query for which nCk covers num_queries.

    Args:
        num_queries (int): The number of queries that must be covered.
        num_docs (int): The total number of documents; the search gives up
            once n exceeds twice this value.
        k_per_query (int): The 'k' in nCk.

    Returns:
        tuple[int, int] | None: (nCk, n) for the first n with nCk >= num_queries,
        or None (after printing an error) if the search limit is exceeded.
    """
    candidate_n = k_per_query
    while True:
        try:
            combination_count = math.comb(candidate_n, k_per_query)
        except ValueError:
            # math.comb rejects negative arguments; just move on to the next n.
            pass
        else:
            if combination_count >= num_queries:
                return combination_count, candidate_n
        candidate_n += 1
        if candidate_n > num_docs * 2: # Heuristic cut-off so the loop always ends
            print(f"Error: Could not find a suitable 'n' for the dense matrix.")
            return None

In [56]:
def generate_ir_dataset(initial_items, qrels_matrix, min_items=10):
    """
    Procedurally generates an IR dataset including a corpus, queries, and a qrels matrix.

    Every query is given one distinct item ("Who likes <item>?") and each person
    marked relevant for that query receives the item. Each person is then topped
    up with items that no query uses until they like at least `min_items` items.
    Finally each person gets a unique random full name, which is used as the
    document id.

    Uses the module-level `unique_names` / `unique_surnames` lists and the global
    `random` state. The output depends only on that state: item and name order
    never rely on set iteration order, which varies between interpreter runs.

    Args:
        initial_items (list): A list of initial items to be placed in the qrels.
                              Duplicate entries are ignored.
        qrels_matrix (np.ndarray): A binary matrix (M x N) where M is the number of queries
                                   and N is the number of documents (people). A '1' indicates
                                   a relevance judgment.
        min_items (int): The minimum number of items each person ends up with
                         (items received through queries count towards it).

    Returns:
        dict: `{"corpus": {name: sentence}, "queries": {query_id: question},
        "qrels": {query_id: {name: 1}}}`.

    Raises:
        ValueError: If there are fewer unique items than queries, if too few unused
                    items remain to reach `min_items` for some person, if a person
                    ends up with no items at all, or if not enough unique names can
                    be generated.
    """
    num_queries, num_people = qrels_matrix.shape
    people_items = {f"person_{i}": set() for i in range(num_people)}

    # --- 1. Prepare Items & Queries ---
    # De-duplicate (order-preserving) so that two queries can never share an item.
    unique_items = list(dict.fromkeys(initial_items))
    available_items_for_queries = list(unique_items)
    random.shuffle(available_items_for_queries)
    if len(available_items_for_queries) < num_queries:
        raise ValueError(f"Not enough unique items ({len(available_items_for_queries)}) for {num_queries} queries.")
    queries = {f"query_{i}": [available_items_for_queries.pop()] for i in range(num_queries)}
    used_initial_items_for_queries = {item[0] for item in queries.values()}
    conflict_pairs = set()  # No conflicts for s=1 (single-item queries)

    # --- 2. Populate Qrels and Assign Query Items ---
    # Assign the query items to all people who are marked as relevant.
    print("Assigning query items to relevant people...")
    for i in tqdm.tqdm(range(num_queries)):
        items_for_query = queries[f"query_{i}"]
        for person_idx in np.where(qrels_matrix[i] == 1)[0]:
            people_items[f"person_{person_idx}"].update(items_for_query)

    # --- 3. Assign Additional Items (Conflict-Aware) ---
    # Top every person up to `min_items` using only items that no query refers to,
    # so the filler can never make a person relevant to an extra query.
    unused_initial_items = [item for item in unique_items if item not in used_initial_items_for_queries]
    if unused_initial_items:
        print(f"Assigning additional {min_items} items to each person (conflict-aware)...")
        for person_idx in tqdm.tqdm(range(num_people)):
            person_id = f"person_{person_idx}"

            # Find which queries this person is relevant to, to identify their "allowed" conflicts.
            relevant_query_indices = np.where(qrels_matrix[:, person_idx] == 1)[0]
            allowed_conflicts = {frozenset(queries[f"query_{q_idx}"]) for q_idx in relevant_query_indices}

            # Sampling below is with replacement; without this check it would spin
            # forever once the pool of distinct unused items is exhausted.
            items_needed = min_items - len(people_items[person_id])
            if items_needed > len(unused_initial_items):
                raise ValueError(
                    f"Not enough unused items ({len(unused_initial_items)}) to give {person_id} "
                    f"{items_needed} additional items (min_items={min_items})."
                )

            items_to_add = set()
            while len(items_to_add) < items_needed:
                new_item = random.choice(unused_initial_items)

                # A person's current items are what they got from qrels + what we've added so far.
                current_and_pending_items = people_items[person_id].union(items_to_add)

                # Only pairs involving the new item can introduce a new conflict.
                is_safe_to_add = True
                for existing_item in current_and_pending_items:
                    potential_conflict = frozenset({existing_item, new_item})
                    if potential_conflict in conflict_pairs and potential_conflict not in allowed_conflicts:
                        is_safe_to_add = False
                        break

                if is_safe_to_add:
                    items_to_add.add(new_item)

            people_items[person_id].update(items_to_add)
    else:
        print("Warning: No unused initial items available for profile assignment.")

    # Convert sets to lists. Sorting first and then shuffling with the seeded RNG
    # keeps the mixed-up order while removing the dependence on string hashing.
    people_items_list = {}
    for person_id, items in people_items.items():
        likes = sorted(items)
        random.shuffle(likes)
        people_items_list[person_id] = likes

    # --- 4. Finalize and Return ---
    qrels = {f"query_{i}": [f"person_{idx}" for idx in np.where(row == 1)[0]] for i, row in enumerate(qrels_matrix)}

    finished_queries = {}
    for query_id, items in queries.items():
        item_str = ' and '.join(items) if len(items) > 1 else items[0]
        finished_queries[query_id] = f"Who likes {item_str}?"

    # Draw random full names until there is one unique name per person. The attempt
    # budget keeps the loop finite when the name lists are too small.
    num_users = len(people_items_list)
    max_name_attempts = max(num_users * 20, 1000)
    generated_names = []
    seen_names = set()
    name_attempts = 0
    while len(generated_names) < num_users:
        if name_attempts >= max_name_attempts:
            raise ValueError(
                f"Could not generate {num_users} unique user names "
                f"(only {len(generated_names)} found after {name_attempts} attempts)."
            )
        name_attempts += 1
        candidate = f"{random.choice(unique_names).capitalize()} {random.choice(unique_surnames).lower().capitalize()}"
        if candidate not in seen_names:
            seen_names.add(candidate)
            generated_names.append(candidate)
    print(f"Generated {len(generated_names)} unique user names for all documents.")

    # Build the corpus: one "<name> likes a, b and c." sentence per person.
    person_map = {}
    finished_corpus = {}
    for (person_id, likes), user_name in zip(people_items_list.items(), generated_names):
        person_map[person_id] = user_name
        if not likes:
            raise ValueError(f"{person_id} has no items; cannot build a document for them.")
        if len(likes) == 1:
            liked_text = likes[0]
        else:
            # Join all but the last with commas and put 'and' before the last item.
            liked_text = f"{', '.join(likes[:-1])} and {likes[-1]}"
        finished_corpus[user_name] = f"{user_name} likes {liked_text}."

    # now replace the qrels with the user names
    finished_qrels = {
        query_id: {person_map[person]: 1 for person in relevant_people}
        for query_id, relevant_people in qrels.items()
    }

    return {
        "corpus": finished_corpus,
        "queries": finished_queries,
        "qrels": finished_qrels,
    }

## Generate the dataset

In [57]:
# Build the small dataset (46 people, 1000 two-person queries).
# For a larger corpus, raise num_docs (e.g. to 50k).
qrels_small = generate_dense_qrel_matrix(
    num_docs=46,
    num_queries=1000,
    k_per_query=2,
)
ir_dataset_small = generate_ir_dataset(
    initial_items=items_to_like,
    qrels_matrix=qrels_small,
    min_items=45,
)

Using the first 46 documents to generate dense combinations.
Assigning query items to relevant people...


100%|██████████| 1000/1000 [00:00<00:00, 180508.87it/s]


Assigning additional 45 items to each person (conflict-aware)...


100%|██████████| 46/46 [00:00<00:00, 8095.75it/s]

Generated 46 unique user names for all documents.





## Inspect the data

In [58]:
# Show a handful of generated examples as a sanity check.
print("Examples from the generated dataset:")
print("-" * 30)

# For each sampled query: the question, its relevant people, and their documents.
print("Sample Queries, Relevant People, and Associated Items:")

for i, (query_id, items) in enumerate(ir_dataset_small['queries'].items()):
    if i >= 3:  # Only the first 3 queries are shown
        break

    print(f"Query: {query_id} - Question: {items}")
    # The qrels entry for a query maps each relevant person's name to 1
    relevant_people_names = list(ir_dataset_small['qrels'][query_id].keys())
    print(f"  Relevant People: {relevant_people_names}")

    if not relevant_people_names:
        print("  No relevant people found for this query.")
    else:
        for relevant_person_name in relevant_people_names:
            # The corpus is keyed by person name; look up that person's document text
            person_items = ir_dataset_small["corpus"].get(relevant_person_name)
            print(f"    {relevant_person_name}: {person_items}")

    print("-" * 10)

Examples from the generated dataset:
------------------------------
Sample Queries, Relevant People, and Associated Items:
Query: query_0 - Question: Who likes Joshua Trees?
  Relevant People: ['Msughter Perez', 'Antonio Francis']
    Msughter Perez: Msughter Perez likes Licorice, Soy Sauce, Alligators, White Pepper, Halibut, Glaciers, Symphonies, Fables, Mosaic, Asymmetry, Horror Literature, Quokkas, Alternative Rock, Cartography, Slide Rules, Hydrangeas, Candle Making, Crocheting, Scuba Diving, Sapphires, Joshua Trees, The Aztec Empire, Patterns, Elm Trees, Traveling, Pansies, Disco Music, Barley, Cheerios, Pangolins, River Otters, Giant Isopods, Caracals, Snow Leopards, Chairs, Bassoons, Tapirs, Spinach, Creativity, Cards Against Humanity, Oceanic Trenches, Shasta Daisies, Parakeets, Praying Mantises and Limes.
    Antonio Francis: Antonio Francis likes Mussels, Karate, Puzzle Video Games, Paramedics, Apple Cider, Stand-up Comedy, Poblano Peppers, Dragon Fruits, Public Health, Rice 