# Synthetic Data Production

The first code cell produces several separate CSV files (products, users, purchases, reviews, bids, wishlist and browsing) that you can merge yourself if you want. Alternatively, skip to the second code cell, which generates a single, fully connected dataset as one CSV.

In [2]:
import pandas as pd
import random
from faker import Faker

# Seed every random source so that Restart & Run All reproduces the same data.
# Change SEED to draw a different (but still repeatable) synthetic sample.
SEED = 42
random.seed(SEED)   # drives random.choice / uniform / randint / random below
Faker.seed(SEED)    # drives every value produced through `fake`

fake = Faker()

# Define sample brands per category; the category list is derived from the
# dict keys (insertion-ordered) so the two can never drift apart.
brands = {
    "Electronics": ["Apple", "Samsung", "Sony", "Dell", "HP"],
    "Clothing": ["Nike", "Adidas", "Levi's", "Gucci", "Puma"],
    "Home": ["IKEA", "Philips", "Hamilton Beach", "Dyson", "Bosch"],
    "Books": ["Penguin", "HarperCollins", "Simon & Schuster", "Oxford", "Pearson"],
    "Toys": ["LEGO", "Hasbro", "Mattel", "Funko", "Nintendo"]
}
categories = list(brands)

# Build the product catalogue table
def generate_product_data(num_entries=100000):
    """Create a DataFrame of synthetic products.

    Product IDs are sequential (``P000001`` upward). For each product a
    category is drawn from ``categories``, a brand from ``brands[category]``,
    and the price, rating and stock level are drawn at random; the title,
    description and keywords come from ``fake``.

    Args:
        num_entries: Number of product rows to generate.

    Returns:
        DataFrame with columns Product_ID, Title, Category, Brand, Price,
        Ratings, Stock, Description, Keywords.
    """
    column_names = [
        "Product_ID", "Title", "Category", "Brand", "Price", "Ratings", "Stock", "Description", "Keywords"
    ]

    def _product_rows():
        # The draws below are kept in a fixed order so a seeded run stays stable.
        for index in range(1, num_entries + 1):
            chosen_category = random.choice(categories)
            leading_word = fake.word().capitalize()
            chosen_brand = random.choice(brands[chosen_category])
            yield (
                f"P{index:06d}",
                f"{leading_word} {chosen_category}",
                chosen_category,
                chosen_brand,
                round(random.uniform(5, 2000), 2),   # price
                round(random.uniform(1, 5), 1),      # average rating
                random.randint(0, 100),              # units in stock
                fake.sentence(nb_words=10),
                ", ".join(fake.words(nb=5)),
            )

    return pd.DataFrame(list(_product_rows()), columns=column_names)

# Generate User Data, Purchase History, and Additional Interactions
def generate_user_data(num_users=50000, num_purchases=200000, num_products=100000):
    """Build the user table and the purchase-driven interaction tables.

    Every purchase picks a random user and a random product. The same
    (user, product) pair is then independently added to the other tables:
    review with probability 0.5, bid with 0.3, wishlist with 0.4 and
    browsing (product view) with 0.6.

    Args:
        num_users: Number of users to create; IDs start at ``U00001``.
        num_purchases: Number of purchase rows to generate.
        num_products: Size of the product ID space (IDs ``P000001`` up to
            ``num_products``). Keep it equal to the ``num_entries`` passed to
            ``generate_product_data`` so every referenced product exists in
            the product table.

    Returns:
        Tuple ``(user_df, purchase_df, review_df, bid_df, wishlist_df,
        browsing_df)`` of DataFrames.

    Raises:
        ValueError: If purchases are requested while ``num_users`` or
            ``num_products`` is smaller than 1.
    """
    if num_purchases > 0 and (num_users < 1 or num_products < 1):
        raise ValueError(
            "num_users and num_products must both be >= 1 when num_purchases > 0"
        )

    # Probability that a purchase also produces each kind of interaction.
    review_prob = 0.5
    bid_prob = 0.3
    wishlist_prob = 0.4
    view_prob = 0.6

    users = [
        [f"U{i:05d}", fake.name(), fake.email(), fake.city()]
        for i in range(1, num_users + 1)
    ]

    purchases = []
    reviews = []
    bids = []
    browsing = []
    wishlist = []

    for _ in range(num_purchases):
        user_id = f"U{random.randint(1, num_users):05d}"
        product_id = f"P{random.randint(1, num_products):06d}"
        purchase_date = fake.date_this_year()
        purchases.append([user_id, product_id, purchase_date])

        # random.random() is uniform on [0, 1), so `< p` is true with
        # probability p. The earlier `> p` form fired with probability 1 - p,
        # which contradicted the documented 30% / 40% / 60% rates.
        if random.random() < review_prob:  # 50% chance of leaving a review
            review_text = fake.sentence(nb_words=15)
            rating = round(random.uniform(1, 5), 1)
            reviews.append([user_id, product_id, rating, review_text])

        if random.random() < bid_prob:  # 30% chance of bidding
            bid_amount = round(random.uniform(10, 2000), 2)
            bid_date = fake.date_this_year()
            bids.append([user_id, product_id, bid_amount, bid_date])

        if random.random() < wishlist_prob:  # 40% chance of adding to wishlist
            wishlist.append([user_id, product_id])

        if random.random() < view_prob:  # 60% chance of viewing product
            browsing.append([user_id, product_id, fake.date_this_year()])

    user_df = pd.DataFrame(users, columns=["User_ID", "Name", "Email", "Location"])
    purchase_df = pd.DataFrame(purchases, columns=["User_ID", "Product_ID", "Purchase_Date"])
    review_df = pd.DataFrame(reviews, columns=["User_ID", "Product_ID", "Rating", "Review"])
    bid_df = pd.DataFrame(bids, columns=["User_ID", "Product_ID", "Bid_Amount", "Bid_Date"])
    wishlist_df = pd.DataFrame(wishlist, columns=["User_ID", "Product_ID"])
    browsing_df = pd.DataFrame(browsing, columns=["User_ID", "Product_ID", "View_Date"])

    return user_df, purchase_df, review_df, bid_df, wishlist_df, browsing_df

# Build the product catalogue and all user-side tables, then write each one to its own CSV
product_df = generate_product_data(100000)
user_df, purchase_df, review_df, bid_df, wishlist_df, browsing_df = generate_user_data(50000, 200000)

# (frame, target file) pairs, written in this order without the row index
csv_outputs = [
    (product_df, "synthetic_product_data_large.csv"),
    (user_df, "synthetic_user_data_large.csv"),
    (purchase_df, "synthetic_purchase_data_large.csv"),
    (review_df, "synthetic_reviews_large.csv"),
    (bid_df, "synthetic_bids_large.csv"),
    (wishlist_df, "synthetic_wishlist_large.csv"),
    (browsing_df, "synthetic_browsing_large.csv"),
]
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name, index=False)

print("Fully connected datasets generated and saved!")

Fully connected datasets generated and saved!


In [3]:
import pandas as pd
import random
from faker import Faker

# Seed every random source so that running this cell on a fresh kernel always
# yields the same dataset. Change SEED to draw a different repeatable sample.
SEED = 42
random.seed(SEED)   # drives random.choice / uniform / randint / random below
Faker.seed(SEED)    # drives every value produced through `fake`

fake = Faker()

# Define sample brands per category; the category list is derived from the
# dict keys (insertion-ordered) so the two can never drift apart.
brands = {
    "Electronics": ["Apple", "Samsung", "Sony", "Dell", "HP"],
    "Clothing": ["Nike", "Adidas", "Levi's", "Gucci", "Puma"],
    "Home": ["IKEA", "Philips", "Hamilton Beach", "Dyson", "Bosch"],
    "Books": ["Penguin", "HarperCollins", "Simon & Schuster", "Oxford", "Pearson"],
    "Toys": ["LEGO", "Hasbro", "Mattel", "Funko", "Nintendo"]
}
categories = list(brands)

# Generate synthetic dataset with all features in one CSV
def generate_full_dataset(num_users=50000, num_products=100000, num_interactions=200000):
    """Generate one denormalised table of user/product interactions.

    Each row pairs a random user with a random product. Product attributes
    (title, category, brand, price, ratings, stock, description, keywords)
    are drawn once per product ID and reused on every later row for that
    product, so a given ``Product_ID`` always carries the same metadata.

    The interaction columns are filled independently per row and are ``None``
    when the event did not happen:

    * ``Purchase_Date``: set when ``random.random() > 0.5``.
    * ``Review`` / ``Review_Rating``: set together when ``random.random() > 0.5``.
    * ``Bid_Amount`` / ``Bid_Date``: set together when ``random.random() > 0.3``.
    * ``View_Date``: set when ``random.random() > 0.6``.
    * ``Wishlist_Added``: a random ``True``/``False`` when
      ``random.random() > 0.4``.

    Args:
        num_users: Size of the user ID space (IDs start at ``U00001``).
        num_products: Size of the product ID space (IDs start at ``P000001``).
        num_interactions: Number of rows to generate.

    Returns:
        DataFrame with one row per interaction and 17 columns
        (User_ID, Product_ID, the eight product attributes, and the seven
        interaction columns).

    Raises:
        ValueError: If rows are requested while ``num_users`` or
            ``num_products`` is smaller than 1.
    """
    if num_interactions > 0 and (num_users < 1 or num_products < 1):
        raise ValueError(
            "num_users and num_products must both be >= 1 when num_interactions > 0"
        )

    columns = ["User_ID", "Product_ID", "Title", "Category", "Brand", "Price", "Ratings", "Stock", "Description", "Keywords",
               "Purchase_Date", "Review", "Review_Rating", "Bid_Amount", "Bid_Date", "View_Date", "Wishlist_Added"]

    # Product_ID -> attribute list. Without this cache the same product would
    # get a fresh random title, category, brand and price on every row.
    product_catalog = {}
    rows = []

    for _ in range(num_interactions):
        user_id = f"U{random.randint(1, num_users):05d}"
        product_id = f"P{random.randint(1, num_products):06d}"

        product_fields = product_catalog.get(product_id)
        if product_fields is None:
            category = random.choice(categories)
            brand = random.choice(brands[category])
            title = f"{fake.word().capitalize()} {category}"
            price = round(random.uniform(5, 2000), 2)
            ratings = round(random.uniform(1, 5), 1)
            stock = random.randint(0, 100)
            description = fake.sentence(nb_words=10)
            keywords = ", ".join(fake.words(nb=5))
            product_fields = [title, category, brand, price, ratings, stock, description, keywords]
            product_catalog[product_id] = product_fields

        # NOTE(review): the first cell's comments document 30% / 40% / 60% for
        # bid / wishlist / view, while `> p` fires with probability 1 - p.
        # Thresholds are left as they were here; confirm the intended rates.
        purchase_date = fake.date_this_year() if random.random() > 0.5 else None
        review_text = fake.sentence(nb_words=15) if random.random() > 0.5 else None
        rating = round(random.uniform(1, 5), 1) if review_text is not None else None
        bid_amount = round(random.uniform(10, 2000), 2) if random.random() > 0.3 else None
        bid_date = fake.date_this_year() if bid_amount is not None else None
        view_date = fake.date_this_year() if random.random() > 0.6 else None
        wishlist_added = random.choice([True, False]) if random.random() > 0.4 else None

        rows.append([user_id, product_id, *product_fields,
                     purchase_date, review_text, rating, bid_amount, bid_date, view_date, wishlist_added])

    return pd.DataFrame(rows, columns=columns)

# Build the combined interaction table using the default sizes
dataset = generate_full_dataset()

# Write it out as a single CSV, omitting the row index
output_csv = "synthetic_ebay_like_dataset.csv"
dataset.to_csv(output_csv, index=False)

print("Fully connected eBay-like dataset generated and saved!")

Fully connected eBay-like dataset generated and saved!
