### Goal
Generate user data that represents 
- age, gender, location, and income
- seasonal demand

### Models
User
- id
- name
- gender
- age
- location

Transaction
- id
- user id
- product id
- created at
- price
- quantity

In [37]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [38]:
products_df = pd.read_csv('product.csv')
belongs_df = pd.read_csv('belongs.csv')

In [39]:
joined_df = pd.merge(left=products_df, right=belongs_df, left_on=['id'], right_on=['product_id'])
joined_df = joined_df.drop(columns=['Unnamed: 0', 'product_id'])
joined_df = joined_df.groupby(['id', 'title', 'current_price', 'is_active', 'total_quantity']).agg(lambda x: x.tolist())
joined_df.reset_index(inplace=True)
joined_df

Unnamed: 0,id,title,current_price,is_active,total_quantity,collection
0,0,Sesame Paste Ramen 5 Pack,12.9,True,300,"[public-goods-food, food, public-goods, dry-so..."
1,1,Organic Linden Flower Raw Honey,9.9,True,200,"[public-goods-food, food, kitchen-dining, publ..."
2,2,Seaweed Snacks,9.9,True,200,"[public-goods-food, food, public-goods, home-e..."
3,3,Veggie Chips,6.9,True,200,"[public-goods-food, food, public-goods]"
4,4,Organic Wildflower Raw Honey,18.9,True,300,"[public-goods-food, home-essentials-new-arrivals]"
...,...,...,...,...,...,...
1554,1556,Cord Hair Band 2 Pieces Set,4.9,True,288,"[hair-care, makeup]"
1555,1557,"Pine Shelf Unit - Cross Bar - Large (33.9"")",15.0,True,288,[pine-shelving-unit]
1556,1558,SUS Shelving Unit - Walnut - Regular - Medium,350.0,True,288,[home-essentials-new-arrivals]
1557,1559,Heatproof Glass Pot - 25.3 oz,29.9,True,288,[glassware]


In [40]:
num_users = 10000

def generate_user_profile(user_id):
    age = random.randint(18, 65)
    gender = random.choices(['Male', 'Female'], weights=(40, 60))
    income = random.choice(['Low', 'Medium', 'High'])
    location = random.choices(['New York', 'Boston', 'Portland'], weights=(50, 30, 20))
    
    # Simulate user preferences based on demographics
    preferences = {
        'stationery': random.random(),
        'travel': random.random(),
        'home': random.random(),
        'skin-care': random.random(),
        'fashion': random.random()
    }
    
    return {
        'user_id': user_id,
        'age': age,
        'gender': gender,
        'income': income,
        'location': location,
        'preferences': preferences
    }

# Create a list of user profiles
users = [generate_user_profile(i) for i in range(1, num_users + 1)]

# Convert to DataFrame
users_df = pd.DataFrame(users)
users_df


Unnamed: 0,user_id,age,gender,income,location,preferences
0,1,34,[Female],High,[New York],"{'stationery': 0.8668527815273029, 'travel': 0..."
1,2,28,[Female],High,[Boston],"{'stationery': 0.5944095047528185, 'travel': 0..."
2,3,46,[Male],Low,[New York],"{'stationery': 0.5352067173374587, 'travel': 0..."
3,4,29,[Male],Medium,[New York],"{'stationery': 0.11586929827594161, 'travel': ..."
4,5,46,[Male],High,[New York],"{'stationery': 0.6589154011178008, 'travel': 0..."
...,...,...,...,...,...,...
9995,9996,43,[Female],Medium,[Portland],"{'stationery': 0.5951432075328748, 'travel': 0..."
9996,9997,51,[Male],High,[New York],"{'stationery': 0.02142837337182557, 'travel': ..."
9997,9998,57,[Male],Medium,[Portland],"{'stationery': 0.6241622697325456, 'travel': 0..."
9998,9999,34,[Male],High,[New York],"{'stationery': 0.09535373196317454, 'travel': ..."


In [41]:
STATIONERY = {
    "desk-accessories", "notebook", "binders-looseleaf-paper", "sticky-notes", 
    "office-supplies", "stationery", "calendar-planners", "pens", 
    "paper-goods", "card-memo", "markers-highlighters", "pen-pencils", 
    "pencils-erasers"
}

HOME = {
    "sofas-bean-bags", "bedding", "cleaning-tools", "towels-and-bathmats", 
    "housekeeping-goods", "pine-shelving-unit", "home-accessories", 
    "closet-organizer", "bed-pads-fitted-sheets", "home-clearance", 
    "laundry", "everyday-tableware", "kitchen-utensils", "home-essentials-new-arrivals", 
    "home", "bathroom", "shelves", "pillows-pillowcases", "hooks-hangers", 
    "sus-steel-shelf-set", "tableware", "reed-diffusers", "aroma-fragrances", 
    "duvets-duvet-covers", "acrylic-storages", "sus-shelving-unit-light-gray", 
    "bamboo-furniture-storage-shelves-racks", "kitchen-organizers", "chairs", 
    "bathroom-organizer", "acacia-dishware", "storage-organizers", 
    "stacking-shelves", "lounge-chairs", "sus-shelving-unit-oak", 
    "albums-photo-frame", "cushions-cushion-covers", "dining-utensils", 
    "stools-benches", "cleaning", "sofas", "kitchen-dining", 
    "kitchen-appliances", "office-storage", "dining-tables", "dining-chairs", 
    "glassware"
}

TRAVEL = {
    "new-hard-shell-check-in-suitcases", "passport-safety-cases", 
    "travel-accessories", "travel-essentials", "hard-shell-carry-on-suitcases-36l", 
    "63l-suitcase", "travel-containers", "travel", "packing-organizers", 
    "luggage", "88l-suitcase", "toiletry-cases"
}

SKIN_CARE = {
    "makeup-accessories", "muji-skincare", "moisturising", "personal-care", 
    "health-beauty", "public-goods-personal-care-home-products", 
    "hair-care", "essential-oils", "toning-water", "cleansing-oil-soap"
}

FASHION = {
    "garment-organization-cases", "mens-denim", "t-shirts-shorts-sale", 
    "womens-innerwear", "womens-loungewear", "mens-t-shirts", "womens-shorts-skirts", 
    "womens-tops-shirts", "womens-clearance", "accessories", "mens-tops", 
    "women", "mens-clearance", "shoes", "sandals", "mens-innerwear", 
    "dresses", "unisex", "womens-outerwear", "pajamas-loungewear", 
    "arts-fashion", "mens-bottom", "umbrellas-raincoats", "womens-tops", 
    "mens-outerwear", "shirt-polos-sale", "apparel-sale", "activewear", 
    "socks", "workwear", "hemp-linen-clothing", "bags", "spring-scarves", 
    "mens", "hats", "mens-shirts-polos", "slippers", "mens-shorts", 
    "womens-t-shirts", "mens-loungewear", "womens-denim", "womens-bottoms", 
    "health"
}


In [42]:
# Mapping of product attributes to user preferences
def calculate_purchase_likelihood(user_preferences, product_collection):
    product_collection = set(product_collection)
    likelihood = 0
    if product_collection.intersection(STATIONERY):
        likelihood += user_preferences['stationery']
    if product_collection.intersection(TRAVEL):
        likelihood += user_preferences['travel']
    if product_collection.intersection(HOME):
        likelihood += user_preferences['home']
    if product_collection.intersection(SKIN_CARE):
        likelihood += user_preferences['skin-care']
    if product_collection.intersection(FASHION):
        likelihood += user_preferences['fashion']
    return likelihood

# Calculating purchase likelihood for a single user and product
user_preferences = users_df.iloc[0]['preferences']
product_collection = joined_df.iloc[20]['collection']
purchase_likelihood = calculate_purchase_likelihood(user_preferences, product_collection)
print(user_preferences)
print(product_collection)
print(purchase_likelihood)

{'stationery': 0.8668527815273029, 'travel': 0.17312898754451655, 'home': 0.1531862477719963, 'skin-care': 0.7495482620919918, 'fashion': 0.19480663204139237}
['bedding', 'back-to-school-dorm-room-essentials', 'duvets-duvet-covers', 'limited-time-offers', 'home']
0.1531862477719963


In [43]:
# Time range for transaction simulation
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

# Helper function to generate random dates
def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

# Simulate transactions for each user
error_product_id = []
transactions = []
for index, user in users_df.iterrows():
    num_transactions = random.randint(1, 20)
    for _ in range(num_transactions):
        # Randomly choose a product
        product = products_df.sample(1).iloc[0]
        
        # Calculate likelihood of purchase
        try:
            product_collection = joined_df[joined_df['id'] == product.id]['collection']
            product_collection = product_collection.to_list()[0]
        except:
            error_product_id.append(product.id)
        purchase_likelihood = calculate_purchase_likelihood(user['preferences'], product_collection)
        
        # Simulate purchase based on likelihood
        if random.random() < purchase_likelihood:  # A higher likelihood increases purchase probability
            date = random_date(start_date, end_date)
            quantity = random.randint(1, 5)
            total_price = product['current_price'] * quantity
            
            # Create a transaction record
            transactions.append({
                'user_id': user['user_id'],
                'product_id': product['id'],
                'date': date,
                'quantity': quantity,
                'total_price': round(total_price, 2)
            })

# Convert transactions to DataFrame
transactions_df = pd.DataFrame(transactions)
print('Product that failed:', set(error_product_id))
transactions_df


Product that failed: {np.int64(226), np.int64(106)}


Unnamed: 0,user_id,product_id,date,quantity,total_price
0,1,823,2023-04-27,4,3.60
1,1,316,2023-01-01,5,54.50
2,1,31,2023-11-24,3,11.70
3,2,837,2023-12-23,1,1.90
4,2,1207,2023-02-24,1,39.99
...,...,...,...,...,...
55969,10000,588,2023-07-15,1,14.90
55970,10000,1148,2023-08-14,2,7.80
55971,10000,988,2023-05-07,1,79.99
55972,10000,628,2023-11-04,2,39.80


In [44]:
transactions_df.to_csv('transaction.csv', index_label="id")