### Goal
Generate user data that represents 
- age, gender, location, and income
- seasonal demand

### Models
User
- id
- name
- gender
- age
- location

Transaction
- id
- user id
- product id
- created at
- price
- quantity

In [46]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [47]:
products_df = pd.read_csv('product.csv')
belongs_df = pd.read_csv('belongs.csv')

In [48]:
joined_df = pd.merge(left=products_df, right=belongs_df, left_on=['id'], right_on=['product_id'])
joined_df = joined_df.drop(columns=['Unnamed: 0', 'product_id'])
joined_df = joined_df.groupby(['id', 'title', 'current_price', 'is_active', 'total_quantity']).agg(lambda x: x.tolist())
joined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,collection
id,title,current_price,is_active,total_quantity,Unnamed: 5_level_1
0,Sesame Paste Ramen 5 Pack,12.9,True,300,"[public-goods-food, food, public-goods, dry-so..."
1,Organic Linden Flower Raw Honey,9.9,True,200,"[public-goods-food, food, kitchen-dining, publ..."
2,Seaweed Snacks,9.9,True,200,"[public-goods-food, food, public-goods, home-e..."
3,Veggie Chips,6.9,True,200,"[public-goods-food, food, public-goods]"
4,Organic Wildflower Raw Honey,18.9,True,300,"[public-goods-food, home-essentials-new-arrivals]"
...,...,...,...,...,...
1556,Cord Hair Band 2 Pieces Set,4.9,True,288,"[hair-care, makeup]"
1557,"Pine Shelf Unit - Cross Bar - Large (33.9"")",15.0,True,288,[pine-shelving-unit]
1558,SUS Shelving Unit - Walnut - Regular - Medium,350.0,True,288,[home-essentials-new-arrivals]
1559,Heatproof Glass Pot - 25.3 oz,29.9,True,288,[glassware]


In [49]:
num_users = 10000

def generate_user_profile(user_id):
    age = random.randint(18, 65)
    gender = random.choices(['Male', 'Female'], weights=(40, 60))
    income = random.choice(['Low', 'Medium', 'High'])
    location = random.choices(['New York', 'Boston', 'Portland'], weights=(50, 30, 20))
    
    # Simulate user preferences based on demographics
    preferences = {
        'stationery': random.random(),
        'travel': random.random(),
        'home': random.random(),
        'skin-care': random.random(),
        'fashion': random.random()
    }
    
    return {
        'user_id': user_id,
        'age': age,
        'gender': gender,
        'income': income,
        'location': location,
        'preferences': preferences
    }

# Create a list of user profiles
users = [generate_user_profile(i) for i in range(1, num_users + 1)]

# Convert to DataFrame
users_df = pd.DataFrame(users)
users_df


Unnamed: 0,user_id,age,gender,income,location,preferences
0,1,43,[Female],Medium,[Portland],"{'stationery': 0.43164236501761877, 'travel': ..."
1,2,40,[Male],Low,[Portland],"{'stationery': 0.20691245304169248, 'travel': ..."
2,3,24,[Female],Low,[New York],"{'stationery': 0.7799789086200667, 'travel': 0..."
3,4,24,[Female],High,[Boston],"{'stationery': 0.8085985726666195, 'travel': 0..."
4,5,21,[Female],Low,[Boston],"{'stationery': 0.5195192464244853, 'travel': 0..."
...,...,...,...,...,...,...
9995,9996,49,[Male],Medium,[Portland],"{'stationery': 0.6233782021777211, 'travel': 0..."
9996,9997,47,[Female],Low,[Portland],"{'stationery': 0.6701742600221068, 'travel': 0..."
9997,9998,22,[Male],High,[New York],"{'stationery': 0.469384964302511, 'travel': 0...."
9998,9999,47,[Male],Low,[New York],"{'stationery': 0.978274611111294, 'travel': 0...."


In [50]:
STATIONERY = {
    "desk-accessories", "notebook", "binders-looseleaf-paper", "sticky-notes", 
    "office-supplies", "stationery", "calendar-planners", "pens", 
    "paper-goods", "card-memo", "markers-highlighters", "pen-pencils", 
    "pencils-erasers"
}

HOME = {
    "sofas-bean-bags", "bedding", "cleaning-tools", "towels-and-bathmats", 
    "housekeeping-goods", "pine-shelving-unit", "home-accessories", 
    "closet-organizer", "bed-pads-fitted-sheets", "home-clearance", 
    "laundry", "everyday-tableware", "kitchen-utensils", "home-essentials-new-arrivals", 
    "home", "bathroom", "shelves", "pillows-pillowcases", "hooks-hangers", 
    "sus-steel-shelf-set", "tableware", "reed-diffusers", "aroma-fragrances", 
    "duvets-duvet-covers", "acrylic-storages", "sus-shelving-unit-light-gray", 
    "bamboo-furniture-storage-shelves-racks", "kitchen-organizers", "chairs", 
    "bathroom-organizer", "acacia-dishware", "storage-organizers", 
    "stacking-shelves", "lounge-chairs", "sus-shelving-unit-oak", 
    "albums-photo-frame", "cushions-cushion-covers", "dining-utensils", 
    "stools-benches", "cleaning", "sofas", "kitchen-dining", 
    "kitchen-appliances", "office-storage", "dining-tables", "dining-chairs", 
    "glassware"
}

TRAVEL = {
    "new-hard-shell-check-in-suitcases", "passport-safety-cases", 
    "travel-accessories", "travel-essentials", "hard-shell-carry-on-suitcases-36l", 
    "63l-suitcase", "travel-containers", "travel", "packing-organizers", 
    "luggage", "88l-suitcase", "toiletry-cases"
}

SKIN_CARE = {
    "makeup-accessories", "muji-skincare", "moisturising", "personal-care", 
    "health-beauty", "public-goods-personal-care-home-products", 
    "hair-care", "essential-oils", "toning-water", "cleansing-oil-soap"
}

FASHION = {
    "garment-organization-cases", "mens-denim", "t-shirts-shorts-sale", 
    "womens-innerwear", "womens-loungewear", "mens-t-shirts", "womens-shorts-skirts", 
    "womens-tops-shirts", "womens-clearance", "accessories", "mens-tops", 
    "women", "mens-clearance", "shoes", "sandals", "mens-innerwear", 
    "dresses", "unisex", "womens-outerwear", "pajamas-loungewear", 
    "arts-fashion", "mens-bottom", "umbrellas-raincoats", "womens-tops", 
    "mens-outerwear", "shirt-polos-sale", "apparel-sale", "activewear", 
    "socks", "workwear", "hemp-linen-clothing", "bags", "spring-scarves", 
    "mens", "hats", "mens-shirts-polos", "slippers", "mens-shorts", 
    "womens-t-shirts", "mens-loungewear", "womens-denim", "womens-bottoms", 
    "health"
}


In [51]:
# Mapping of product attributes to user preferences
def calculate_purchase_likelihood(user_preferences, product_collection):
    product_collection = set(product_collection)
    likelihood = 0
    if product_collection.intersection(STATIONERY):
        likelihood += user_preferences['stationery']
    if product_collection.intersection(TRAVEL):
        likelihood += user_preferences['travel']
    if product_collection.intersection(HOME):
        likelihood += user_preferences['home']
    if product_collection.intersection(SKIN_CARE):
        likelihood += user_preferences['skin-care']
    if product_collection.intersection(FASHION):
        likelihood += user_preferences['fashion']
    return likelihood

# Calculating purchase likelihood for a single user and product
user_preferences = users_df.iloc[0]['preferences']
product_collection = joined_df.iloc[20]['collection']
purchase_likelihood = calculate_purchase_likelihood(user_preferences, product_collection)
print(user_preferences)
print(product_collection)
print(purchase_likelihood)

{'stationery': 0.43164236501761877, 'travel': 0.9181176713843808, 'home': 0.3723683269421547, 'skin-care': 0.34784777945428724, 'fashion': 0.5506583250400211}
['bedding', 'back-to-school-dorm-room-essentials', 'duvets-duvet-covers', 'limited-time-offers', 'home']
0.3723683269421547
