# Generating synthetic data notebook

**Author**: D.Kufel

**Date**: 08/21/19

In this notebook we generate the synthetic data for the implementation of the online stationary algorithm. The data may be generated using one of three methods: beta distribution (Method 1), probability distribution over customer types with no preference for items (Method 3, "simple"), or probability distribution over customer types and products (Method 2, "moderate"). For the beta distribution, the probability that a customer of a given type buys a particular item is sampled from a beta distribution with potentially different shape parameters for each item. The code for Methods 2 and 3 is similar to the code for Method 1, so it is not commented thoroughly.

In [8]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

## Method 1 - beta distribution

### Dataset containing past purchases of the customers

In [9]:
n=200 #number of customers in the past data
d=4 #number of items
m=5 #number of customer types
np.random.seed(51) #fixed seed so that the generated dataset is reproducible

#parameters for the shape of the beta distribution we probe; entry k of each list belongs to item k
alpha=[4,3,2,2]
beta=[2,2,3,4]
#preference matrix of shape (m,d): ptypes[t][k] is the probability that a customer of type t buys item k.
#Column k is drawn from Beta(alpha[k],beta[k]), so every item uses its own pair of shape parameters.
ptypes=np.column_stack([np.random.beta(alpha[item],beta[item],size=m) for item in range(0,d)])

#arrival_rates=[0.4,0.2,0.2,0.1,0.1]
arrival_rates=[0.2,0.2,0.2,0.2,0.2] #arrival rates for different customer types
customertypelist=np.arange(0,m)
typelist=[]
#total reward matrix (one row per customer, one column per item); it is allocated once and filled in place,
#because stacking a new row onto the array for every customer would copy the whole array each time
finalarray=np.zeros((n,d))
for customer in range(0,n):
    currenttype=np.random.choice(customertypelist,p=arrival_rates) #probing the arrival rates distribution
    typelist.append(currenttype)
    #below we find the rewards: {0,1} with probabilities as defined above
    for product in range(0,d):
        buy_probability=ptypes[currenttype][product]
        finalarray[customer,product]=np.random.choice(2,p=[1-buy_probability,buy_probability])

typelist=np.array(typelist) #customer type of every row of finalarray

In [10]:
#store the past-customer data on disk (np.save appends the .npy extension to the given names)
np.save(file='pastcustomerdata_beta_rewards',arr=finalarray) #reward matrix
np.save(file='pastcustomerdata_beta_customertype',arr=typelist) #customer type of each row

### Live dataset the algorithm is supposed to learn

In [11]:
new_n=10000 #number of customers which will arrive
d=4 #number of items
m=5 #number of customer types
#parameters for the shape of the probability distribution; entry k of each list belongs to item k
new_alpha=[4,3,2,2]
new_beta=[2,2,3,4]
np.random.seed(71) #fixed seed so that the generated dataset is reproducible

#preference matrix of shape (m,d): new_ptypes[t][k] is the probability that a customer of type t buys item k.
#Column k is drawn from Beta(new_alpha[k],new_beta[k]), so every item uses its own pair of shape parameters.
new_ptypes=np.column_stack([np.random.beta(new_alpha[item],new_beta[item],size=m) for item in range(0,d)])

new_arrival_rates=np.copy(arrival_rates) #arrival_rates has to be defined by an earlier cell
new_customertypelist=np.arange(0,m)
new_typelist=[]
#total reward matrix (one row per customer, one column per item); it is allocated once and filled in place,
#because stacking a new row onto the array for every customer would copy the whole array each time
new_finalarray=np.zeros((new_n,d))
for customer in range(0,new_n):
    currenttype=np.random.choice(new_customertypelist,p=new_arrival_rates) #type of the arriving customer
    new_typelist.append(currenttype)
    #rewards: {0,1}, where 1 is drawn with the probability stored in new_ptypes
    for product in range(0,d):
        buy_probability=new_ptypes[currenttype][product]
        new_finalarray[customer,product]=np.random.choice(2,p=[1-buy_probability,buy_probability])

new_typelist=np.array(new_typelist) #customer type of every row of new_finalarray

In [12]:
#store the live-customer data on disk (np.save appends the .npy extension to the given names)
np.save(file='newcustomerdata_beta_rewards',arr=new_finalarray) #reward matrix
np.save(file='newcustomerdata_beta_customertype',arr=new_typelist) #customer type of each row

## Method 2 - moderate

In [6]:
n=100 #number of customers in the past data
d=4 #number of items
m=5 #number of customer types
np.random.seed(3)

#ptypesK[t] = [probability of reward 0, probability of reward 1] of item K for customer type t
ptypes0=[[0.1,0.9],[0.2,0.8],[0.3,0.7],[0.4,0.6],[0.5,0.5]]
ptypes1=[[0.2,0.8],[0.4,0.6],[0.5,0.5],[0.6,0.4],[0.9,0.1]]
ptypes2=[[0.6,0.4],[0.5,0.5],[0.4,0.6],[0.8,0.2],[0.9,0.1]]
ptypes3=[[0.7,0.3],[0.8,0.2],[0.9,0.1],[0.95,0.05],[0.95,0.05]]
ptypes=[ptypes0,ptypes1,ptypes2,ptypes3] #indexed as ptypes[item][customer type]

arrival_rates=[0.4,0.2,0.2,0.1,0.1] #arrival rates for different customer types
customertypelist=np.arange(0,m)
drawn_types=[]
reward_rows=[]
for _ in range(n):
    #first draw the type of the arriving customer, then one {0,1} reward per item
    currenttype=np.random.choice(customertypelist,p=arrival_rates)
    drawn_types.append(currenttype)
    reward_rows.append([np.random.choice(2,p=ptypes[product][currenttype],size=(1,1))[0,0] for product in range(d)])

#reward matrix with one row per customer and one column per item, and the type of every row
finalarray=np.array(reward_rows,dtype=float).reshape(n,d)
typelist=np.array(drawn_types)

np.save('pastcustomerdata_moderate_rewards',finalarray)
np.save('pastcustomerdata_moderate_customertype',typelist)

In [7]:
new_n=10000 #number of customers which will arrive
d=4 #number of items
m=5 #number of customer types
np.random.seed(340)

#new_ptypesK[t] = [probability of reward 0, probability of reward 1] of item K for customer type t
new_ptypes0=[[0.05,0.95],[0.25,0.75],[0.25,0.75],[0.3,0.7],[0.6,0.4]]
new_ptypes1=[[0.15,0.85],[0.45,0.55],[0.5,0.5],[0.55,0.45],[0.85,0.15]]
new_ptypes2=[[0.7,0.3],[0.55,0.45],[0.55,0.45],[0.8,0.2],[0.9,0.1]]
new_ptypes3=[[0.75,0.25],[0.75,0.25],[0.85,0.15],[0.9,0.1],[0.95,0.05]]
new_ptypes=[new_ptypes0,new_ptypes1,new_ptypes2,new_ptypes3] #indexed as new_ptypes[item][customer type]

new_arrival_rates=np.copy(arrival_rates) #arrival_rates has to be defined by an earlier cell
new_customertypelist=np.arange(0,m)
new_typelist=[]
#the reward matrix (one row per customer, one column per item) is allocated once and filled in place;
#stacking a new row onto the array for every customer would copy the whole array each time
new_finalarray=np.zeros((new_n,d))
for customer in range(0,new_n):
    currenttype=np.random.choice(new_customertypelist,p=new_arrival_rates) #type of the arriving customer
    new_typelist.append(currenttype)
    for product in range(0,d):
        #same draw as before (one sample from {0,1}), written directly into the matrix
        new_finalarray[customer,product]=np.random.choice(2,p=new_ptypes[product][currenttype],size=(1,1))[0,0]

new_typelist=np.array(new_typelist) #customer type of every row of new_finalarray

np.save('newcustomerdata_moderate_rewards',new_finalarray)
np.save('newcustomerdata_moderate_customertype',new_typelist)

## Method 3 - simple

In [8]:
n=50 #number of customers in the past data
d=4 #number of items
m=5 #number of customer types
np.random.seed(112)
#ptypes[t] = [probability of reward 0, probability of reward 1] for customer type t, the same for every item
ptypes=[[0.2,0.8],[0.3,0.7],[0.5,0.5],[0.8,0.2],[0.9,0.1]]

arrival_rates=[0.4,0.2,0.2,0.1,0.1] #arrival rates for different customer types
customertypelist=np.arange(0,m)
sampled_types=[]
sampled_rows=[]
for _ in range(n):
    #draw the type of the arriving customer, then all d rewards of that customer in one call
    currenttype=np.random.choice(customertypelist,p=arrival_rates)
    sampled_types.append(currenttype)
    sampled_rows.append(np.random.choice(2,p=ptypes[currenttype],size=(1,d))[0])

#reward matrix with one row per customer and one column per item, and the type of every row
finalarray=np.array(sampled_rows,dtype=float).reshape(n,d)
typelist=np.array(sampled_types)

np.save('pastcustomerdata_simple_rewards',finalarray)
np.save('pastcustomerdata_simple_customertype',typelist)

In [9]:
new_n=1000 #number of customers which will arrive
d=4 #number of items
m=5 #number of customer types
np.random.seed(911)
#new_ptypes[t] = [probability of reward 0, probability of reward 1] for customer type t, the same for every item
new_ptypes=[[0.3,0.7],[0.2,0.8],[0.5,0.5],[0.7,0.3],[0.7,0.3]]

#alternative set of probabilities:
#new_ptypes=[[0.15,0.85],[0.4,0.6],[0.6,0.4],[0.75,0.25],[0.7,0.3]]

new_arrival_rates=np.copy(arrival_rates) #arrival_rates has to be defined by an earlier cell
new_customertypelist=np.arange(0,m)
new_typelist=[]
#the reward matrix (one row per customer, one column per item) is allocated once and filled in place;
#stacking a new row onto the array for every customer would copy the whole array each time
new_finalarray=np.zeros((new_n,d))
for customer in range(0,new_n):
    currenttype=np.random.choice(new_customertypelist,p=new_arrival_rates) #type of the arriving customer
    new_typelist.append(currenttype)
    #the live data is sampled from new_ptypes, i.e. from the probabilities defined in this cell
    new_finalarray[customer]=np.random.choice(2,p=new_ptypes[currenttype],size=d)

new_typelist=np.array(new_typelist) #customer type of every row of new_finalarray

np.save('newcustomerdata_simple_rewards',new_finalarray)
np.save('newcustomerdata_simple_customertype',new_typelist)