# **Synthetic Dataset Generator**
### **Input Data**
#### Fill the *association_rules* array as in the example below
Each entry is a tuple in which:
- The 1<sup>st</sup> element is the **left** part of the Association Rule
- The 2<sup>nd</sup> element is the **right** part of the Association Rule
- The 3<sup>rd</sup> element is the **confidence** of the Association Rule (it defines how likely the rule is to be picked when generating a basket)
- The 4<sup>th</sup> element is the **interest** of the Association Rule (currently it is not used)

#### Basket *Dataset size*
In the other cell you can define the number of baskets to generate.

In [697]:
# Each rule is a tuple: (left side, right side, confidence, interest).
association_rules = [
    (['butter', 'curd'], ['milk'], 67, 67),
    (['curd', 'milk'], ['butter'], 23, 22),
    (['butter', 'whipped/sour'], ['milk'], 67, 66),
    (['butter', 'milk'], ['yogurt'], 23, 22),
    (['butter', 'yogurt'], ['milk'], 64, 64),
    (['curd', 'whipped/sour'], ['milk'], 54, 53),
    (['curd', 'milk'], ['yogurt'], 34, 33),
    (['curd', 'yogurt'], ['milk'], 58, 58),
    (['frozen', 'yogurt'], ['milk'], 50, 50),
    (['frozen', 'milk'], ['yogurt'], 23, 22),
    (['rolls/buns', 'yogurt'], ['milk'], 42, 42),
    (['whipped/sour', 'yogurt'], ['milk'], 55, 55),
    (['milk', 'whipped/sour'], ['yogurt'], 34, 34),
    (['onions', 'other'], ['vegetables'], 40, 39),
    (['onions', 'vegetables'], ['other'], 60, 59),
    (['vegetables', 'yogurt'], ['whipped/sour'], 22, 21),
    (['whipped/sour', 'yogurt'], ['vegetables'], 26, 25),
    (['vegetables', 'whipped/sour'], ['yogurt'], 31, 31),
    (['UHT-milk'], ['vegetables'], 20, 19),
    (['liquor'], ['beer'], 47, 47),
    (['red/blush'], ['beer'], 38, 37),
    (['berries'], ['fruit'], 37, 36),
    (['berries'], ['other'], 21, 20),
    (['butter'], ['milk'], 46, 44),
    (['butter'], ['vegetables'], 20, 17),
    (['candy'], ['chocolate'], 20, 19),
    (['curd'], ['milk'], 47, 45),
    (['curd'], ['yogurt'], 27, 26),
    (['dessert'], ['milk'], 28, 27),
    (['domestic'], ['milk'], 24, 22),
    (['eggs'], ['rolls/buns'], 24, 22),
    (['frozen'], ['milk'], 24, 21),
    (['grapes'], ['fruit'], 42, 41),
    (['onions'], ['fruit'], 25, 24),
    (['other'], ['fruit'], 24, 19),
    (['pip'], ['fruit'], 37, 35),
    (['root'], ['fruit'], 37, 34),
    (['fruit/vegetable'], ['soda'], 24, 22),
    (['grapes'], ['other'], 28, 27),
    (['hard'], ['milk'], 29, 28),
    (['herbs'], ['other'], 47, 47),
    (['herbs'], ['vegetables'], 43, 42),
    (['sliced'], ['milk'], 36, 36),
    (['soft'], ['milk'], 35, 34),
    (['whipped/sour'], ['milk'], 46, 44),
    (['yogurt'], ['milk'], 43, 39),
    (['milk'], ['yogurt'], 20, 12),
    (['misc.'], ['soda'], 25, 25),
    (['onions'], ['other'], 45, 45),
    (['onions'], ['vegetables'], 30, 29),
    (['other'], ['vegetables'], 24, 19),
    (['packaged'], ['vegetables'], 43, 43),
    (['pastry'], ['rolls/buns'], 20, 17),
    (['sliced'], ['yogurt'], 21, 21),
    (['water'], ['soda'], 26, 23),
    (['whipped/sour'], ['vegetables'], 24, 22),
    (['vegetables'], ['whole'], 25, 15),
    (['whole'], ['vegetables'], 38, 31),
    (['whipped/sour'], ['yogurt'], 28, 26),
]

In [698]:
# Number of baskets to generate.
dataset_size = 10_000

In [699]:
# Explicit product pool; it can be handed to generateDataset as `source`.
products_available = [
    'AA',
    'BB',
    'CC',
    'DD',
    'EE',
]

---
### **Import** section

In [700]:
import pandas as pd
import numpy as np
import random
import math

---
### **Function** definitions

In [701]:
# Display the data
def displayAssociationRules(ars):
    """Show the association rules as a table, highest confidence first.

    Parameters
    ----------
    ars : list of tuple
        Rules in the form (left side, right side, confidence, interest).

    Notes
    -----
    Relies on the notebook-provided ``display`` function to render the table.
    """
    # Passing the column names to the constructor also works for an empty
    # rule list, where assigning `.columns` afterwards would raise.
    arules = pd.DataFrame(ars, columns=["From", "To", "Confidence", "Interest"])
    # sort_values returns a new DataFrame; the result must be kept, otherwise
    # the rules are shown in their original order.
    arules = arules.sort_values("Confidence", ascending=False)
    display(arules)

In [702]:
# Given the Association Rules, get the set of all products
# In order to be compatible with the rest of the script, it returns a list object (not a set)
def getProducts(dataset):
    """Return every distinct product named in the association rules.

    Parameters
    ----------
    dataset : iterable of tuple
        Association rules; element 0 is the left side and element 1 the
        right side, each an iterable of product names.

    Returns
    -------
    list
        The distinct products, in order of first appearance.
    """
    products = []
    for ar in dataset:
        products.extend(ar[0])
        products.extend(ar[1])
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would order strings differently from one interpreter run to the next,
    # which makes seeded sampling from this list non-reproducible.
    return list(dict.fromkeys(products))

&nbsp;
#### **Basket Generation**
There are two ways to generate a basket. The *which* variable selects one of them at random: the Association Rule-based way is chosen with probability *arbased* (0.5 by default), the completely random way otherwise.
- **Completely random**: it samples *k* elements from the whole list of products, where *k* is a random variable with the highest probability at 1 and the lowest at *n*, the number of available products. It is a bounded, monotonically decreasing distribution.
- **Based on the Association Rules**: it picks one of the association rules with a probability given by their confidences. Then it selects *k*, the number of items taken from the left side, a random variable centered on half of the left side size. Finally, it decides whether or not to include the right item in the basket. This probability is also given by the confidence of the chosen Association Rule.

In [703]:
def _sampleRandomProducts(products, exponent):
    """Draw a random, non-empty subset of *products*, favouring small subsets.

    The subset size k is drawn from 1..n (n = len(products)) with a weight of
    (n - k)**exponent + 1, so size 1 is the most likely and size n the least.
    """
    n = len(products)
    if n == 0:
        raise ValueError("cannot sample a basket from an empty product list")
    weights = [i**exponent + 1 for i in range(n)]
    total = sum(weights)
    # Reversed so that the largest weight lands on the smallest size.
    size_probs = [w / total for w in weights][::-1]
    k = int(np.random.choice(list(range(1, n + 1)), p=size_probs))
    return random.sample(products, k)


def generateBasket(ars, arbased, mixed=False, prod_AR_source=True, source=None):
    """Generate one basket, either fully at random or from an association rule.

    Parameters
    ----------
    ars : list of tuple
        Association rules as (left side, right side, confidence, interest).
    arbased : float
        Probability (0..1) of building the basket from an association rule
        instead of sampling it completely at random.
    mixed : bool
        When the basket is rule-based, also add a random sample of products.
    prod_AR_source : bool
        If True the product pool is extracted from `ars`; otherwise `source`
        is used as the pool.
    source : list, optional
        Product pool used when `prod_AR_source` is False.

    Returns
    -------
    list
        The products in the basket, without duplicates, in the order in which
        they were drawn.

    Raises
    ------
    ValueError
        If `prod_AR_source` is False and `source` is None, if the random pool
        is empty when it is needed, or if a rule-based basket is requested
        with no rules.
    """
    if prod_AR_source:
        products = getProducts(ars)
    else:
        if source is None:
            raise ValueError("source must be provided when prod_AR_source is False")
        products = source

    # 0 --> Completely Random
    # 1 --> Association Rule - Based
    which = np.random.choice([0, 1], p=[1 - arbased, arbased])

    if which == 0:
        prod = _sampleRandomProducts(products, 5)
    else:
        if not ars:
            raise ValueError("at least one association rule is required")

        # Normalise the confidences so they form a probability distribution
        # over the rules, then pick one rule accordingly.
        raw_confidences = [ar[2] for ar in ars]
        conf_sum = sum(raw_confidences)
        confidences = [conf / conf_sum for conf in raw_confidences]
        index = np.random.choice(list(range(len(ars))), p=confidences)
        left_side = ars[index][0]
        right_side = ars[index][1]
        left_size = len(left_side)

        # Distribution of how many left-side items to take (at least one):
        # weights rise over the first floor(left_size / 2) sizes and fall
        # over the remaining ones.
        increasing = [5 * i for i in range(2, 2 + math.floor(left_size / 2))]
        decreasing = [2 * i - 1.0 / (2 * i)
                      for i in range(math.ceil(left_size / 2), 0, -1)]
        size_dist = increasing + decreasing
        size_total = sum(size_dist)
        size_dist = [w / size_total for w in size_dist]

        # Pick k, and k products on the left side
        k = int(np.random.choice(list(range(1, left_size + 1)), p=size_dist))
        prod = random.sample(left_side, k)

        # NOTE(review): this uses the *normalised* confidence (confidence
        # divided by the sum over all rules), not the rule's own confidence
        # as a percentage, so with many rules the right side is rarely
        # added. Confirm whether ar[2] / 100 was intended.
        right_prob = confidences[index]
        included = np.random.binomial(size=1, n=1, p=right_prob)[0]
        if included == 1:
            # Add the whole consequent, not only its first item.
            prod.extend(right_side)

        if mixed:
            # Add some random products on top of the rule-based ones.
            prod.extend(_sampleRandomProducts(products, 9))

    # Remove duplicates while keeping a deterministic (first-drawn) order.
    return list(dict.fromkeys(prod))

In [704]:
def generateDataset(ars, size, arbased=0.5, mixed=False, prod_AR_source=True, source=None):
    """Build a synthetic dataset of `size` baskets.

    Every basket is produced by an independent call to generateBasket, which
    receives `ars`, `arbased`, `mixed`, `prod_AR_source` and `source`
    unchanged. Returns the baskets as a list.
    """
    return [
        generateBasket(ars, arbased, mixed, prod_AR_source=prod_AR_source, source=source)
        for _ in range(size)
    ]

#### **Save to file**

In [705]:
def export(ds, fname, aligned=True, with_title=False):
    """Write the baskets to a comma-separated text file, one basket per line.

    Parameters
    ----------
    ds : list of list of str
        The baskets to write.
    fname : str
        Path of the output file; an existing file is overwritten.
    aligned : bool
        If True, shorter baskets are padded with empty fields so that every
        row has as many fields as the largest basket. If False, each row
        contains only the basket's own items.
    with_title : bool
        If True, a header row "Product1,Product2,..." is written first, with
        one column per item of the largest basket.

    Notes
    -----
    Items are written verbatim: no quoting or escaping is applied, so an item
    that contains a comma produces extra fields.
    """
    # Get the maximum number of items in a basket (0 for an empty dataset)
    max_basket_size = max((len(basket) for basket in ds), default=0)

    # The context manager closes the file even if a write fails.
    with open(fname, "w") as file:
        # Write the title if required
        if with_title:
            header = ["Product" + str(i) for i in range(1, max_basket_size + 1)]
            file.write(",".join(header) + "\n")

        # To CSV
        for basket in ds:
            fields = list(basket)
            if aligned:
                fields.extend([""] * (max_basket_size - len(fields)))
            file.write(",".join(fields) + "\n")

---
## **Execution**

In [706]:
# Generate the baskets. The size comes from dataset_size instead of a repeated
# literal, so changing it in the input section actually takes effect.
# `source` is ignored while prod_AR_source is True.
dataset = generateDataset(
    association_rules,
    dataset_size,
    arbased=0.6,
    mixed=True,
    prod_AR_source=True,
    source=products_available,
)

In [707]:
# Write the generated baskets to disk, with a header row.
export(
    ds=dataset,
    fname="./data/sintetic/test_12.csv",
    with_title=True,
)

---