In [1]:
import numpy as np
import pandas as pd
import json
from datetime import datetime, timedelta

In [26]:
with open('../inputs/input.json') as json_file:
    params = json.load(json_file)
    
params

{'dates': {'date': {'start': '2021-02-01', 'end': '2021-02-14'}},
 'categorical': {'country': {'categories': ['UK', 'DE', 'FR', 'IT'],
   'probs': [0.31, 0.27, 0.23, 0.19],
   'indices': [0.98, 1.03, 0.99, 1.02]},
  'platform': {'categories': ['android', 'ios', 'web'],
   'probs': [0.41, 0.29, 0.3],
   'indices': [0.99, 1.06, 0.94]}},
 'values': {'payment': {'distribution': 'binomial',
   'parameters': {'n': 1, 'p': 0.2},
   'indices': False},
  'amount': {'distribution': 'exponential',
   'parameters': {'scale': 10},
   'indices': True,
   'min': 1,
   'round': 2}},
 'dependencies': {'amount': [{'from': 'payment',
    'condition': '==0',
    'value': 0}]}}

In [33]:
def map_indices(params):
    keys = params['categories']
    values = params['indices']
    
    mapper = {}
    
    for i in range(len(keys)):
        mapper[keys[i]] = values[i]
    
    return mapper



def get_indices(rows, params, data_dict):
    indices = np.ones(rows)
    
    for column in params['categorical'].keys():
        if 'indices' in params['categorical'][column].keys():
            mapper = map_indices(params['categorical'][column])
            
            indices_column = np.array([mapper[x] for x in data_dict[column]])
            
            indices = indices * indices_column
            
    return indices



def get_dates(rows, params, seed):
    # set randomiser
    rand = np.random.default_rng(seed)
    
    # if list of possible dates provided, use it
    if 'values' in params.keys():
        dates_list = params['values']
    
    # else build the list using start and end date
    else:
        # get start and end dates
        start = datetime.strptime(params['start'], '%Y-%m-%d')
        end = datetime.strptime(params['end'], '%Y-%m-%d')
        
        # get list of possible dates
        dates_list = [(start + timedelta(x)).strftime('%Y-%m-%d') for x in range((end - start).days + 1)]
    
    # use probabilities of values, if given
    if 'probs' in params.keys():
        dates = rand.choice(a=dates_list, p=params['probs'], size=rows)
    
    # else assume they have equal probability
    else:
        dates = rand.choice(a=dates_list, size=rows)
    
    return dates



def get_categorical(rows, params, seed):
    rand = np.random.default_rng(seed)
    
    # use probabilities of values, if given
    if 'probs' in params.keys():
        cat = rand.choice(a=params['categories'], p=params['probs'], size=rows)
    
    # else assume they have equal probability
    else:
        cat = rand.choice(a=params['categories'], p=params['probs'], size=rows)
        
    return cat



def get_values(rows, params, indeces, seed):
    rand = np.random.default_rng(seed)
    
    distribution = params['distribution']
    distribution_params = params['parameters'].copy()
    
    
    if params['indices'] == False:
        distribution_params_str = ', '.join([key + '=' + str(distribution_params[key]) for key in distribution_params.keys()])
        values = eval('rand.' + distribution + '(' + distribution_params_str + ', size={})'.format(rows))
    
    #elif distribution = 'binomial':
    else:
        distribution_params_str = ', '.join([key + '=' + str(distribution_params[key]) for key in distribution_params.keys()])
        values = eval('rand.' + distribution + '(' + distribution_params_str + ', size={})'.format(rows))
        values = values * indeces
    
    
    # round values
    if 'round' in params.keys():
        values = np.around(values, decimals=params['round'])
 
    # set everything lower than the min to the min
    if 'min' in params.keys():
        values[values < 1] = 1
    
    return values



def create_dataset(rows, params, seed=1):
    # create dict to sore data to
    data_dict = {}
    
    # generate dates
    if 'dates' in params.keys():
        for key in params['dates'].keys():
            data_dict[key] = get_dates(rows=rows, params=params['dates'][key], seed=seed)

    # generate categorical observations
    if 'categorical' in params.keys():
        for key in params['categorical'].keys():
            data_dict[key] = get_categorical(rows=rows, params=params['categorical'][key], seed=seed)

    # use indices
    indeces = get_indices(rows=rows, params=params, data_dict=data_dict)
            
    # generate values
    if 'values' in params.keys():
        for key in params['values'].keys():
            data_dict[key] = get_values(rows=rows, params=params['values'][key], indeces=indeces, seed=seed)
    
    # create data frame from dict
    data = pd.DataFrame(data_dict)
    
    # check dependencies
    if 'dependencies' in params.keys():
        for column in params['dependencies'].keys():
            for condition in params['dependencies'][column]:
                data.loc[eval("data['{from}'] {condition}".format(**condition)), column] = condition['value']
    
    return data

In [34]:
df = create_dataset(rows=10000, params=params, seed=1)

In [35]:
df.head()

Unnamed: 0,date,country,platform,payment,amount
0,2021-02-07,DE,ios,0,0.0
1,2021-02-08,IT,web,1,2.96
2,2021-02-11,UK,android,0,0.0
3,2021-02-14,IT,web,1,3.51
4,2021-02-01,DE,android,0,0.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      10000 non-null  object 
 1   country   10000 non-null  object 
 2   platform  10000 non-null  object 
 3   payment   10000 non-null  int64  
 4   amount    10000 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 390.8+ KB


### !!! I still need to add:
* add functionality to apply indices for binomial distribution (i.e. increase / decrease chances of different categories)