In [83]:
import pandas as pd
import numpy as np
import random

In [128]:
class Simulator:
    """
    Class to generate fake data
    TODO: 
        figure out seed setting
        generate correlated distributions
        generate time series
        linear / non linear combination of variables
            generate coefficients
        add a UI?
        add option to add NAs to certain datapoints
    """
    
    def __init__(self, num_rows: int, seed = 42):
        """
        """
        
        self.num_rows = num_rows
        self.seed = np.random.seed(seed)
    
    def generate_categorical_column(self, num_categories: int, distribution = 'uniform'):
        """
        Returns a Pandas Series with num_rows
        """
        return pd.Series(data = self.generate_categorical_distribution(num_categories), index = range(self.num_rows), dtype= 'object')
    
    def generate_categorical_distribution(self, num_categories: int, name: str = 'color', distribution = 'uniform'):
        """
        Specify a categorical feature's distribution
        By default the distribution is uniform - all categories will have an equal chance of being drawn
        User can also pass a size [num_categories] array of probabilities for each category
        """

        categories = [name + "_" + str(i+1) for i in range(4)]
        if distribution == 'uniform':
            np.random.seed(self.seed)
            categorical_distribution = np.random.choice(categories, size = self.num_rows)

        elif type(distribution) == list or type(distribution) == np.array:
            categorical_distribution = np.random.choice(categories, size = self.num_rows, p = distribution)
        
        return categorical_distribution
    
    def generate_numerical_column(self, distribution_params = ['normal', 0, 1]):
        """
        """
        return pd.Series(data = self.generate_numerical_distribution(distribution_params), index = range(self.num_rows), dtype = float)
    
    def generate_numerical_distribution(self, distribution_params = ['normal', 0, 1]):
        """
        params: list to specify a numerical distribution of format ['distribution_name', first_param, second_param]
        Example: distribution_params = ['binomial',1, 0.5]
        
        TODO: add other moments/params
                what happens if too many params are assigned?
        """
        dist_type = distribution_params[0]

        if dist_type == 'normal':
            numerical_distribution = np.random.normal(distribution_params[1], distribution_params[2], size = self.num_rows)
        elif dist_type == 'binomial':
            numerical_distribution = np.random.binomial(distribution_params[1], distribution_params[2], size = self.num_rows)
        elif dist_type == 'beta':
            numerical_distribution = np.random.beta(distribution_params[1], distribution_params[2], size = self.num_rows)
        elif dist_type == 'poisson':
            numerical_distribution = np.random.poisson(distribution_params[1], size = self.num_rows)
        elif dist_type == 'logistic':
            numerical_distribution = np.random.logistic(distribution_params[1], distribution_params[2], size = self.num_rows) 

        # catch errors
        if dist_type not in ['normal', 'binomial', 'beta', 'poisson', 'logistic']:
            raise Exception("Please choose a valid probability distribution")
        if len(distribution_params) < 2:
            raise Exception("Please include parameters")

        return numerical_distribution

    def generate_dataframe():
        """
        """
    
    

In [129]:
a = Simulator(num_rows = 100)

In [130]:
a.generate_categorical_column(num_categories = 4, distribution = [0.25, 0.3, 0.25, 0.2])

0     color_2
1     color_2
2     color_4
3     color_1
4     color_2
       ...   
95    color_2
96    color_2
97    color_1
98    color_3
99    color_3
Length: 100, dtype: object

In [131]:
a.generate_numerical_column(['dirichlet'])

Exception: Please choose a valid probability distribution