In [83]:
import pandas as pd
import numpy as np
import random

In [193]:
class Simulator:
    """
    Generate fake tabular data (numeric and categorical columns).

    All random draws come from a generator owned by the instance and seeded
    once in ``__init__``. Two simulators built with the same (non-None) seed
    and asked for the same columns in the same order return identical data.

    TODO:
        generate correlated distributions
        generate time series
        linear / non linear combination of variables
            generate coefficients
        add a UI?
        add option to add NAs to certain datapoints
        emphasis on creating dependent variables
    """

    # Supported numerical distributions -> number of parameters the sampler needs.
    _NUMERICAL_DISTRIBUTIONS = {
        'normal': 2,
        'binomial': 2,
        'beta': 2,
        'poisson': 1,
        'logistic': 2,
    }

    def __init__(self, num_rows: int, seed=42):
        """
        Args:
            num_rows: number of rows every generated column / dataframe has.
            seed: seed for the random generator; ``None`` asks numpy for
                fresh entropy (non-reproducible output).
        """
        self.num_rows = num_rows
        # np.random.seed() returns None, so store the seed value itself.
        self.seed = seed
        # Kept for backward compatibility: code drawing from the global
        # np.random state after building a Simulator still finds it seeded.
        np.random.seed(seed)
        # Instance-owned legacy generator: same stream as the seeded global
        # state, but not disturbed by other users of np.random.
        self._rng = np.random.RandomState(seed)

    def generate_categorical_column(self, num_categories: int, distribution='uniform', name: str = 'color'):
        """
        Return a pandas Series of ``num_rows`` category labels.

        Args:
            num_categories: number of distinct categories to draw from.
            distribution: ``'uniform'`` or a sequence of ``num_categories``
                probabilities.
            name: Series name, also used as the prefix of every label.

        Returns:
            An object-dtype Series named ``name`` indexed ``0..num_rows-1``.
        """
        labels = self.generate_categorical_distribution(
            num_categories, distribution=distribution, name=name
        )
        return pd.Series(data=labels, index=range(self.num_rows), name=name, dtype='object')

    def generate_categorical_distribution(self, num_categories: int, distribution='uniform', name: str = 'color'):
        """
        Draw ``num_rows`` labels from the categories ``name_1 .. name_<num_categories>``.

        Args:
            num_categories: number of distinct categories.
            distribution: ``'uniform'`` (every category equally likely) or a
                one-dimensional sequence / numpy array holding one
                probability per category.
            name: prefix of the generated labels.

        Returns:
            A numpy array of ``num_rows`` labels.

        Raises:
            ValueError: if ``distribution`` is an unknown string, or the
                number of probabilities differs from ``num_categories``.
        """
        categories = [name + "_" + str(i + 1) for i in range(num_categories)]

        if isinstance(distribution, str):
            if distribution != 'uniform':
                raise ValueError(
                    "distribution should be 'uniform' or a sequence of probabilities"
                )
            return self._rng.choice(categories, size=self.num_rows)

        # Validate before sampling so the caller gets this message instead of
        # numpy's internal size-mismatch error.
        probabilities = np.asarray(distribution, dtype=float)
        if probabilities.ndim != 1 or len(probabilities) != num_categories:
            raise ValueError("The number of probabilities should match the number of categories")

        return self._rng.choice(categories, size=self.num_rows, p=probabilities)

    def generate_numerical_column(self, distribution_params=('normal', 0, 1)):
        """
        Return a float pandas Series of ``num_rows`` draws.

        Args:
            distribution_params: see ``generate_numerical_distribution``.

        Returns:
            An unnamed float Series indexed ``0..num_rows-1``.
        """
        draws = self.generate_numerical_distribution(distribution_params)
        return pd.Series(data=draws, index=range(self.num_rows), dtype=float)

    def generate_numerical_distribution(self, distribution_params=('normal', 0, 1)):
        """
        Draw ``num_rows`` values from a named numerical distribution.

        Args:
            distribution_params: sequence of the form
                ``['distribution_name', first_param, second_param]``, e.g.
                ``['binomial', 1, 0.5]``. Supported names: normal, binomial,
                beta, poisson, logistic. ``poisson`` takes a single
                parameter. Parameters beyond those the distribution needs
                are ignored.

        Returns:
            A numpy array of ``num_rows`` draws.

        Raises:
            ValueError: if the distribution name is not supported or too few
                parameters are supplied.

        TODO: add other moments/params
              add other distributions
        """
        if len(distribution_params) == 0:
            raise ValueError("Please include parameters")

        dist_type = distribution_params[0]
        if not isinstance(dist_type, str) or dist_type not in self._NUMERICAL_DISTRIBUTIONS:
            raise ValueError("Please choose a valid probability distribution")

        num_params = self._NUMERICAL_DISTRIBUTIONS[dist_type]
        params = distribution_params[1:1 + num_params]
        if len(params) < num_params:
            raise ValueError("Please include parameters")

        # The supported names match the RandomState sampler method names.
        sampler = getattr(self._rng, dist_type)
        return sampler(*params, size=self.num_rows)

    def _parse_categorical_spec(self, spec, position):
        """Normalize one categorical column spec into (num_categories, distribution, name)."""
        if isinstance(spec, (int, np.integer)):
            # A bare integer means "this many categories, uniform, default name".
            spec = [spec]
        if not 1 <= len(spec) <= 3:
            raise ValueError(
                "A categorical column should be given as "
                "num_categories or [num_categories, distribution, name]"
            )
        num_categories = spec[0]
        distribution = spec[1] if len(spec) > 1 else 'uniform'
        name = spec[2] if len(spec) > 2 else 'category_' + str(position)
        return num_categories, distribution, name

    def generate_dataframe(self, numeric_columns=None, categorical_columns=None, time_columns=None, col_names=None):
        """
        Build a dataframe of ``num_rows`` rows: numeric columns first, then
        categorical columns, then time columns.

        E.g. simulated dataset with 3 continuous and 2 categorical columns
        numeric_columns = [['normal', 0, 1], ['binomial', 1, 0.5], ['normal', 0, 0.5]]
        categorical_columns = [4, [3, [0.3, 0.5, 0.2]]]

        Args:
            numeric_columns: list of ``distribution_params`` sequences, one
                per numeric column. The generated Series are unnamed; pass
                ``col_names`` to label them.
            categorical_columns: list with one entry per categorical column,
                each either an int (number of categories, uniform) or a
                sequence ``[num_categories, distribution, name]`` whose last
                two items are optional. The default name is
                ``'category_<position>'``.
            time_columns: reserved for time-series columns; requires a
                ``generate_time_column`` method, which this class does not
                define yet.
            col_names: optional list of labels assigned to all columns.

        Returns:
            The assembled pandas DataFrame.

        Raises:
            NotImplementedError: if ``time_columns`` is non-empty and no
                ``generate_time_column`` method is available.
            ValueError: for malformed column specifications.
        """
        df = pd.DataFrame(index=range(self.num_rows))

        if numeric_columns is not None:
            for params in numeric_columns:
                df = pd.concat([df, self.generate_numerical_column(params)], axis=1)

        if categorical_columns is not None:
            for position, spec in enumerate(categorical_columns):
                num_categories, distribution, name = self._parse_categorical_spec(spec, position)
                column = self.generate_categorical_column(
                    num_categories=num_categories, distribution=distribution, name=name
                )
                df = pd.concat([df, column], axis=1)

        if time_columns is not None:
            for spec in time_columns:
                make_time_column = getattr(self, 'generate_time_column', None)
                if make_time_column is None:
                    raise NotImplementedError("Time columns are not supported yet")
                df = pd.concat([df, make_time_column(spec)], axis=1)

        if col_names is not None:
            df.columns = col_names

        return df

    
# Simulator producing 100-row columns (default seed).
a = Simulator(
    num_rows=100,
)

In [178]:
# Three 'age' categories drawn with explicit probabilities.
a.generate_categorical_column(
    num_categories=3,
    distribution=[0.3, 0.5, 0.2],
    name='age',
)

0     age_2
1     age_3
2     age_2
3     age_2
4     age_1
      ...  
95    age_2
96    age_2
97    age_2
98    age_1
99    age_1
Length: 100, dtype: object

In [179]:
# Four 'age' categories drawn with explicit probabilities.
a.generate_categorical_column(
    num_categories=4,
    distribution=[0.3, 0.5, 0.1, 0.1],
    name='age',
)

0     age_1
1     age_2
2     age_2
3     age_2
4     age_4
      ...  
95    age_2
96    age_2
97    age_3
98    age_3
99    age_2
Length: 100, dtype: object

In [196]:
# One [distribution_name, first_param, second_param] entry per numeric column.
numeric_columns = [
    ['normal', 0, 1],
    ['binomial', 1, 0.5],
    ['normal', 0, 0.5],
]
# One [num_categories, distribution(, name)] entry per categorical column.
categorical_columns = [
    [4, 'uniform', 'age'],
    [3, [0.3, 0.5, 0.2]],
]
df = a.generate_dataframe(
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
)

In [197]:
# Preview the first five rows of the simulated dataframe.
df.head(n=5)

Unnamed: 0,0,0.1,0.2,age,category_1
0,0.701757,0.0,1.144139,age_4,category_1_1
1,0.456962,1.0,0.154878,age_3,category_1_2
2,-0.126878,1.0,-1.016491,age_3,category_1_2
3,-0.253141,0.0,0.003366,age_1,category_1_3
4,-1.372811,0.0,-0.833669,age_3,category_1_2


In [188]:
# Give the five columns explicit names.
df.columns = [
    'num_1', 'num_2', 'num_3',
    'cat_1', 'cat_2',
]

In [189]:
# Display the dataframe's current column labels.
df.columns

Index(['num_1', 'num_2', 'num_3', 'cat_1', 'cat_2'], dtype='object')