In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import Binarizer

In [2]:
np.random.seed(42)

In [3]:
def bernoulli_generation(instances, dimensions, n_classes=2, density=0.05, density_sd=0.02, target=None):
    """Creates a binary DataFrame using independent Bernoulli distribution.
    Each class uses its own distribution for each attribute.
    Uses a normal distribution to find the distribution for each value `p`.
    
    :param dimensions: int - number of features
    :param instances: int - number of instances
    :param n_classes: int - number of classes, default is 2
    :param density: float - approximate ratio of 1s in the data, default is 0.05
    :param density_sd: float - standard deviation of the density, default is 0.02
    
    :returns: a pandas DataFrame with columns [0, dimensions) and a column target
    """
    
    if target is None:
        target = np.random.choice(n_classes, size=instances, replace=True)
    random_matrix = np.random.rand(instances, dimensions)
    
    df = pd.DataFrame(random_matrix).assign(target=target)
    
    for cls in range(n_classes):
        for dim in range(dimensions):
            binner = Binarizer(1.0 - (np.random.randn() * density_sd + density))
            df.loc[df.target == cls, dim] = binner.transform(df.loc[df.target == cls, dim].values.reshape(-1, 1)).reshape(-1, 1)[0]
    
    return df

print(bernoulli_generation(20, 5, 3, 0.5))

%time _ = bernoulli_generation(100000, 1000, 3)

      0    1    2    3    4  target
0   0.0  0.0  0.0  1.0  0.0       2
1   0.0  1.0  0.0  0.0  0.0       0
2   0.0  0.0  0.0  1.0  0.0       2
3   0.0  0.0  0.0  1.0  0.0       2
4   0.0  1.0  0.0  0.0  0.0       0
5   0.0  1.0  0.0  0.0  0.0       0
6   0.0  0.0  0.0  1.0  0.0       2
7   0.0  1.0  1.0  1.0  1.0       1
8   0.0  0.0  0.0  1.0  0.0       2
9   0.0  0.0  0.0  1.0  0.0       2
10  0.0  0.0  0.0  1.0  0.0       2
11  0.0  0.0  0.0  1.0  0.0       2
12  0.0  1.0  0.0  0.0  0.0       0
13  0.0  0.0  0.0  1.0  0.0       2
14  0.0  1.0  1.0  1.0  1.0       1
15  0.0  1.0  0.0  0.0  0.0       0
16  0.0  1.0  1.0  1.0  1.0       1
17  0.0  1.0  1.0  1.0  1.0       1
18  0.0  1.0  1.0  1.0  1.0       1
19  0.0  1.0  1.0  1.0  1.0       1
CPU times: user 21.3 s, sys: 216 ms, total: 21.5 s
Wall time: 21.5 s


In [4]:
def binomial_generation(instances, dimensions, n_classes=2,
                        p_mean=0.2, p_sd=0.05,
                        n_min=5, n_max=20, target=None):
    """Creates an integer DataFrame using a binomial distribution.
    Each class has its own distribution for each feature.
    Uses a normal distribution to find the values of `p`,
    and a uniform distribution for the values of `n`.
    """
    
    if target is None:
        target = np.random.choice(n_classes, size=instances, replace=True)
    random_matrix = np.zeros((instances, dimensions))
    
    df = pd.DataFrame(random_matrix).assign(target=target)
    
    for cls in range(n_classes):
        for dim in range(dimensions):
            p = np.random.randn() * p_sd + p_mean
            n = np.random.randint(n_min, n_max + 1)
            size = df.loc[df.target == cls, dim].shape
            df.loc[df.target == cls, dim] = np.random.binomial(n, p, size)
            
    return df

print(binomial_generation(20, 5, 3))
%time _ = binomial_generation(100000, 1000, 3)

      0    1    2    3    4  target
0   3.0  1.0  2.0  2.0  3.0       0
1   6.0  1.0  5.0  2.0  1.0       1
2   3.0  4.0  2.0  1.0  1.0       1
3   0.0  2.0  2.0  3.0  2.0       1
4   3.0  1.0  3.0  6.0  3.0       0
5   3.0  0.0  2.0  0.0  4.0       0
6   1.0  5.0  2.0  5.0  3.0       0
7   6.0  2.0  2.0  0.0  4.0       2
8   5.0  2.0  2.0  3.0  6.0       2
9   1.0  2.0  2.0  1.0  1.0       1
10  4.0  1.0  1.0  5.0  3.0       2
11  4.0  1.0  0.0  3.0  6.0       2
12  3.0  2.0  5.0  2.0  0.0       1
13  1.0  2.0  3.0  1.0  4.0       1
14  4.0  1.0  3.0  2.0  7.0       2
15  3.0  1.0  1.0  3.0  5.0       2
16  1.0  1.0  2.0  0.0  2.0       1
17  2.0  4.0  3.0  2.0  3.0       0
18  4.0  0.0  5.0  0.0  4.0       2
19  2.0  2.0  3.0  3.0  3.0       2
CPU times: user 23.2 s, sys: 76 ms, total: 23.2 s
Wall time: 23.2 s


In [5]:
def sparse_binomial_generation(instances, dimensions, n_classes=2,
                               density=0.05, density_sd=0.02,
                               p_mean=0.2, p_sd=0.05,
                               n_min=5, n_max=20):
    
    base = bernoulli_generation(instances, dimensions, n_classes, density, density_sd)
    weights = binomial_generation(instances, dimensions, n_classes, p_mean, p_sd, n_min, n_max, target=base.target)
    final_matrix = base.drop('target', axis=1).as_matrix() * weights.drop('target', axis=1).as_matrix()
    return pd.DataFrame(final_matrix).assign(target=base.target)

sparse_binomial_generation(20, 5, 3, density=0.3, p_mean=0.5)

Unnamed: 0,0,1,2,3,4,target
0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,6.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,7.0,0
5,0.0,0.0,0.0,0.0,9.0,0
6,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,9.0,0.0,2
8,0.0,0.0,0.0,7.0,0.0,2
9,0.0,0.0,0.0,10.0,0.0,2


In [6]:
datafolder = os.path.expanduser('~/Documents/datasets/synth/setexpansion')

bernoulli_generation(10000, 200, 50, density=0.3).to_csv(os.path.join(datafolder, 'densebinary.csv'))
bernoulli_generation(10000, 200, 50, density=0.05).to_csv(os.path.join(datafolder, 'sparsebinary.csv'))
sparse_binomial_generation(10000, 200, 30, density=0.3, p_mean=0.4).to_csv(os.path.join(datafolder, 'denseinteger.csv'))
sparse_binomial_generation(10000, 200, 30, density=0.05, p_mean=0.5).to_csv(os.path.join(datafolder, 'sparseinteger.csv'))

In [7]:
bernoulli_generation(100000, 300, 200, density=0.05).to_csv(os.path.join(datafolder, 'hugebinary.csv'))

In [8]:
sparse_binomial_generation(100000, 300, 100, density=0.05, p_mean=0.5).to_csv(os.path.join(datafolder, 'hugeinteger.csv'))