In [1]:
import numpy as np
import time
import math
from sklearn.datasets import make_spd_matrix
from scipy.stats import multivariate_normal 
from sklearn.datasets import make_blobs

In [2]:
def generate_gaussian_dataset_concept_drift(nattr, ndist, nobjects):
    """
    Generate a multivariate Gaussian dataset that simulates concept drift.

    The result is the row-wise concatenation of ``ndist`` segments. Each
    segment holds ``nobjects`` samples drawn from its own Gaussian whose mean
    is sampled with ``np.random.rand`` (uniform on [0, 1) per attribute) and
    whose covariance comes from ``make_spd_matrix``.

    Parameters
    ----------
    nattr: int
        number of attributes (columns)
    ndist: int
        number of different data distributions to generate
    nobjects: int
        number of objects (rows) drawn from each data distribution

    Returns
    -------
    X : numpy array of shape (ndist * nobjects, nattr)
        Segments stacked in the order they were generated. When no segment is
        generated (``ndist`` <= 0) an empty array of shape (0, nattr) is
        returned instead of ``None``.

    Notes
    -----
    ``np.random.rand`` and ``np.random.multivariate_normal`` draw from NumPy's
    global random state and ``make_spd_matrix`` is called without a
    ``random_state``, so call ``np.random.seed(...)`` beforehand if the output
    must be reproducible.
    """
    segments = []
    for _ in range(ndist):
        mean = np.random.rand(nattr)
        # The covariance must be symmetric positive-definite to be valid.
        Sigma = make_spd_matrix(n_dim=nattr)
        segments.append(np.random.multivariate_normal(mean, Sigma, nobjects))

    if not segments:
        # Nothing was drawn: keep the documented return type and column count.
        return np.empty((0, nattr))

    # Stack once at the end rather than re-copying the growing array per loop.
    return np.vstack(segments)

In [3]:
# Five Gaussian segments of 50 three-attribute samples each, stacked row-wise
# (5 * 50 = 250 rows, 3 columns).
X = generate_gaussian_dataset_concept_drift(
    nattr=3,
    ndist=5,
    nobjects=50,
)

In [4]:
def generate_data_of_multiple_cluster(centers = [(-5, -5), (5, 5)],
                                     cluster_std = [0.8, 1],
                                     nfeatures = 2,
                                     nsamples = 100,
                                     random_state = 1):
    """
    Generate isotropic Gaussian blobs for clustering via ``make_blobs``.

    Parameters
    ----------
    centers: int or sequence of coordinate tuples
        cluster centers (or the number of centers), forwarded to ``make_blobs``
    cluster_std: float or sequence of floats
        standard deviation of the clusters, forwarded to ``make_blobs``
    nfeatures: int
        number of features per sample, forwarded as ``n_features``.
        NOTE(review): ``make_blobs`` appears to take the dimensionality from
        ``centers`` when explicit coordinates are given, so this matters only
        when ``centers`` is an int -- confirm against the scikit-learn docs.
    nsamples: int
        total number of samples to generate, forwarded as ``n_samples``
    random_state: int, RandomState instance or None
        seed forwarded to ``make_blobs``; the default of 1 keeps the output
        identical to the previously hard-coded seed

    Returns
    -------
    X : numpy array holding the generated samples
    y : numpy array holding the cluster label of each sample
    """
    # Forward the caller's values; these were previously hard-coded to 100 and
    # 2, so the nsamples and nfeatures arguments had no effect.
    X, y = make_blobs(n_samples=nsamples,
                      cluster_std=cluster_std,
                      centers=centers,
                      n_features=nfeatures,
                      random_state=random_state)
    return X, y