# Synthetic Data Experiments with Few Features and Complex Label Relationship

This dataset has relatively few features compared to the total number of datapoints. Some features are highly correlated, while others are not. We also have some categorical features, of different sizes.

In [1]:
import numpy as np

In [77]:
def feature_gen(n, dim, cov_min, cov_max, mean_min, mean_max, num_categories):
    """
    Generate feature data, which has one categorical feature and the rest generated as random Gaussian
    according to a random covariance matrix.
    """

    # First, generating the Gaussian data features

    # Cholesky decomposition of random matrix to get a valid covariance matrix
    M = np.random.uniform(cov_min, cov_max, size=(dim, dim))
    cov = M.T @ M
    mean = np.random.uniform(mean_min, mean_max, size=dim)
    gauss_data = np.random.multivariate_normal(mean, cov, size=n)

    # Next, generate categorical data
    cat_data = np.random.choice(np.arange(num_categories), size=n)

    # Combine
    data = np.hstack((gauss_data, cat_data[:, np.newaxis]))
    
    return data

def independent_label_gen(m, feature_data, cat_index, coeff_min, coeff_max, noise_scale):
    """
    Come up with generating function for each of the dimensions of the label.
    """
    n = feature_data.shape[1]
    labels = np.zeros(shape=(n,m))

    for i in range(m):
        
        coefficients = np.random.uniform(coeff_min, coeff_max, size=n + 1)
        # need to use multidimensional polynomial blarg
        # for j in range(n+1):
            
        #     labels[:,i] = np.polyval(coefficients[::-1], feature_data[:,i])

        # # Add some random noise to the labels
        # noise = np.random.normal(loc=0, scale=noise_scale, size=n)
        # labels[:,i] += noise

    return labels




In [78]:
xs = feature_gen(10, 5, -3, 3, -5, 5, 3)
ys = independent_label_gen(4, xs, -1, -10, 10, 0.01)

ValueError: could not broadcast input array from shape (10,) into shape (6,)

In [125]:
# Example usage
n_terms = 3  # Number of terms in the polynomial
n_features = 2  # Number of features
n_samples = 5  # Number of samples

mean = (1, 2)
cov = [[1, 0], [0, 1]]
data = np.random.multivariate_normal(mean, cov, size=n_samples)

coefficients = np.random.randn(n_terms)
powers = np.random.randint(0, 4, size=(n_terms, n_features)) 

for i in range(len(data)):
    for j in range(len(coefficients)):
        coefficients[j] * data[0]


terms = coefficients[:, np.newaxis] * (data ** powers)
label = np.sum(terms, axis=0)



ValueError: operands could not be broadcast together with shapes (5,2) (3,2) 

In [135]:
data[0]

array([-0.52401524,  1.20156095])

In [136]:
powers[0]

array([0, 2])

In [138]:
coefficients[0] * data[0]**powers[0]

array([0.60756686, 0.87717388])

In [137]:
coefficients

array([ 0.60756686, -1.51917998, -0.00533647])

In [132]:
powers.shape

(3, 2)