In [None]:
# Importing libraries and setting up output directory
import numpy as np
import pandas as pd
import os

# Directory that receives every generated CSV.
# The original machine-specific location is kept as the default so existing
# behaviour is unchanged; set the GB_TEST_DATA_DIR environment variable to
# write the datasets somewhere else (e.g. on another machine or in CI).
output_dir = os.environ.get(
    "GB_TEST_DATA_DIR",
    r"D:\Projects\assignments\ml-584\Project2-main\gradientboosting\test_data",
)
# exist_ok=True makes this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)




In [2]:
# 1. Generating linearly separable binary dataset (25×3)
# Two unit-variance Gaussian blobs: class 0 centred at (0,0,0) with 12 rows,
# class 1 centred at (5,5,5) with 13 rows.
np.random.seed(0)  # fixed seed so the CSV is reproducible
n1, n2 = 12, 13
X1 = np.random.normal(loc=[0,0,0], scale=1, size=(n1,3))
X2 = np.random.normal(loc=[5,5,5], scale=1, size=(n2,3))
X = np.vstack([X1, X2])
y = np.array([0]*n1 + [1]*n2)
# Build the frame from the features only and attach the labels afterwards.
# np.column_stack([X, y]) would upcast the integer labels to float, so the
# CSV would store 0.0/1.0 instead of 0/1.
df = pd.DataFrame(X, columns=[f"f{i}" for i in range(1,4)])
df["label"] = y
assert df.shape == (n1 + n2, 4)
df.to_csv(os.path.join(output_dir, "binary_linear.csv"), index=False)



In [3]:
# 2. Generating XOR-style non-linear binary dataset (30×4)
np.random.seed(1)  # fixed seed so the CSV is reproducible
n = 30
X = np.random.randn(n,4)
# Label is 1 when exactly one of the first two features is positive (XOR of
# their signs); the remaining two columns do not enter the label.
y = ((X[:,0]>0) ^ (X[:,1]>0)).astype(int)
# Attach the labels as their own integer column; stacking them onto the float
# feature matrix would upcast them and write 0.0/1.0 to the CSV.
df = pd.DataFrame(X, columns=list("ABCD"))
df["label"] = y
assert df.shape == (n, 5)
df.to_csv(os.path.join(output_dir, "binary_xor.csv"), index=False)



In [4]:
# 3. Generating balanced multiclass clusters (40×5, 3 classes)
np.random.seed(2)  # fixed seed so the CSV is reproducible
sizes = [13,14,13]
data, labels = [], []
# One unit-variance Gaussian blob per class, centred at 0, 5 and 10 on every
# feature; the class index k doubles as the label.
for k, mu in enumerate([[0]*5, [5]*5, [10]*5]):
    Xk = np.random.normal(loc=mu, scale=1, size=(sizes[k],5))
    data.append(Xk)
    labels += [k]*sizes[k]
X = np.vstack(data)
y = np.array(labels)
cols = [f"f{i}" for i in range(1,6)]
# Attach the labels as their own integer column; stacking them onto the float
# feature matrix would upcast them and write 0.0/1.0/2.0 to the CSV.
df = pd.DataFrame(X, columns=cols)
df["label"] = y
assert df.shape == (sum(sizes), 6)
df.to_csv(os.path.join(output_dir, "multiclass_clusters.csv"), index=False)



In [5]:
# 4. Generating imbalanced binary dataset (50×6)
np.random.seed(3)  # fixed seed so the CSV is reproducible
n = 50
X = np.random.randn(n,6)
# 10 positives vs 40 negatives. Labels are assigned purely by row position
# (first 10 rows), so they are independent of the feature values.
y = np.zeros(n, dtype=int)
y[:10] = 1
cols = [f"f{i}" for i in range(1,7)]
# Attach the labels as their own integer column; stacking them onto the float
# feature matrix would upcast them and write 0.0/1.0 to the CSV.
df = pd.DataFrame(X, columns=cols)
df["label"] = y
assert df.shape == (n, 7)
df.to_csv(os.path.join(output_dir, "binary_imbalanced.csv"), index=False)



In [6]:
# 5. Generating uneven multiclass high-dimensional dataset (50×10, 5 classes)
np.random.seed(4)  # fixed seed so the CSV is reproducible
counts = [5,10,15,10,10]
data, labels = [], []
# Class k is standard-normal noise shifted by +k on every feature, so
# neighbouring classes are only one standard deviation apart.
for k in range(5):
    Xk = np.random.randn(counts[k],10) + k
    data.append(Xk)
    labels += [k]*counts[k]
X = np.vstack(data)
y = np.array(labels)
cols = [f"f{i}" for i in range(1,11)]
# Attach the labels as their own integer column; stacking them onto the float
# feature matrix would upcast them and write 0.0 ... 4.0 to the CSV.
df = pd.DataFrame(X, columns=cols)
df["label"] = y
assert df.shape == (sum(counts), 11)
df.to_csv(os.path.join(output_dir, "multiclass_highdim.csv"), index=False)