In [None]:
import S

### Generating Synthetic Datasets
We can create synthetic datasets that contain purely numerical, purely categorical, or mixed data. We generate a dataset with $K$ clusters and exactly $n$ data points by sampling approximately $n/K$ points per cluster in $\mathbb{R}^K$ from normal distributions whose means are the $K$ canonical basis vectors of that space ($[1, 0, 0]$, $[0, 1, 0]$ and $[0, 0, 1]$ if $K = 3$, for example) and whose standard deviation $\sigma_{\text{noise}}$ is set in each experiment. To each of these (numerical) data points, we add $Q$ categorical variables, with $K$ possible categories each. The categories for each data point are chosen according to a value $p \in [0, 1]$ that controls how exclusively each category is attached to a single cluster. If $p = 0$, each category can only be found in one specific cluster. If $p > 0$, a category may appear in a cluster other than its attached cluster with probability $p$.

In [1]:
from examples.synthetic_dataset_generation import generate_mixed_dataset

# Generate a synthetic mixed-type dataset: 1000 samples, 3 numerical features,
# 2 categorical features, 3 clusters, and p=0.1 (the probability that a
# category shows up in a cluster other than the one it is attached to).
df = generate_mixed_dataset(
    n_samples=1000,
    n_numerical_features=3,
    n_categorical_features=2,
    n_clusters=3,
    p=0.1,
    save=False,
)
df

Unnamed: 0,num_feat_0,num_feat_1,num_feat_2,cat_feat_0,cat_feat_1,target
0,0.006731,0.511186,0.527471,0.0,1.0,0
1,0.242235,0.534272,0.229820,0.0,2.0,0
2,0.103889,0.306915,0.333870,0.0,0.0,0
3,-0.338709,0.133508,0.102437,0.0,0.0,0
4,0.100235,0.329353,0.125598,0.0,0.0,0
...,...,...,...,...,...,...
995,-0.258731,0.860525,1.011362,2.0,2.0,2
996,-0.279953,0.836799,1.475038,2.0,2.0,2
997,-0.162176,0.848644,0.708486,2.0,2.0,2
998,-0.108893,0.948853,0.824503,2.0,2.0,2


### Using SpecMix

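Our implementation of SpecMix builds on scikit-learn's estimator base classes, so it exposes the familiar `fit` method and trailing-underscore fitted attributes (such as `labels_`) and behaves similarly to other scikit-learn estimators.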

In [2]:
from SpecMix.specmix import SpecMix

# Initialize the SpecMix algorithm with 3 clusters (the same n_clusters that
# was used to generate the dataset) and a fixed random_state.
specmix = SpecMix(n_clusters=3, random_state=0)

# Fit the algorithm to the dataset.
# NOTE(review): df still contains the 'target' column at this point — confirm
# that SpecMix ignores it; otherwise the labels would leak into the clustering.
specmix.fit(df)

# Observe the adjacency matrix created by the algorithm (available on the
# fitted estimator as the `adj_matrix_` attribute).
specmix.adj_matrix_

array([[1.        , 0.37360046, 0.70477856, ..., 1.        , 0.        ,
        0.        ],
       [0.37360046, 1.        , 0.67091724, ..., 0.        , 1.        ,
        0.        ],
       [0.70477856, 0.67091724, 1.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [1]:
from examples.benchmark_algorithms import purity_score
# NOTE: the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'kmodes'", so the `kmodes` package
# apparently has to be installed before this import works — confirm.
# This cell also relies on `specmix` (already fitted) and `df` from the
# previous cells, so those must have been run first.
# Calculate the purity score of the algorithm: compare the cluster labels
# stored on the fitted SpecMix model with the dataset's `target` column.
predicted_labels = specmix.labels_
target_labels = df['target'].tolist()
purity_score(target_labels, predicted_labels) 

ModuleNotFoundError: No module named 'kmodes'