In [1]:
from openml.datasets.functions import get_dataset

df = get_dataset('shapes').get_data(dataset_format='dataframe')[0]
df.head()

Unnamed: 0,x,y,z,target
0,0.341007,0.318606,0.096725,human_arms_out9
1,0.329226,0.421601,0.056749,human_arms_out9
2,0.446869,0.648674,0.12409,human_arms_out9
3,0.314729,0.21786,0.070847,human_arms_out9
4,0.426678,0.919195,0.047609,human_arms_out9


In [2]:
df.shape


(16000, 4)

In [3]:
from gtda.plotting import plot_point_cloud
plot_point_cloud(df.query('target == "human_arms_out9"')[["x", "y", "z"]].values)

In [4]:
df.tail()

Unnamed: 0,x,y,z,target
15995,0.483704,0.504718,0.099081,biplane8
15996,0.268092,0.688763,0.174801,biplane8
15997,0.315984,0.280097,0.056011,biplane8
15998,0.242428,0.913443,0.054996,biplane8
15999,0.218825,0.828568,0.067281,biplane8


In [5]:
df['target'].unique()

['human_arms_out9', 'human_arms_out8', 'human_arms_out5', 'human_arms_out4', 'human_arms_out6', ..., 'biplane4', 'biplane6', 'biplane7', 'biplane9', 'biplane8']
Length: 40
Categories (40, object): ['human_arms_out9' < 'human_arms_out8' < 'human_arms_out5' < 'human_arms_out4' ... 'biplane6' < 'biplane7' < 'biplane9' < 'biplane8']

In [6]:
import numpy as np

point_clouds = np.asarray(
    [
        df.query("target == @shape")[["x", "y", "z"]].values
        for shape in df["target"].unique()
    ]
)
point_clouds.shape

(40, 400, 3)

In [7]:
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy


In [8]:
homology_dimensions = [0, 1, 2]


persistence = VietorisRipsPersistence(
    metric="euclidean",
    homology_dimensions=homology_dimensions,
    n_jobs=6,
    collapse_edges=True,
)
persistence_diagrams = persistence.fit_transform(point_clouds)

In [9]:
persistence_entropy = PersistenceEntropy(normalize=True)
# Calculate topological feature matrix
X = persistence_entropy.fit_transform(persistence_diagrams)
# Visualise feature matrix
plot_point_cloud(X)

In [10]:
from gtda.diagrams import NumberOfPoints

# Reshape single diagram to (n_samples, n_features, 3) format
diagram = persistence_diagrams[0][None, :, :]
# Get number of points for (H0, H1, H2)
NumberOfPoints().fit_transform(diagram)

array([[399,  87,  10]])

In [11]:
from gtda.diagrams import Amplitude

Amplitude(metric='wasserstein').fit_transform(diagram)

array([[0.22923933, 0.06502641, 0.02047913]])

In [12]:
from sklearn.pipeline import make_union

# Select a variety of metrics to calculate amplitudes
metrics = [
    {"metric": metric}
    for metric in ["bottleneck", "wasserstein", "landscape", "persistence_image"]
]

# Concatenate to generate 3 + 3 + (4 x 3) = 18 topological features
feature_union = make_union(
    PersistenceEntropy(normalize=True),
    NumberOfPoints(n_jobs=-1),
    *[Amplitude(**metric, n_jobs=-1) for metric in metrics]
)

In [13]:
from gtda.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


In [14]:
labels = np.zeros(40)
labels[10:20] = 1
labels[20:30] = 2
labels[30:] = 3

In [15]:
pipe = Pipeline(
    [
        ("features", feature_union),
        ("rf", RandomForestClassifier(oob_score=True, random_state=42)),
    ]
)
pipe.fit(persistence_diagrams, labels)
pipe["rf"].oob_score_

0.825

In [16]:
X = feature_union.fit_transform(persistence_diagrams)

In [17]:
X.shape

(40, 18)

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, labels, shuffle=True)

In [19]:
x_train.shape

(30, 18)

In [20]:
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.8

In [21]:
y_train

array([1., 3., 2., 3., 0., 2., 0., 0., 1., 3., 1., 3., 2., 2., 2., 0., 0.,
       0., 3., 1., 2., 1., 3., 1., 1., 1., 0., 0., 2., 3.])

In [23]:
from sklearn.preprocessing import OneHotEncoder

one = OneHotEncoder()
y_train_one = one.fit_transform(y_train.reshape(-1,1))
y_test_one = one.fit_transform(y_test.reshape(-1,1))

In [24]:
y_train_one

<30x4 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [25]:
y_train_one.toarray()

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [26]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=1000, solver='adam', hidden_layer_sizes=(5000,1000,), verbose=True)
mlp.fit(x_train, y_train_one.toarray())
mlp.score(x_test, y_test_one.toarray())

0.6

In [32]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train, y_train_one.toarray())
knn.score(x_test, y_test_one.toarray())

0.6

In [38]:
from sklearn.metrics import confusion_matrix

confusion_matrix(np.argmax(y_test_one.toarray(), axis=1), np.argmax(mlp.predict(x_test), axis=1))

array([[1, 0, 0, 1],
       [0, 2, 0, 0],
       [0, 0, 2, 1],
       [2, 0, 0, 1]])

In [40]:
x_train.shape

(30, 18)

In [41]:
x_train[0]

array([ 2.11685355e+00,  5.46367614e+00, -5.83199766e-01,  3.99000000e+02,
        1.26000000e+02,  1.10000000e+01,  4.99348789e-02,  7.82939866e-02,
        5.07909879e-02,  4.41863947e-01,  1.55340872e-01,  5.22685245e-02,
        9.11144943e-03,  1.89019567e-02,  9.40826162e-03,  6.44797627e+02,
        2.40989554e+02,  1.98350025e+01])