# Importing

In [70]:
# Data wrangling
import numpy as np
import pandas as pd  # Not a requirement of giotto-tda, but is compatible with the gtda.mapper module
import os
import time
import itertools
import open3d as o3d

# Data viz
from gtda.plotting import plot_point_cloud
from gtda.plotting import plot_diagram

# TDA magic
from gtda.homology import VietorisRipsPersistence
from gtda.mapper import (
    CubicalCover,
    make_mapper_pipeline,
    Projection,
    plot_static_mapper_graph,
    plot_interactive_mapper_graph,
    MapperInteractivePlotter
)

# ML tools
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm
from src.feature_vectors import create_feature_vector

from sklearn.model_selection import cross_val_score

# Prepare cloud points

In [92]:
def get_ply_files(folder):
    files = list(filter(lambda file: file.split('.')[-1]=='ply', os.listdir(folder)))
    files = list(map(lambda file: os.path.join(folder, file),files))
    return files

ply_files  = get_ply_files('data/tablesPly')
ply_files += get_ply_files('data/chairsPly')
ply_files += get_ply_files('data/octopusPly')
ply_files += get_ply_files('data/spidersPly')

# if (k == None) label each group differently 
# else label files in group k with True and others with False
def label_groups(files, k):
    group_sizes = [len([f for f in os.listdir('data/'+file) if f[-3:]=='ply']) for file in files]
    group_sizes = [sum(group_sizes[:k]), group_sizes[k], sum(group_sizes[k+1:])]
    # print(group_sizes)
    labels = np.zeros(sum(group_sizes))
    labels[group_sizes[0]:group_sizes[0]+group_sizes[1]] = 1
        
    return labels
    
files = ['tablesPly','chairsPly', 'octopusPly', 'spidersPly']
labels = label_groups(files, 3)
# print(labels)
print("Majority classifier accuracy: %.3f" % (1 - sum(labels)/len(labels)))

pcd = [o3d.io.read_point_cloud(file) for file in ply_files]
pcd = [np.asarray(pc.points) for pc in pcd]

Majority classifier accuracy: 0.693


In [93]:
# shuffle data for more accurate results
shuffle_index = np.random.permutation(np.arange(0, len(labels)))
labels = np.array(labels)
pcd = np.array(pcd, dtype=object)
labels = labels[shuffle_index]
pcd = pcd[shuffle_index]

## Persistance and pipe

In [94]:
# Track connected components, loops, and voids
homology_dimensions = [0, 1, 2]

# Collapse edges to speed up H2 persistence calculation!
persistence = VietorisRipsPersistence(
    metric="euclidean",
    homology_dimensions=homology_dimensions,
    n_jobs=6,
    collapse_edges=True,
)

#filter_func = Projection(columns=[0,1,2])
filter_func = PCA(n_components=2)

cover = CubicalCover(n_intervals=4, overlap_frac=0.08)
#cover = OneDimensionalCover(kind='uniform', n_intervals=10, overlap_frac=0.1)

clusterer = DBSCAN(eps=10, metric="chebyshev")

n_jobs = 1

pipe = make_mapper_pipeline(
    filter_func=filter_func,
    cover=cover,
    clusterer=clusterer,
    verbose=False,
    n_jobs=n_jobs,
)

# Feature vector creation

In [95]:
entropy_feature_vectors = []
feature_vectors = []
start = time.time()
for i, pc in enumerate(pcd):
    print('\r', f"{int((i/len(pcd))*100)}%", end="")
    e_fv, fv = create_feature_vector(pc, pipe, persistence)

    entropy_feature_vectors.append(e_fv)
    feature_vectors.append(fv)
end = time.time()
print("Time to compute create feature vectors:", end - start, "s")

 99%Time to compute create feature vectors: 91.40781569480896 s


In [96]:
len(pcd), feature_vectors

(101,
 [[0, 4.117647058823529, 0.25735294117647056, 0.5154061624649859],
  [5, 3.75, 0.25, 0.4098214285714285],
  [5, 3.6842105263157894, 0.2046783625730994, 0.44010025062656644],
  [8, 3.5454545454545454, 0.16883116883116883, 0.39123376623376627],
  [6, 3.5555555555555554, 0.2091503267973856, 0.47619047619047616],
  [0, 5.25, 0.35, 0.657142857142857],
  [0, 5.0, 0.2631578947368421, 0.689047619047619],
  [7, 3.238095238095238, 0.1619047619047619, 0.45986394557823135],
  [5, 3.111111111111111, 0.18300653594771243, 0.2456349206349206],
  [4, 3.125, 0.20833333333333334, 0.25],
  [0, 5.25, 0.35, 0.657142857142857],
  [9, 3.0434782608695654, 0.1383399209486166, 0.34006211180124224],
  [3, 3.5714285714285716, 0.27472527472527475, 0.6096938775510203],
  [4, 3.125, 0.20833333333333334, 0.25],
  [6, 3.619047619047619, 0.18095238095238095, 0.47063492063492063],
  [5, 3.5, 0.18421052631578946, 0.4025],
  [2, 3.125, 0.20833333333333334, 0.25],
  [12, 2.5, 0.13157894736842105, 0.35],
  [2, 4.0, 0.2

## With homologies

In [97]:
num_features = len(feature_vectors[0])
clf = svm.SVC(kernel='linear', C=1, random_state=42)
best_scores = []
# We take one homology and up to three other features

for homology_idx in range(3):
    final_fvs = []
    
    # First add homology and nothing else
    for entropy_fv in entropy_feature_vectors:
        final_fvs.append(entropy_fv[homology_idx])

    # TODO add train and test
    scores = cross_val_score(clf, final_fvs, labels, cv=10)
    best_scores.append((scores.mean(), "Homology"+str(homology_idx+1)))
    print("%0.2f accuracy with a standard deviation of %0.2f  %s" % (scores.mean(), scores.std(), "Homology"+str(homology_idx+1)))

    for number_of_additional_features in range(1,4):
        combinations = list(itertools.combinations(range(num_features), number_of_additional_features))

        # Loop through all posible feature subsets of size
        for combination in combinations:
            # print(combination)
            final_fvs = []
            # First add homology and a certain number of features
            for fv_idx, entropy_fv in enumerate(entropy_feature_vectors):
                extracted_fv = [x for x in entropy_fv[homology_idx]]

                extracted_fv += [feature_vectors[fv_idx][i] for i in combination]

                final_fvs.append(extracted_fv)

            # TODO add train and test
            scores = cross_val_score(clf, final_fvs, labels, cv=10)
            best_scores.append((scores.mean(), str(combination) + " " + str(homology_idx+1)))
            print("%0.2f accuracy with a standard deviation of %0.2f  %s" % (scores.mean(), scores.std(), "Combination of features "+str(combination)))

            # print(final_fvs)
            
best_scores.sort(reverse=True)
print("\nAverage score: %0.4f" % (sum([x for x, s in best_scores]) / len(best_scores)))
print("Best scores:", best_scores[:3])

0.69 accuracy with a standard deviation of 0.02  Homology1
0.69 accuracy with a standard deviation of 0.02  Combination of features (0,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (2,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (3,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 1)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (2, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 1, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of fe

In [98]:
X_train, X_test, y_train, y_test = train_test_split(final_fvs, labels, test_size=0.33, random_state=42)

SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
SVM.predict(X_test)
round(SVM.score(X_test,y_test), 4)

0.7353

## Without homologies

In [99]:
for number_of_additional_features in range(1,6):
    combinations = list(itertools.combinations(range(num_features), number_of_additional_features))

    # Loop through all posible feature subsets of size
    for combination in combinations:
        # print(combination)
        final_fvs = []
        # First add homology and a certain number of features
        for fv_idx, fv in enumerate(feature_vectors):
            extracted_fv = [fv[i] for i in combination]

            final_fvs.append(extracted_fv)

        # TODO add train and test
        scores = cross_val_score(clf, final_fvs, labels, cv=10)
        best_scores.append((scores.mean(), str(combination) + " " + str(homology_idx)))
        print("%0.2f accuracy with a standard deviation of %0.2f  %s" % (scores.mean(), scores.std(), "Combination of features "+str(combination)))

        # print(final_fvs)


0.69 accuracy with a standard deviation of 0.02  Combination of features (0,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (2,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (3,)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 1)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of features (1, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (2, 3)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 1, 2)
0.69 accuracy with a standard deviation of 0.02  Combination of features (0, 1, 3)
0.69 accuracy with a standard deviation of

In [None]:
X_train, X_test, y_train, y_test = train_test_split(final_fvs, labels, test_size=0.33, random_state=42)

SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
SVM.predict(X_test)
round(SVM.score(X_test,y_test), 4)