In [1]:
import gudhi                 as gd
import pandas                as pd
import matplotlib.pyplot     as plt
import numpy                 as np
import gudhi.representations as gdrep
from sklearn.preprocessing   import MinMaxScaler
from sklearn.pipeline        import Pipeline
from sklearn.svm             import SVC
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.ensemble        import RandomForestClassifier

plt.rcParams['font.size'] = 16
plt.rcParams['font.family'] = 'serif'

strong_password_data_frame = pd.read_csv('data/DSL-StrongPasswordData.csv',
                                   # declare type of 'subject' column
                                   dtype = {'subject' : 'string'},
                                   index_col = ['subject', 'sessionIndex', 'rep'])

In [2]:
def subjects_in_range(start, stop):
    """Returns a list of labels for subjects in the subject column.

    :param start: integer between 2 and 57, inclusive
    :param stop: integer between 2 and 57, inclusive. Should be greater than or
                 equal to start.
    :returns: list of zero-padded subject labels beginning with s{start} to s{stop}
    """
    return [f's{i:03}' for i in range(start, 1 + stop) if i not in [6, 9, 14, 23, 45]]

people = [strong_password_data_frame.loc[subject] for subject in subjects_in_range(2,57)]

In [3]:
dimension = 1
last_person = 20
max_edge_length = 3.0
max_dimension = 3

In [4]:
train_diagrams_for_person = []

train_labels = []
label_idx = 0

for person, name in zip(people, subjects_in_range(2,last_person)):
    diagrams = []
    for n in range(3):
        train_labels.append(label_idx)
        points = person.loc[2*n + 1 : 2*(n+1)]
        simplicial_complex = gd.RipsComplex(points = points.to_numpy(),
                                            max_edge_length=max_edge_length)
        simplex_tree = simplicial_complex.create_simplex_tree(max_dimension = max_dimension)
        diagram = simplex_tree.persistence()
        diagrams.append(simplex_tree.persistence_intervals_in_dimension(dimension))
        
    train_diagrams_for_person.append(diagrams)
    label_idx = label_idx + 1
    print(f'Training diagrams for {name} complete.')

Training diagrams for s002 complete.
Training diagrams for s003 complete.
Training diagrams for s004 complete.
Training diagrams for s005 complete.
Training diagrams for s007 complete.
Training diagrams for s008 complete.
Training diagrams for s010 complete.
Training diagrams for s011 complete.
Training diagrams for s012 complete.
Training diagrams for s013 complete.
Training diagrams for s015 complete.
Training diagrams for s016 complete.
Training diagrams for s017 complete.
Training diagrams for s018 complete.
Training diagrams for s019 complete.
Training diagrams for s020 complete.


In [5]:
test_diagrams_for_person = []
test_labels = []
label_idx = 0

for person, name in zip(people, subjects_in_range(2,last_person)):
    diagrams = []
    test_labels.append(label_idx)
    points = person.loc[7:8] # get ith session
    simplicial_complex = gd.RipsComplex(points = points.to_numpy(),
                                        max_edge_length=max_edge_length)
    simplex_tree = simplicial_complex.create_simplex_tree(max_dimension = max_dimension)
    diagram = simplex_tree.persistence()
    diagrams.append(simplex_tree.persistence_intervals_in_dimension(dimension))
    
    test_diagrams_for_person.append(diagrams)
    label_idx = label_idx + 1
    print(f'Test diagrams for {name} complete.')

Test diagrams for s002 complete.
Test diagrams for s003 complete.
Test diagrams for s004 complete.
Test diagrams for s005 complete.
Test diagrams for s007 complete.
Test diagrams for s008 complete.
Test diagrams for s010 complete.
Test diagrams for s011 complete.
Test diagrams for s012 complete.
Test diagrams for s013 complete.
Test diagrams for s015 complete.
Test diagrams for s016 complete.
Test diagrams for s017 complete.
Test diagrams for s018 complete.
Test diagrams for s019 complete.
Test diagrams for s020 complete.


In [6]:
training_data = np.array(train_diagrams_for_person).flatten()
test_data = np.array(test_diagrams_for_person).flatten()

In [7]:
pipe = Pipeline([("Separator", gd.representations.DiagramSelector(limit=np.inf, point_type="finite")),
                 ("Scaler",    gd.representations.DiagramScaler(scalers=[([0,1], MinMaxScaler())])),
                 ("TDA",       gd.representations.PersistenceImage()),
                 ("Estimator", SVC())])

In [44]:
param =    [
            {"Scaler__use":         [True],
             "TDA":                 [gd.representations.Landscape()], 
             "TDA__resolution":     [100],
             "Estimator":           [RandomForestClassifier()]},
           
            {"Scaler__use":         [False],
             "TDA":                 [gd.representations.BottleneckDistance()], 
             "TDA__epsilon":        [0.001], 
             "Estimator":           [KNeighborsClassifier(n_neighbors=4, metric="precomputed")]}
           ]

In [45]:
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(pipe, param, cv=3)

In [46]:
model = model.fit(training_data, train_labels)

In [47]:
model.best_params_

{'Estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'Scaler__use': True,
 'TDA': Landscape(num_landscapes=5, resolution=100, sample_range=[nan, nan]),
 'TDA__resolution': 100}

In [48]:
model.score(training_data, train_labels)

1.0

In [13]:
model.score(test_data, test_labels)

0.125