In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from umap import UMAP
from sklearn import datasets
import kmapper as km

## This notebook produces Mapper visualizations of the data. The first part visualizes only the
## true cases in the dataset; the second part visualizes the full dataset.

In [29]:
# Load the non-control subset of the test data.
data = pd.read_csv('./data/Test_Data_1102_non_control.csv')
# Case/control label of each row; used later as the per-sample tooltip.
y = data['CaseCtrl']
# Keep every column (including CaseCtrl) in X: later cells pick lenses by
# positional column index (projection=[4], projection=[8]), so dropping a
# column here would shift those indices.
X = data.to_numpy()

In [30]:
# Build the mapper and use a t-SNE embedding of X as the lens.
mapper = km.KeplerMapper(verbose=0)

# t-SNE is stochastic: pin random_state so the embedding (and every graph
# built from it) is the same after Restart Kernel -> Run All.
projected_data = mapper.fit_transform(
    X,
    projection=TSNE(perplexity=40, random_state=42),
)


In [31]:
# Build the filter functions (lenses).
# Lens 1: project the data onto the single column at positional index 4
# (described by the original author as the age feature -- TODO confirm against this CSV's column order).
lens1 = mapper.fit_transform(X, projection=[4])
# Lens 2: PCA of the data reduced to one component.
lens2 = mapper.fit_transform(X, projection=PCA(n_components=1))
# Stack the two lenses column-wise into one filter function.
# NOTE(review): filter_func is not passed to mapper.map in the cells visible here -- confirm whether it is still needed.
filter_func = np.c_[lens1, lens2]

# Values used to colour the graph nodes: the column at positional index 8.
color_vals = mapper.fit_transform(X, projection=[8])

In [34]:
# Build the Mapper graph over the projected data: cover it with overlapping
# cubes and cluster the points that fall inside each cube.
graph = mapper.map(
    projected_data,
    # KMeans initialisation is random; pin random_state so the same input
    # always yields the same graph.
    clusterer=KMeans(n_clusters=2, random_state=42),
    cover=km.Cover(n_cubes=10, perc_overlap=0.20),
)

In [None]:
# Render the Mapper graph to a standalone HTML file.
mapper.visualize(
    graph,
    title="Mapper visualization",
    path_html="mapper.html",
    # visualize() documents custom_tooltips as a NumPy array; converting here
    # makes the per-node member lookup positional instead of depending on
    # the index labels of a pandas object.
    custom_tooltips=np.asarray(y),
    color_values=color_vals,
    color_function_name=["Year Birth Projection"],
);  # trailing ';' keeps the returned HTML string out of the cell output

## Mapper visualizations of the full dataset

In [6]:
# Load the full (enumerated, filled-in) dataset.
datafull = pd.read_csv('./data/Test_Data_1102_enumerated_filledin.csv')

# Raw (unscaled) columns kept aside for labelling.
methdeath = datafull['methoddeath'].values
ages = datafull['Age'].values

# Feature table fed to the scaler. No columns are dropped at the moment; the
# previously considered pared-down variant was:
#   datafull.drop(columns=['RedCapID', 'CaseCtrl', 'indexredcapid',
#                          'Month_Birth', 'Year_Birth', 'Month_Death', 'Year_Death'])
data = datafull

# Standardize the features.
scaler = StandardScaler()
X = scaler.fit_transform(data)

In [7]:
# Display the column names of the full dataset, in order.
datafull.columns

Index(['RedCapID', 'CaseCtrl', 'indexredcapid', 'Female', 'Age', 'Race',
       'Spanish', 'Month_Birth', 'Year_Birth', 'Month_Death',
       ...
       '763.0', '790.6', '1.0', '3.0', '9.0', '8.0', '2.0', '4.0', '5.0',
       '7.0'],
      dtype='object', length=148)

In [17]:
## t-SNE embedding as the filter function (lens)
mapper = km.KeplerMapper(verbose=0)
# Alternative lens: project onto column index 4 instead of using t-SNE:
# projected_data = mapper.fit_transform(X, projection=[4])
# t-SNE is stochastic: pin random_state so the embedding is reproducible
# after Restart Kernel -> Run All.
projected_data = mapper.fit_transform(
    X,
    projection=TSNE(perplexity=50, random_state=42),
)

In [None]:
# Create the graph using DBSCAN as the clustering algorithm
graph = mapper.map(
    projected_data,
    clusterer=sklearn.cluster.DBSCAN(eps=0.1, min_samples=5),
    cover=km.Cover(n_cubes=30, perc_overlap=0.2),
)

# Tooltip labels: method-of-death values rounded to one decimal place.
# They must stay in the original row order: the visualization looks tooltips
# up by sample position, so sorting them would attach each label to the
# wrong sample.
y = np.round(methdeath, decimals=1)

mapper.visualize(
    graph,
    title="Mapper visualization",
    path_html="mapperTSNEDBSCAN.html",
    custom_tooltips=y,
);  # trailing ';' keeps the returned HTML string out of the cell output