In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import kmapper as km
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from kmapper import jupyter

In [2]:
df = pd.read_csv('creditcard.csv')

In [3]:
Class = df['Class']
df.drop(['Class'], axis = 1, inplace = True)

In [4]:
def pca_df(data, var_per):
    """
    Inputs:  data - data frame 
             var_per - float in [0,1]
    Returns: pca class of data
             data frame with PC that explain var_per% of the variance of data
    """
    #instantiate PCA 
    pca = PCA(random_state=0)
    data_pca = pca.fit_transform(data)
    cumm_var = pca.explained_variance_ratio_.cumsum()
    #number of columns that explain var_per% of variance
    n = np.argmax(cumm_var >= var_per)
    #names of columns in output dataframe
    col_names = ['PC' + str(k) for k in range(1, data_pca.shape[1]+1)]
    #drop cols not necessary to explain % of variance desired
    data_PC = pd.DataFrame(data_pca, columns = col_names).iloc[:,:(n-1)]
    
    return pca, data_PC

In [5]:
pca, data_PC = pca_df(df, 0.8)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data_PC, Class, test_size = 0.1, random_state = 1)

In [7]:
scaler = StandardScaler()
train_std = scaler.fit_transform(X_train)
test_std = scaler.fit_transform(X_test)
test = scaler.fit_transform(data_PC)

In [22]:
reg = LogisticRegression().fit(train_std,y_train)
probs = reg.predict_proba(test)

In [60]:
#[[data_PC[data_PC.columns[0]].values[i]] for i in len(data_PC[data_PC.columns[0]].values)]
data_PC.columns[0]

'PC1'

In [62]:
type(data_PC[data_PC.columns[0]])

pandas.core.series.Series

In [63]:
data_PC = [[i] for i in data_PC[data_PC.columns[0]].values]
data_PC

[[94813.86288086855],
 [94813.85463962307],
 [94812.87566247962],
 [94812.86139626782],
 [94811.85840814977],
 [94811.85470829305],
 [94809.85475373376],
 [94806.85676049431],
 [94806.859674286],
 [94804.85470181906],
 [94803.8549343962],
 [94803.8550144504],
 [94803.8612732708],
 [94802.8560279726],
 [94801.85780106684],
 [94801.85540453301],
 [94801.85522598756],
 [94800.85453574498],
 [94799.857115437],
 [94798.85476892821],
 [94797.86743746289],
 [94796.85639571816],
 [94795.8546120521],
 [94795.85577392885],
 [94791.85452799634],
 [94791.85597607668],
 [94790.8568217097],
 [94790.85537872855],
 [94790.85635131747],
 [94790.85523014184],
 [94789.85544485088],
 [94788.854740228],
 [94787.85483893269],
 [94787.85483893269],
 [94787.85459503424],
 [94787.85459503424],
 [94786.85620047456],
 [94786.85460594855],
 [94784.85564540066],
 [94784.85485671712],
 [94781.85613240086],
 [94781.85463585082],
 [94780.85533353995],
 [94780.8550338647],
 [94779.85541060829],
 [94779.85568225074],
 

In [64]:
mapper = km.KeplerMapper(verbose = 0)
graph = mapper.map(data_PC,
                    test,
                   cover = km.Cover(n_cubes=15, perc_overlap=0.8),
                   clusterer = KMeans(n_clusters = 8))
html = mapper.visualize(graph, 
                        color_values= Class,
                        color_function_name = 'centrality',
                        node_color_function = ['median', 'max', 'average'], 
                        path_html="creditCardnu.html",
                        title="credit card")
jupyter.display(path_html="creditCard.html")

AttributeError: 'list' object has no attribute 'shape'

In [40]:
type(graph)
keys = graph.keys()
keys

dict_keys(['nodes', 'links', 'simplices', 'meta_data', 'meta_nodes'])

In [43]:
nodes = graph.get("nodes")
type(nodes)

collections.defaultdict

In [46]:
node_keys = nodes.keys()

In [48]:
nodes['cube0_cluster2']

[2,
 3,
 10,
 12,
 19,
 20,
 27,
 48,
 61,
 83,
 122,
 136,
 141,
 160,
 171,
 179,
 236,
 238,
 248,
 267,
 277,
 282,
 306,
 321,
 344,
 351,
 353,
 358,
 363,
 368,
 370,
 373,
 378,
 397,
 403,
 422,
 432,
 448,
 461,
 462,
 469,
 519,
 526,
 535,
 546,
 584,
 595,
 598,
 608,
 610,
 614,
 618,
 637,
 641,
 651,
 654,
 655,
 666,
 670,
 681,
 703,
 707,
 732,
 747,
 751,
 768,
 772,
 777,
 785,
 800,
 807,
 810,
 839,
 843,
 847,
 888,
 909,
 914,
 950,
 956,
 972,
 977,
 988,
 997,
 1016,
 1030,
 1035,
 1064,
 1066,
 1074,
 1078,
 1079,
 1087,
 1088,
 1096,
 1153,
 1208,
 1218,
 1258,
 1275,
 1281,
 1298,
 1299,
 1353,
 1365,
 1373,
 1402,
 1404,
 1414,
 1415,
 1425,
 1428,
 1435,
 1443,
 1444,
 1453,
 1469,
 1472,
 1474,
 1488,
 1502,
 1534,
 1547,
 1564,
 1566,
 1587,
 1594,
 1605,
 1629,
 1640,
 1641,
 1655,
 1668,
 1709,
 1745,
 1747,
 1753,
 1800,
 1807,
 1817,
 1824,
 1842,
 1845,
 1853,
 1854,
 1863,
 1896,
 1901,
 1904,
 1941,
 1977,
 1988,
 2034,
 2072,
 2079,
 2087,
 208