In [None]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, OPTICS, Birch
from sklearn import decomposition
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

In [None]:
# Read the per-address feature table; the first CSV column is used as the index.
address_data = pd.read_csv('../data/address_df.csv', index_col=0)
address_df = pd.DataFrame(address_data)
address_df.head(5)

## Calculate total eth transferred

In [None]:
# Total ETH moved per address = ETH sent + ETH received.
# Computed column-wise instead of row by row: the earlier loop passed index
# *labels* to the positional `.iloc`, which is only correct when the index
# happens to be 0..n-1, and was also much slower than a vectorized sum.
total_eth_trans = (
    address_df['total_eth_sent'].astype(float)
    + address_df['total_eth_recv'].astype(float)
)
address_df['total_eth_trans'] = total_eth_trans
address_df.head()

## Select top 10,000 accounts by total eth transferred

In [None]:
from tabulate import tabulate

# Show the available columns, then rank addresses by total ETH moved (largest first).
print(address_df.columns)
sorted_df = address_df.sort_values('total_eth_trans', ascending=False)
sorted_df.head()


In [None]:
# Keep the 10,000 addresses with the largest total ETH transferred.
# `.copy()` makes this an independent frame: columns are assigned onto
# `top_addr_df` later in the notebook, and doing that on a `head()` slice of
# `sorted_df` can raise pandas' SettingWithCopyWarning.
top_addr_df = sorted_df.head(10000).copy()
len(top_addr_df)

In [None]:
# Row count of the selected address table
top_addr_df.shape[0]

In [None]:
# Clustering input: every column except the address identifier
top_addr_input = top_addr_df.drop(columns=['address'])

In [None]:
# Optional export of the selected address table to CSV; currently disabled.
# top_addr_df.to_csv('../data/top_10k_addr.csv')

# Method 1: Cluster raw dataset

In [None]:
# Method 1: k-means directly on the untransformed features, for k = 2..11.
sil_scores_raw, ch_scores_raw, db_scores_raw = [], [], []
n_clusters = list(range(2, 12))

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size).fit(top_addr_input)
    labels = kmeans.labels_

    # cluster sizes
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())

    # validity scores, evaluated on the same feature table used for fitting
    sil_score = silhouette_score(top_addr_input, labels, sample_size=100000)
    sil_scores_raw.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(top_addr_input, labels)
    ch_scores_raw.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(top_addr_input, labels)
    db_scores_raw.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # No scatter plot in this section: the 2-D coordinates (x_vals / y_vals)
    # are only created by the later projection cells.

In [None]:
# Method 1 score curves, one figure per metric.
# Each figure uses its own score list; previously all three figures plotted
# the silhouette scores, so the Calinski-Harabasz and Davies-Bouldin figures
# showed the wrong data under their titles.
plt.plot(n_clusters, sil_scores_raw)
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()
plt.plot(n_clusters, ch_scores_raw)
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()
plt.plot(n_clusters, db_scores_raw)
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 2: After TSNE transform

In [None]:
# 2-D t-SNE embedding of the raw feature table
from sklearn.manifold import TSNE
tsne_raw = TSNE(n_components=2)
X_embedded = tsne_raw.fit_transform(top_addr_input)

In [None]:
# First / second embedding coordinate of every address, as plain lists
x_vals = [point[0] for point in X_embedded]
y_vals = [point[1] for point in X_embedded]

# Unlabelled scatter of the embedding
plt.figure(figsize=[30,30])
plt.scatter(x_vals, y_vals, alpha=0.5)
plt.title('T-SNE transformed addresses')
plt.show()

In [None]:
# Method 2: k-means on the 2-D t-SNE embedding of the raw features, k = 2..11.
sil_scores_tsne_kmeans = []
ch_scores_tsne_kmeans = []
db_scores_tsne_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size)
    kmeans.fit(X_embedded)

    #number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    #scores
    # Scored in the space the clusters were fit in (X_embedded), as the PCA and
    # log-transform sections do. Scoring against top_addr_input mixed t-SNE
    # labels with raw-feature distances.
    sil_score = silhouette_score(X_embedded, kmeans.labels_, sample_size=100000)
    sil_scores_tsne_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(X_embedded, kmeans.labels_)
    ch_scores_tsne_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(X_embedded, kmeans.labels_)
    db_scores_tsne_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    #plot addresses with cluster labels
    # x_vals / y_vals hold the t-SNE coordinates here, so the axes are labelled
    # as t-SNE dimensions rather than principal components.
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses with {} clusters'.format(cluster_size))
    plt.show()

In [None]:
# Method 2 score curves: one figure per metric, in the order sil / CH / DB.
for curve, heading in (
    (sil_scores_tsne_kmeans, 'Number of Clusters vs. Silhouette Score'),
    (ch_scores_tsne_kmeans, 'Number of Clusters vs. Calinski-Harabasz Score'),
    (db_scores_tsne_kmeans, 'Number of Clusters vs. Davies-Bouldin Score'),
):
    plt.plot(n_clusters, curve)
    plt.title(heading)
    plt.show()

# Method 3: After PCA transform

In [None]:
# Project the raw features onto their first two principal components.
pca = decomposition.PCA(n_components=2).fit(top_addr_input)
transformed_df = pca.transform(top_addr_input)

# PC1 / PC2 coordinate of every address, as plain lists
x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

# Unlabelled scatter of the projected addresses
plt.scatter(x_vals, y_vals)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Addresses')
plt.show()

In [None]:
# Method 3: k-means on the 2-D PCA projection of the raw features, k = 2..11.
sil_scores_pca_kmeans, ch_scores_pca_kmeans, db_scores_pca_kmeans = [], [], []
n_clusters = list(range(2, 12))

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size).fit(transformed_df)
    labels = kmeans.labels_

    # cluster sizes
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())

    # validity scores in the projected space
    sil_score = silhouette_score(transformed_df, labels, sample_size=100000)
    sil_scores_pca_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, labels)
    ch_scores_pca_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, labels)
    db_scores_pca_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the projection, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses with {} clusters'.format(cluster_size))
    plt.show()

In [None]:
# Method 3 score curves: one figure per metric, in the order sil / CH / DB.
for curve, heading in (
    (sil_scores_pca_kmeans, 'Number of Clusters vs. Silhouette Score'),
    (ch_scores_pca_kmeans, 'Number of Clusters vs. Calinski-Harabasz Score'),
    (db_scores_pca_kmeans, 'Number of Clusters vs. Davies-Bouldin Score'),
):
    plt.plot(n_clusters, curve)
    plt.title(heading)
    plt.show()

In [None]:
# Combine the three Method 3 (PCA + k-means) scores into one value per k.
# Each metric is min-max scaled to [0, 1] across the tested k values and the
# three scaled values are averaged. Davies-Bouldin is "lower is better", so it
# is flipped with (1 - scaled). The earlier 1 / scaled form divided by zero
# for the k with the smallest Davies-Bouldin score, i.e. the best one.
def _min_max_scale(values):
    """Scale a sequence of scores to [0, 1]; an all-equal sequence maps to zeros."""
    arr = np.asarray(values, dtype=float)
    span = arr.max() - arr.min()
    if span == 0:
        return np.zeros_like(arr)
    return (arr - arr.min()) / span

agg_scores = list(
    (
        _min_max_scale(sil_scores_pca_kmeans)
        + _min_max_scale(ch_scores_pca_kmeans)
        + (1 - _min_max_scale(db_scores_pca_kmeans))
    ) / 3
)

plt.plot(n_clusters, agg_scores)
plt.show()

# All three clusterings compared

In [None]:
# Overlay Methods 1-3 per metric (red = raw, blue = t-SNE, yellow = PCA).
for curve, colour in (
    (sil_scores_raw, 'red'),
    (sil_scores_tsne_kmeans, 'blue'),
    (sil_scores_pca_kmeans, 'yellow'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

for curve, colour in (
    (ch_scores_pca_kmeans, 'yellow'),
    (ch_scores_raw, 'red'),
    (ch_scores_tsne_kmeans, 'blue'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

for curve, colour in (
    (db_scores_raw, 'red'),
    (db_scores_tsne_kmeans, 'blue'),
    (db_scores_pca_kmeans, 'yellow'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 4: Log transform, PCA, then cluster

In [None]:
# Independent working copy of the clustering input for the log-transform methods
top_addr_logt = top_addr_input.copy(deep=True)

In [None]:
# Replace every column of top_addr_logt with log(x + 1) of the raw values.
# The values are always taken from `top_addr_input` (restricted to the columns
# top_addr_logt currently has), so re-running this cell gives the same result.
# The earlier in-place loop would apply the log a second time on re-run.
top_addr_logt = np.log(top_addr_input[top_addr_logt.columns].astype(float) + 1)

In [None]:
# Preview the first five log-transformed rows
top_addr_logt.head(5)

In [None]:
# Remove the count, nonce and derived-total columns in a single call;
# the remaining columns are the features used by the log-transform methods.
top_addr_logt = top_addr_logt.drop(
    columns=['send_count', 'receive_count', 'max_nonce', 'total_eth_trans']
)
top_addr_logt.head()

In [None]:
# Project the log-transformed features onto their first two principal components.
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)

# PC1 / PC2 coordinate of every address, as plain lists
x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]


In [None]:
# Method 4: k-means on the 2-D PCA projection of the log-transformed features.
sil_scores_log_pca_kmeans = []
ch_scores_log_pca_kmeans = []
db_scores_log_pca_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# One colour per possible cluster label (the largest k is 11). The first four
# are the colours originally used for clusters 0-3.
cluster_colors = ['orange', 'navy', 'lightblue', 'darkred', 'green', 'purple',
                  'teal', 'pink', 'gray', 'olive', 'black']

# Font size is a global matplotlib setting; set it once rather than per figure.
plt.rc('font', size=20)

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size)
    kmeans.fit(transformed_df)

    #number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    #scores
    sil_score = silhouette_score(transformed_df, kmeans.labels_, sample_size=100000)
    sil_scores_log_pca_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, kmeans.labels_)
    ch_scores_log_pca_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, kmeans.labels_)
    db_scores_log_pca_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    if cluster_size == 4:
        #attach the 4-cluster labels and PCA coordinates to the address table
        top_addr_df['clusters'] = kmeans.labels_
        top_addr_df['pca_x_vals'] = x_vals
        top_addr_df['pca_y_vals'] = y_vals
        print(top_addr_df.head(5))
#         top_addr_df.to_csv('addresses_pca_kmeans.csv')

    plot_df = pd.DataFrame()
    plot_df['x_vals'] = x_vals
    plot_df['y_vals'] = y_vals
    plot_df['cluster'] = kmeans.labels_

    #plot addresses
    # Draw one scatter per cluster label actually produced for this k.
    # Previously only labels 0-3 were drawn, so for k > 4 the points of the
    # remaining clusters were missing from the figure, and for k < 4 the legend
    # listed clusters that did not exist. The bare `except: continue` around
    # the scatter calls is gone: selecting an absent label cannot raise, and
    # the `continue` would have skipped the figure entirely.
    plt.figure(figsize=(10,10))
    for cluster_id in range(cluster_size):
        members = plot_df[plot_df['cluster'] == cluster_id]
        plt.scatter(
            members['x_vals'],
            members['y_vals'],
            label=str(cluster_id),
            c=cluster_colors[cluster_id % len(cluster_colors)],
            alpha=0.5,
        )
    plt.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Method 4, Tx statistics, {} clusters'.format(cluster_size))
    plt.show()

## Compare all 4 methods

In [None]:
# Overlay Methods 1-4 per metric
# (red = raw, blue = t-SNE, yellow = PCA, green = log + PCA).
for curve, colour in (
    (sil_scores_raw, 'red'),
    (sil_scores_tsne_kmeans, 'blue'),
    (sil_scores_pca_kmeans, 'yellow'),
    (sil_scores_log_pca_kmeans, 'green'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

for curve, colour in (
    (ch_scores_pca_kmeans, 'yellow'),
    (ch_scores_raw, 'red'),
    (ch_scores_tsne_kmeans, 'blue'),
    (ch_scores_log_pca_kmeans, 'green'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

for curve, colour in (
    (db_scores_raw, 'red'),
    (db_scores_tsne_kmeans, 'blue'),
    (db_scores_pca_kmeans, 'yellow'),
    (db_scores_log_pca_kmeans, 'green'),
):
    plt.plot(n_clusters, curve, c=colour)
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 5: Log transform, tsne projection to 2-d space, cluster

In [None]:
# 2-D t-SNE embedding of the log-transformed features
# (fit and transform happen in one fit_transform call).
tsne = TSNE(n_components=2)
transformed_df = tsne.fit_transform(top_addr_logt)

# First / second embedding coordinate of every address, as plain lists
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]

In [None]:
# Method 5: k-means on the 2-D t-SNE embedding of the log-transformed features.
sil_scores_log_tsne_kmeans = []
ch_scores_log_tsne_kmeans = []
db_scores_log_tsne_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size)
    kmeans.fit(transformed_df)

    #number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    #scores (evaluated in the embedded space the clusters were fit in)
    sil_score = silhouette_score(transformed_df, kmeans.labels_, sample_size=100000)
    sil_scores_log_tsne_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, kmeans.labels_)
    ch_scores_log_tsne_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, kmeans.labels_)
    db_scores_log_tsne_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    #plot addresses
    # x_vals / y_vals are t-SNE coordinates in this section, so the axes are
    # labelled as t-SNE dimensions rather than principal components.
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses in {} clusters-Kmeans'.format(cluster_size))
    plt.show()
    


## Compare all 5 methods

In [None]:
# Overlay Methods 1-5 per metric
# (red = raw, blue = t-SNE, yellow = PCA, green = log + PCA, purple = log + t-SNE).
plt.plot(n_clusters, sil_scores_raw, c='red')
plt.plot(n_clusters, sil_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, sil_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, sil_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, sil_scores_log_tsne_kmeans, c='purple')
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

plt.plot(n_clusters, ch_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, ch_scores_raw, c='red')
plt.plot(n_clusters, ch_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, ch_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, ch_scores_log_tsne_kmeans, c='purple')
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

plt.plot(n_clusters, db_scores_raw, c='red')
plt.plot(n_clusters, db_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, db_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, db_scores_log_pca_kmeans, c='green')
# Davies-Bouldin series for log + t-SNE; this line previously plotted the
# Calinski-Harabasz list, which put the wrong metric on this figure.
plt.plot(n_clusters, db_scores_log_tsne_kmeans, c='purple')
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 6: Log transform, pca, DBSCAN

In [None]:
# Re-create the 2-D PCA projection of the log-transformed features
# (transformed_df / x_vals / y_vals were overwritten by the t-SNE section).
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)

x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

In [None]:
# Method 6: DBSCAN on the 2-D PCA projection, swept over several eps values.
sil_scores_log_pca_dbscan, ch_scores_log_pca_dbscan, db_scores_log_pca_dbscan = [], [], []
eps_values = [.2, .4, .5, 1, 2, 3]
n_clusters_dbscan1 = []

for eps_val in eps_values:
    dbscan = DBSCAN(eps=eps_val).fit(transformed_df)
    labels = dbscan.labels_

    # cluster sizes; the number of distinct labels is recorded as the
    # "number of clusters" for this eps
    # NOTE(review): DBSCAN labels noise points -1, which would be counted as a
    # cluster here — confirm this is intended.
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())
    n_clusters_dbscan1.append(len(pd.Series(labels).value_counts()))

    # validity scores in the projected space
    sil_score = silhouette_score(transformed_df, labels, sample_size=100000)
    sil_scores_log_pca_dbscan.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, labels)
    ch_scores_log_pca_dbscan.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, labels)
    db_scores_log_pca_dbscan.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the projection, coloured by DBSCAN label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses clustered with DBSCAN and epsilon value {}'.format(eps_val))
    plt.show()

In [None]:
#rm outlier clusterings from dbscan
# Drops the first DBSCAN run (eps = 0.2) from the four comparison lists.
# The length guard keeps this cell idempotent: the lists are trimmed only while
# they still hold one entry per eps value, so re-running the cell does not
# remove another run each time.
if len(n_clusters_dbscan1) == len(eps_values):
    n_clusters_dbscan1 = n_clusters_dbscan1[1:]
    sil_scores_log_pca_dbscan = sil_scores_log_pca_dbscan[1:]
    ch_scores_log_pca_dbscan = ch_scores_log_pca_dbscan[1:]
    db_scores_log_pca_dbscan = db_scores_log_pca_dbscan[1:]

In [None]:
#plot all three scores
# Disabled: per-eps score curves for the DBSCAN run.
# NOTE(review): if re-enabled, eps_values has six entries while the score
# lists have the first run removed by the preceding cell, so the x and y
# lengths would not match. The titles also say "Number of Clusters" although
# the x axis would be eps.
# plt.plot(eps_values, sil_scores_log_pca_dbscan, c='orange')

# plt.title('Number of Clusters vs. Silhouette Score')
# plt.show()

# plt.plot(eps_values, ch_scores_log_pca_dbscan, c='orange')

# plt.title('Number of Clusters vs. Calinski-Harabasz Score')
# plt.show()

# plt.plot(eps_values, db_scores_log_pca_dbscan, c='orange')

# plt.title('Number of Clusters vs. Davies-Bouldin Score')
# plt.show()

In [None]:
# Overlay Methods 1-6 per metric. K-means curves are plotted against
# n_clusters; the DBSCAN curve (teal) is plotted against the number of labels
# each eps value produced.
plt.plot(n_clusters, sil_scores_raw, c='red')
plt.plot(n_clusters, sil_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, sil_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, sil_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, sil_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, sil_scores_log_pca_dbscan, c='teal')
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

plt.plot(n_clusters, ch_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, ch_scores_raw, c='red')
plt.plot(n_clusters, ch_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, ch_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, ch_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, ch_scores_log_pca_dbscan, c='teal')
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

plt.plot(n_clusters, db_scores_raw, c='red')
plt.plot(n_clusters, db_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, db_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, db_scores_log_pca_kmeans, c='green')
# Davies-Bouldin series for log + t-SNE; this line previously plotted the
# Calinski-Harabasz list on the Davies-Bouldin figure.
plt.plot(n_clusters, db_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, db_scores_log_pca_dbscan, c='teal')
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 7: log transform, cluster, then pca and plot

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Method 7: k-means on the full log-transformed feature set; PCA is used only
# to obtain 2-D coordinates for the scatter plots.
sil_scores_log_kmeans, ch_scores_log_kmeans, db_scores_log_kmeans = [], [], []
n_clusters = list(range(2, 12))

# 2-D coordinates for visualisation
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)
x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size).fit(top_addr_logt)
    labels = kmeans.labels_

    # cluster sizes
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())

    # validity scores on the log-transformed features
    sil_score = silhouette_score(top_addr_logt, labels, sample_size=100000)
    sil_scores_log_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(top_addr_logt, labels)
    ch_scores_log_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(top_addr_logt, labels)
    db_scores_log_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the PCA projection, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log transform and kmeans clustering'.format(cluster_size))
    plt.show()
    


In [None]:
# Overlay Methods 1-7 per metric. K-means curves are plotted against
# n_clusters; the DBSCAN curve (teal) is plotted against the number of labels
# each eps value produced.
plt.plot(n_clusters, sil_scores_raw, c='red')
plt.plot(n_clusters, sil_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, sil_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, sil_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, sil_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, sil_scores_log_pca_dbscan, c='teal')
plt.plot(n_clusters, sil_scores_log_kmeans, c='black')
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

plt.plot(n_clusters, ch_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, ch_scores_raw, c='red')
plt.plot(n_clusters, ch_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, ch_scores_log_pca_kmeans, c='green')
plt.plot(n_clusters, ch_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, ch_scores_log_pca_dbscan, c='teal')
plt.plot(n_clusters, ch_scores_log_kmeans, c='black')
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

plt.plot(n_clusters, db_scores_raw, c='red')
plt.plot(n_clusters, db_scores_tsne_kmeans, c='blue')
plt.plot(n_clusters, db_scores_pca_kmeans, c='yellow')
plt.plot(n_clusters, db_scores_log_pca_kmeans, c='green')
# Davies-Bouldin series for log + t-SNE; this line previously plotted the
# Calinski-Harabasz list on the Davies-Bouldin figure.
plt.plot(n_clusters, db_scores_log_tsne_kmeans, c='purple')
plt.plot(n_clusters_dbscan1, db_scores_log_pca_dbscan, c='teal')
plt.plot(n_clusters, db_scores_log_kmeans, c='black')
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 8: Log transform, BIRCH clustering, PCA to visualize

In [None]:
from sklearn.cluster import Birch

# Method 8: BIRCH on the full log-transformed feature set; PCA is used only to
# obtain 2-D coordinates for the scatter plots.
sil_scores_log_birch, ch_scores_log_birch, db_scores_log_birch = [], [], []
n_clusters = list(range(2, 12))

# 2-D coordinates for visualisation
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)
x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

for cluster_size in n_clusters:
    birch = Birch(n_clusters=cluster_size).fit(top_addr_logt)
    labels = birch.labels_

    # cluster sizes
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())

    # validity scores on the log-transformed features
    sil_score = silhouette_score(top_addr_logt, labels, sample_size=100000)
    sil_scores_log_birch.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(top_addr_logt, labels)
    ch_scores_log_birch.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(top_addr_logt, labels)
    db_scores_log_birch.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the PCA projection, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log transform and birch clustering'.format(cluster_size))
    plt.show()
    

In [None]:
# Overlay the score curves per metric, now including log + BIRCH (pink).
# The two t-SNE based series are left out of these figures.
# Each entry is (x values, y values, colour), drawn in the listed order.
for xs, ys, colour in (
    (n_clusters, sil_scores_raw, 'red'),
    (n_clusters, sil_scores_pca_kmeans, 'yellow'),
    (n_clusters, sil_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, sil_scores_log_pca_dbscan, 'teal'),
    (n_clusters, sil_scores_log_kmeans, 'black'),
    (n_clusters, sil_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

for xs, ys, colour in (
    (n_clusters, ch_scores_pca_kmeans, 'yellow'),
    (n_clusters, ch_scores_raw, 'red'),
    (n_clusters, ch_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, ch_scores_log_pca_dbscan, 'teal'),
    (n_clusters, ch_scores_log_kmeans, 'black'),
    (n_clusters, ch_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

for xs, ys, colour in (
    (n_clusters, db_scores_raw, 'red'),
    (n_clusters, db_scores_pca_kmeans, 'yellow'),
    (n_clusters, db_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, db_scores_log_pca_dbscan, 'teal'),
    (n_clusters, db_scores_log_kmeans, 'black'),
    (n_clusters, db_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 9: Log transform, DBSCAN clustering, PCA to visualize

In [None]:
# 2-D PCA projection of the log-transformed features, used for the scatter plots
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)

x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

In [None]:
# Method 9: DBSCAN on the full log-transformed feature set, swept over eps.
sil_scores_log_dbscan, ch_scores_log_dbscan, db_scores_log_dbscan = [], [], []
eps_values = [.2, .4, .5, 1, 2, 3]
n_clusters_log_dbscan = []

for eps_val in eps_values:
    dbscan = DBSCAN(eps=eps_val).fit(top_addr_logt)
    labels = dbscan.labels_

    # cluster sizes; the number of distinct labels is recorded for this eps
    # NOTE(review): DBSCAN labels noise points -1, which would be counted as a
    # cluster here — confirm this is intended.
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())
    n_clusters_log_dbscan.append(len(pd.Series(labels).value_counts()))

    # validity scores on the log-transformed features
    sil_score = silhouette_score(top_addr_logt, labels, sample_size=100000)
    sil_scores_log_dbscan.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(top_addr_logt, labels)
    ch_scores_log_dbscan.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(top_addr_logt, labels)
    db_scores_log_dbscan.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the PCA projection, coloured by DBSCAN label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses clustered with DBSCAN and epsilon value {}'.format(eps_val))
    plt.show()

In [None]:
# Large overlay figures per metric, now including log + DBSCAN (blue).
# The log + t-SNE series is left out of these figures.
# Each entry is (x values, y values, colour), drawn in the listed order.
plt.figure(figsize=(30,30))
for xs, ys, colour in (
    (n_clusters, sil_scores_raw, 'red'),
    (n_clusters_log_dbscan, sil_scores_log_dbscan, 'blue'),
    (n_clusters, sil_scores_pca_kmeans, 'yellow'),
    (n_clusters, sil_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, sil_scores_log_pca_dbscan, 'teal'),
    (n_clusters, sil_scores_log_kmeans, 'black'),
    (n_clusters, sil_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Silhouette Score')
plt.show()

plt.figure(figsize=(30,30))
for xs, ys, colour in (
    (n_clusters, ch_scores_pca_kmeans, 'yellow'),
    (n_clusters, ch_scores_raw, 'red'),
    (n_clusters_log_dbscan, ch_scores_log_dbscan, 'blue'),
    (n_clusters, ch_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, ch_scores_log_pca_dbscan, 'teal'),
    (n_clusters, ch_scores_log_kmeans, 'black'),
    (n_clusters, ch_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Calinski-Harabasz Score')
plt.show()

plt.figure(figsize=(30,30))
for xs, ys, colour in (
    (n_clusters, db_scores_raw, 'red'),
    (n_clusters_log_dbscan, db_scores_log_dbscan, 'blue'),
    (n_clusters, db_scores_pca_kmeans, 'yellow'),
    (n_clusters, db_scores_log_pca_kmeans, 'green'),
    (n_clusters_dbscan1, db_scores_log_pca_dbscan, 'teal'),
    (n_clusters, db_scores_log_kmeans, 'black'),
    (n_clusters, db_scores_log_birch, 'pink'),
):
    plt.plot(xs, ys, c=colour)
plt.title('Number of Clusters vs. Davies-Bouldin Score')
plt.show()

# Method 10: Log transform, PCA, BIRCH clustering

In [None]:
from sklearn.cluster import Birch

# Method 10: BIRCH on the 2-D PCA projection of the log-transformed features.
sil_scores_log_pca_birch, ch_scores_log_pca_birch, db_scores_log_pca_birch = [], [], []
n_clusters = list(range(2, 12))

# projection used both for clustering and for the scatter plots
pca = decomposition.PCA(n_components=2).fit(top_addr_logt)
transformed_df = pca.transform(top_addr_logt)
x_vals = [row[0] for row in transformed_df]
y_vals = [row[1] for row in transformed_df]

for cluster_size in n_clusters:
    birch = Birch(n_clusters=cluster_size).fit(transformed_df)
    labels = birch.labels_

    # cluster sizes
    print('Addresses per cluster')
    print(pd.Series(labels).value_counts())

    # validity scores in the projected space
    sil_score = silhouette_score(transformed_df, labels, sample_size=100000)
    sil_scores_log_pca_birch.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, labels)
    ch_scores_log_pca_birch.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, labels)
    db_scores_log_pca_birch.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # scatter of the projection, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=labels)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log t, pca, and birch clustering'.format(cluster_size))
    plt.show()
    

In [None]:
# Silhouette scores per method, one row per cluster count (2..11).
results_df = pd.DataFrame()
results_df['n_clusters'] = range(2,12)
results_df = results_df.set_index('n_clusters')

# Columns are added in this order; each list has one score per cluster count.
for column_name, column_scores in (
    ('raw k-means', sil_scores_raw),
    ('log, k-means', sil_scores_log_kmeans),
    ('log, BIRCH', sil_scores_log_birch),
    ('log, PCA, k-means', sil_scores_log_pca_kmeans),
    ('log, PCA, BIRCH', sil_scores_log_pca_birch),
    # DBSCAN values are typed in by hand, at the rows for 5 and 8 clusters
    ('log, PCA, DBSCAN', [None, None, None, 0.8785447982724504, None, None, 0.7299287017776335, None, None, None]),
):
    results_df[column_name] = column_scores
results_df.head(10)

In [None]:
# Show entries 1..3 of the DBSCAN cluster counts and silhouette scores
for series in (n_clusters_dbscan1, sil_scores_log_pca_dbscan):
    print(series[1:4])

In [None]:
import pandas as pd
from tabulate import tabulate

# Legend table mapping the method numbers used in the final figures to the
# technique each one stands for.
method_names = ['1', '2', '3', '4', '5', '6']
method_techniques = ['raw k-means', 'log, k-means', 'log, BIRCH', 'log, PCA, k-means', 'log, PCA, BIRCH', 'log, PCA, DBSCAN']
method_df = pd.DataFrame(
    {'Method Number': method_names, 'Methods': method_techniques}
).set_index('Method Number')
print(tabulate(method_df, headers = 'keys', tablefmt = 'fancy_grid'))

In [None]:
# The three Method 4 (log, PCA, k-means) score lists side by side
method4_df = pd.DataFrame({
    'sil': sil_scores_log_pca_kmeans,
    'ch': ch_scores_log_pca_kmeans,
    'db': db_scores_log_pca_kmeans,
})
# method4_df.to_csv('tx_m4_scores.csv')

In [None]:
# Final comparison figures: one per metric, methods labelled 1-6.
plt.rc('font', size=16)

# (y-axis label, figure title, then the six score lists in method order 1..6)
metric_panels = (
    ('Silhouette Score',
     'Number of Clusters vs. Silhouette Score - Tx Data',
     sil_scores_raw, sil_scores_log_kmeans, sil_scores_log_birch,
     sil_scores_log_pca_kmeans, sil_scores_log_pca_birch, sil_scores_log_pca_dbscan),
    ('Calinski-Harabasz Score',
     'Number of Clusters vs. Calinski-Harabasz Score - Tx Data',
     ch_scores_raw, ch_scores_log_kmeans, ch_scores_log_birch,
     ch_scores_log_pca_kmeans, ch_scores_log_pca_birch, ch_scores_log_pca_dbscan),
    ('Davies-Bouldin Score',
     'Number of Clusters vs. Davies-Bouldin Score - Tx Data',
     db_scores_raw, db_scores_log_kmeans, db_scores_log_birch,
     db_scores_log_pca_kmeans, db_scores_log_pca_birch, db_scores_log_pca_dbscan),
)

for (y_label, heading, raw_scores, log_km_scores, log_birch_scores,
     log_pca_km_scores, log_pca_birch_scores, log_pca_dbscan_scores) in metric_panels:
    plt.figure(figsize=(10,10))
    plt.plot(n_clusters, raw_scores, label='1', c='red')
    plt.plot(n_clusters, log_km_scores, label='2', c='blue')
    plt.plot(n_clusters, log_birch_scores, label='3', c='green')
    plt.plot(n_clusters, log_pca_km_scores, label='4', c='yellow')
    plt.plot(n_clusters, log_pca_birch_scores, label='5', c='purple')
    # DBSCAN: only entries 1..3 of the series are drawn
    plt.plot(n_clusters_dbscan1[1:4], log_pca_dbscan_scores[1:4], label='6', c='teal')
    plt.legend()
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.xticks(ticks=range(2, 13))
    plt.grid(visible=True,axis='x')
    plt.grid(visible=True,axis='y')
    plt.show()