In [None]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, OPTICS, Birch
from sklearn import decomposition
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# read data

In [None]:
# Load the raw graph statistics. `graph_data` keeps 'address' as an ordinary
# column (a later cell indexes it), while `graph_df` is the same table keyed
# by address.
graph_data = pd.read_csv('../data/graph_data_out.csv')
graph_df = pd.DataFrame(data=graph_data).set_index('address')
graph_df.head()

In [None]:
# Load the second CSV (file name suggests log-transformed statistics) and key
# it by address in one chained expression.
graph_data_log = pd.read_csv('../data/graph_data_out_log.csv').set_index('address')
graph_data_log.head()

# method 0: raw kmeans

In [None]:
# Key the raw table by address so that only feature columns reach PCA/k-means.
# Guarded so the cell is idempotent: once 'address' has become the index it is
# no longer a column, and calling set_index('address') again raises KeyError.
if graph_data.index.name != 'address':
    graph_data = graph_data.set_index('address')

In [None]:
# Method 0 - k-means on the raw features in `graph_data`.
# PCA is used here only to obtain 2-D coordinates for the scatter plots; the
# clustering and the three quality scores are computed on `graph_data` itself.
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

pca = decomposition.PCA(n_components=2)
pca.fit(graph_data)
transformed_df = pca.transform(graph_data)

# 2-D plotting coordinates (first and second principal component)
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]

sil_scores_raw_kmeans = []
ch_scores_raw_kmeans = []
db_scores_raw_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(graph_data)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(graph_data, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_raw_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(graph_data, kmeans.labels_)
    ch_scores_raw_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(graph_data, kmeans.labels_)
    db_scores_raw_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by cluster label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters'.format(cluster_size))
    plt.show()

# method 1: pca, kmeans

In [None]:
# Dimensionality reduction: project `graph_data_log` onto its first two
# principal components. `fit` returns the estimator itself, so it is chained.
pca = decomposition.PCA(n_components=2).fit(graph_data_log)
transformed_df = pca.transform(graph_data_log)

# Split the projected points into per-axis lists for the scatter plots.
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]


In [None]:
# Method 1 - k-means on the 2-D PCA projection of `graph_data_log`
# (`transformed_df`, `x_vals`, `y_vals` come from the preceding PCA cell).
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

sil_scores_log_pca_kmeans = []
ch_scores_log_pca_kmeans = []
db_scores_log_pca_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(transformed_df)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(transformed_df, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_log_pca_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, kmeans.labels_)
    ch_scores_log_pca_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, kmeans.labels_)
    db_scores_log_pca_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters'.format(cluster_size))
    plt.show()

# method 2: tsne, kmeans

In [None]:
# 2-D t-SNE embedding of `graph_data_log`.
# t-SNE is stochastic: without a fixed `random_state` every run produces a
# different embedding, so the downstream clusters and scores would not be
# reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(graph_data_log)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in X_embedded]
y_vals = [point[1] for point in X_embedded]

In [None]:
# Method 2 - k-means on the 2-D t-SNE embedding of `graph_data_log`
# (`X_embedded`, `x_vals`, `y_vals` come from the preceding t-SNE cell).
#
# The score lists get their own `*_log_tsne_kmeans` names. This cell used to
# reuse the `*_log_pca_kmeans` names and thereby overwrote method 1's results.
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

sil_scores_log_tsne_kmeans = []
ch_scores_log_tsne_kmeans = []
db_scores_log_tsne_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(X_embedded)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(X_embedded, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_log_tsne_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(X_embedded, kmeans.labels_)
    ch_scores_log_tsne_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(X_embedded, kmeans.labels_)
    db_scores_log_tsne_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in t-SNE space (the axes are embedding dimensions, not
    # principal components), coloured by cluster label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses in {} clusters'.format(cluster_size))
    plt.show()

In [None]:
# Prose note kept as a bare string expression: it performs no computation and
# is only echoed as this cell's output.
'''
The clusters are too dispersed, we need to perform feature selection.
'''

# method 3: feature selection, tsne, kmeans

In [None]:
# Feature selection: keep only the three graph statistics used by the methods
# below, taken from `graph_data_log` and still indexed by address.
# (Column names, including the 'betweeness' spelling, are the data's own.)
selected_features = ['betweeness', 'in_degree', 'out_degree']
graph_df2 = graph_data_log[selected_features].copy()
graph_df2

In [None]:
# 2-D t-SNE embedding of the feature-selected table `graph_df2`.
# t-SNE is stochastic: without a fixed `random_state` every run produces a
# different embedding, so the downstream clusters and scores would not be
# reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in X_embedded]
y_vals = [point[1] for point in X_embedded]

In [None]:
# Method 3 - k-means on the 2-D t-SNE embedding of `graph_df2`
# (`X_embedded`, `x_vals`, `y_vals` come from the preceding t-SNE cell).
#
# All three scores are computed on `X_embedded`, the data that was clustered.
# They were previously computed against `transformed_df`, a PCA projection
# left over from an earlier cell, so they did not describe these clusters.
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

sil_scores_tsne_kmeans = []
ch_scores_tsne_kmeans = []
db_scores_tsne_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(X_embedded)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(X_embedded, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_tsne_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(X_embedded, kmeans.labels_)
    ch_scores_tsne_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(X_embedded, kmeans.labels_)
    db_scores_tsne_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in t-SNE space (the axes are embedding dimensions, not
    # principal components), coloured by cluster label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses in {} clusters'.format(cluster_size))
    plt.show()

# method 3.5: pca, kmeans

In [None]:
# Dimensionality reduction: project the feature-selected table `graph_df2`
# onto its first two principal components. `fit` returns the estimator itself,
# so it is chained.
pca = decomposition.PCA(n_components=2).fit(graph_df2)
transformed_df = pca.transform(graph_df2)

# Split the projected points into per-axis lists for the scatter plots.
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]


In [None]:
# k-means on the 2-D PCA projection of `graph_df2`
# (`transformed_df`, `x_vals`, `y_vals` come from the preceding PCA cell).
#
# Every cluster is drawn. The plot used to hard-code clusters 0-3, so for
# k > 4 the points of the remaining clusters were silently left out, and a
# bare `except: continue` around the scatter calls could only hide errors
# and skip the figure.
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

# One colour per cluster label; the first four match the original figures and
# the list covers the largest k tried below (11).
CLUSTER_COLORS = ['gold', 'hotpink', 'purple', 'green', 'tab:blue', 'tab:orange',
                  'tab:red', 'tab:brown', 'tab:gray', 'tab:cyan', 'tab:olive']

sil_scores_pca_kmeans = []
ch_scores_pca_kmeans = []
db_scores_pca_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

plt.rc('font', size=20)  # global matplotlib setting; only needs to be applied once

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(transformed_df)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(transformed_df, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_pca_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, kmeans.labels_)
    ch_scores_pca_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, kmeans.labels_)
    db_scores_pca_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    if cluster_size == 4:
        # Attach the 4-cluster labels and PCA coordinates to the raw table.
        # NOTE(review): assumes `graph_df` rows are in the same order as
        # `graph_df2` - confirm both CSVs list addresses identically.
        export_df = graph_df.copy()
        export_df['clusters'] = kmeans.labels_
        export_df['pca_x_vals'] = x_vals
        export_df['pca_y_vals'] = y_vals
        print(export_df.head(5))
#         export_df.to_csv('addresses_pca_kmeans_graph.csv')

    plot_df = pd.DataFrame({'x_vals': x_vals, 'y_vals': y_vals, 'cluster': kmeans.labels_})

    # plot addresses: one scatter call per cluster so each gets a legend entry
    plt.figure(figsize=(10,10))
    for cluster_id, members in plot_df.groupby('cluster'):
        plt.scatter(members['x_vals'], members['y_vals'], label=str(cluster_id),
                    c=CLUSTER_COLORS[cluster_id % len(CLUSTER_COLORS)], alpha=0.5)
    plt.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Method 4, Graph statistics, {} clusters'.format(cluster_size))
    plt.show()

# method 4: pca, birch

In [None]:
# BIRCH on the 2-D PCA projection of `graph_df2`.
# `Birch` is already imported in the notebook's first cell, so the redundant
# mid-notebook import was dropped.
RANDOM_STATE = 42  # fixed seed for the silhouette subsample

sil_scores_pca_birch = []
ch_scores_pca_birch = []
db_scores_pca_birch = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

#dimensionality reduction
pca = decomposition.PCA(n_components=2)
pca.fit(graph_df2)
transformed_df = pca.transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]

for cluster_size in n_clusters:
    birch = Birch(n_clusters=cluster_size)
    birch.fit(transformed_df)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(birch.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(transformed_df, birch.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_pca_birch.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(transformed_df, birch.labels_)
    ch_scores_pca_birch.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(transformed_df, birch.labels_)
    db_scores_pca_birch.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by cluster label
    plt.figure(figsize=[30,30])
    plt.scatter(x_vals, y_vals, c=birch.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log t, pca, and birch clustering'.format(cluster_size))
    plt.show()
    

# method 5: tsne, birch

In [None]:
# 2-D t-SNE embedding of the feature-selected table `graph_df2`.
# t-SNE is stochastic: without a fixed `random_state` every run produces a
# different embedding, so the downstream clusters and scores would not be
# reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in X_embedded]
y_vals = [point[1] for point in X_embedded]

In [None]:
# Method 5 - BIRCH on the 2-D t-SNE embedding of `graph_df2`
# (`X_embedded`, `x_vals`, `y_vals` come from the preceding t-SNE cell).
#
# BIRCH is fitted and scored on `X_embedded`. It was previously fitted and
# scored on `transformed_df` (the PCA projection from an earlier cell), so this
# "t-SNE" method merely repeated the PCA+BIRCH results while plotting them at
# t-SNE coordinates.
RANDOM_STATE = 42  # fixed seed for the silhouette subsample

sil_scores_tsne_birch = []
ch_scores_tsne_birch = []
db_scores_tsne_birch = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for cluster_size in n_clusters:
    birch = Birch(n_clusters=cluster_size)
    birch.fit(X_embedded)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(birch.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(X_embedded, birch.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_tsne_birch.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(X_embedded, birch.labels_)
    ch_scores_tsne_birch.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(X_embedded, birch.labels_)
    db_scores_tsne_birch.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in t-SNE space; title and axis labels now name t-SNE
    # instead of PCA
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=birch.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses in {} clusters after log t, t-SNE, and birch clustering'.format(cluster_size))
    plt.show()
    

# method 6: pca, dbscan

In [None]:
#dimensionality reduction
pca = decomposition.PCA(n_components=2)
pca.fit(graph_df2)
transformed_df = pca.transform(graph_df2)

x_vals = []
y_vals = []
for val in transformed_df:
    x_vals.append(val[0])
    y_vals.append(val[1])

In [None]:
# Method 6 - DBSCAN on the 2-D PCA projection of `graph_df2`, swept over
# several neighbourhood radii (`eps`).
# (`transformed_df`, `x_vals`, `y_vals` come from the preceding PCA cell.)
RANDOM_STATE = 42  # fixed seed for the silhouette subsample

sil_scores_pca_dbscan = []
ch_scores_pca_dbscan = []
db_scores_pca_dbscan = []
eps_values = [.2, .4, .5, 1, 2]
n_clusters_pca_dbscan = []

for eps_val in eps_values:
    dbscan = DBSCAN(eps=eps_val)
    dbscan.fit(transformed_df)

    # number of addresses per cluster
    label_counts = pd.Series(dbscan.labels_).value_counts()
    print('Addresses per cluster')
    print(label_counts)
    # NOTE: this counts distinct labels, so DBSCAN's noise label (-1), when
    # present, is included in the "number of clusters".
    n_labels = len(label_counts)
    n_clusters_pca_dbscan.append(n_labels)

    # scores: the three metrics raise ValueError unless there are at least two
    # labels (and fewer labels than samples), which DBSCAN does not guarantee,
    # e.g. one big cluster for a large eps. Record NaN in that case so the
    # score lists stay aligned with `eps_values`.
    if 1 < n_labels < len(dbscan.labels_):
        sil_score = silhouette_score(transformed_df, dbscan.labels_, sample_size=100000,
                                     random_state=RANDOM_STATE)
        ch_score = calinski_harabasz_score(transformed_df, dbscan.labels_)
        db_score = davies_bouldin_score(transformed_df, dbscan.labels_)
    else:
        print('Scores undefined for {} label(s) at eps={}'.format(n_labels, eps_val))
        # numpy scalar (not a plain float) so later `.round(2)` calls still work
        sil_score = ch_score = db_score = np.float64(np.nan)

    sil_scores_pca_dbscan.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_scores_pca_dbscan.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_scores_pca_dbscan.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by DBSCAN label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=dbscan.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses clustered with DBSCAN and epsilon value {}'.format(eps_val))
    plt.show()

# method 7: tsne, dbscan

In [None]:
# 2-D t-SNE embedding of the feature-selected table `graph_df2`.
# t-SNE is stochastic: without a fixed `random_state` every run produces a
# different embedding, so the downstream clusters and scores would not be
# reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in X_embedded]
y_vals = [point[1] for point in X_embedded]

In [None]:
# Method 7 - DBSCAN on the 2-D t-SNE embedding of `graph_df2`, swept over
# several neighbourhood radii (`eps`).
# (`X_embedded`, `x_vals`, `y_vals` come from the preceding t-SNE cell.)
RANDOM_STATE = 42  # fixed seed for the silhouette subsample

sil_scores_tsne_dbscan = []
ch_scores_tsne_dbscan = []
db_scores_tsne_dbscan = []
eps_values = [.2, .4, .5, 1, 2, 3]
n_clusters_tsne_dbscan = []

for eps_val in eps_values:
    dbscan = DBSCAN(eps=eps_val)
    dbscan.fit(X_embedded)

    # number of addresses per cluster
    label_counts = pd.Series(dbscan.labels_).value_counts()
    print('Addresses per cluster')
    print(label_counts)
    # NOTE: this counts distinct labels, so DBSCAN's noise label (-1), when
    # present, is included in the "number of clusters".
    n_labels = len(label_counts)
    n_clusters_tsne_dbscan.append(n_labels)

    # scores: the three metrics raise ValueError unless there are at least two
    # labels (and fewer labels than samples), which DBSCAN does not guarantee,
    # e.g. everything labelled noise for a small eps. Record NaN in that case
    # so the score lists stay aligned with `eps_values`.
    if 1 < n_labels < len(dbscan.labels_):
        sil_score = silhouette_score(X_embedded, dbscan.labels_, sample_size=100000,
                                     random_state=RANDOM_STATE)
        ch_score = calinski_harabasz_score(X_embedded, dbscan.labels_)
        db_score = davies_bouldin_score(X_embedded, dbscan.labels_)
    else:
        print('Scores undefined for {} label(s) at eps={}'.format(n_labels, eps_val))
        # numpy scalar (not a plain float) so later `.round(2)` calls still work
        sil_score = ch_score = db_score = np.float64(np.nan)

    sil_scores_tsne_dbscan.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_scores_tsne_dbscan.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_scores_tsne_dbscan.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in t-SNE space (the axes are embedding dimensions, not
    # principal components), coloured by DBSCAN label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=dbscan.labels_)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('Addresses clustered with DBSCAN and epsilon value {}'.format(eps_val))
    plt.show()

In [None]:
# Plot all three scores against the number of clusters for the methods run on
# the feature-selected data: one figure per metric, one line per method.
#
# Changes from the copy-pasted version:
#  * the Davies-Bouldin figure includes PCA+BIRCH again (its line had been
#    commented out because it pointed at the Calinski-Harabasz list);
#  * each figure has a legend and axis labels, so the colours can be read.
# The DBSCAN line uses the [1:4] slice of the eps sweep, as before.
METHOD_STYLES = [
    ('t-SNE, k-means', 'red'),
    ('PCA, k-means', 'yellow'),
    ('PCA, BIRCH', 'purple'),
    ('t-SNE, BIRCH', 'pink'),
]

# (metric name, score lists in METHOD_STYLES order, DBSCAN score list)
comparison_panels = [
    ('Silhouette Score',
     [sil_scores_tsne_kmeans, sil_scores_pca_kmeans, sil_scores_pca_birch, sil_scores_tsne_birch],
     sil_scores_pca_dbscan),
    ('Calinski-Harabasz Score',
     [ch_scores_tsne_kmeans, ch_scores_pca_kmeans, ch_scores_pca_birch, ch_scores_tsne_birch],
     ch_scores_pca_dbscan),
    ('Davies-Bouldin Score',
     [db_scores_tsne_kmeans, db_scores_pca_kmeans, db_scores_pca_birch, db_scores_tsne_birch],
     db_scores_pca_dbscan),
]

for metric_name, method_scores, dbscan_scores in comparison_panels:
    plt.figure(figsize=(15,15))
    for (method_label, line_color), scores in zip(METHOD_STYLES, method_scores):
        plt.plot(n_clusters, scores, c=line_color, label=method_label)
    plt.plot(n_clusters_pca_dbscan[1:4], dbscan_scores[1:4], c='teal', label='PCA, DBSCAN')
    plt.title('Number of Clusters vs. {}'.format(metric_name))
    plt.xlabel('Number of Clusters')
    plt.ylabel(metric_name)
    plt.xticks(ticks=range(12))
    plt.grid(visible=True,axis='x')
    plt.legend()
    plt.show()

# method 8: log, kmeans, pca

In [None]:
# Method 8 - k-means directly on the feature-selected table `graph_df2`.
# PCA is used only to obtain 2-D coordinates for the scatter plots.
# `KMeans` and `TSNE` are already imported in the notebook's first cell (and
# TSNE is not used here), so the redundant mid-notebook imports were dropped.
RANDOM_STATE = 42  # fixed seed: k-means initialisation and silhouette subsampling are otherwise random

sil_scores_log_kmeans = []
ch_scores_log_kmeans = []
db_scores_log_kmeans = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

#dimensionality reduction (for plotting only)
pca = decomposition.PCA(n_components=2)
pca.fit(graph_df2)
transformed_df = pca.transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]

for cluster_size in n_clusters:
    kmeans = KMeans(n_clusters=cluster_size, random_state=RANDOM_STATE)
    kmeans.fit(graph_df2)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(kmeans.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(graph_df2, kmeans.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_log_kmeans.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(graph_df2, kmeans.labels_)
    ch_scores_log_kmeans.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(graph_df2, kmeans.labels_)
    db_scores_log_kmeans.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by cluster label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=kmeans.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log transform and kmeans clustering'.format(cluster_size))
    plt.show()
    


# method 9: log, birch, pca

In [None]:
# Method 9 - BIRCH directly on the feature-selected table `graph_df2`.
# PCA is used only to obtain 2-D coordinates for the scatter plots.
# `Birch` is already imported in the notebook's first cell, so the redundant
# mid-notebook import was dropped.
RANDOM_STATE = 42  # fixed seed for the silhouette subsample

sil_scores_log_birch = []
ch_scores_log_birch = []
db_scores_log_birch = []
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

#dimensionality reduction (for plotting only)
pca = decomposition.PCA(n_components=2)
pca.fit(graph_df2)
transformed_df = pca.transform(graph_df2)

# per-axis coordinate lists for the scatter plots
x_vals = [point[0] for point in transformed_df]
y_vals = [point[1] for point in transformed_df]

for cluster_size in n_clusters:
    birch = Birch(n_clusters=cluster_size)
    birch.fit(graph_df2)

    # number of addresses per cluster
    print('Addresses per cluster')
    print(pd.Series(birch.labels_).value_counts())

    # scores; the silhouette is estimated on a seeded random subsample so that
    # repeated runs report the same value
    sil_score = silhouette_score(graph_df2, birch.labels_, sample_size=100000,
                                 random_state=RANDOM_STATE)
    sil_scores_log_birch.append(sil_score)
    print('Silhouette score:', sil_score)

    ch_score = calinski_harabasz_score(graph_df2, birch.labels_)
    ch_scores_log_birch.append(ch_score)
    print('Calinski-Harabasz score:', ch_score)

    db_score = davies_bouldin_score(graph_df2, birch.labels_)
    db_scores_log_birch.append(db_score)
    print('Davies-Bouldin score:', db_score)

    # plot addresses in PCA space, coloured by cluster label
    plt.figure(figsize=[15,15])
    plt.scatter(x_vals, y_vals, c=birch.labels_)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Addresses in {} clusters after log transform and birch clustering'.format(cluster_size))
    plt.show()
    

In [None]:
# Summary table of silhouette scores: one row per number of clusters (2-11),
# one column per method, rounded to two decimals.
# `np.round` is used instead of `x.round(2)` so the cell works whether the
# metric functions returned numpy scalars or plain Python floats.
sil_results_df = pd.DataFrame(index=pd.Index(range(2, 12), name='n_clusters'))
sil_results_df['raw k-means'] = np.round(sil_scores_raw_kmeans, 2)
sil_results_df['log, k-means'] = np.round(sil_scores_log_kmeans, 2)
sil_results_df['log, BIRCH'] = np.round(sil_scores_log_birch, 2)
sil_results_df['log, PCA, k-means'] = np.round(sil_scores_pca_kmeans, 2)
sil_results_df['log, PCA, BIRCH'] = np.round(sil_scores_pca_birch, 2)

# DBSCAN chooses its own number of clusters, so its scores are placed on the
# row matching the label count found for each eps (same [1:4] slice of the eps
# sweep as the plots). This replaces values hand-copied from an earlier run,
# which go stale as soon as the data or the sweep changes. Rows with no
# matching DBSCAN run stay NaN.
dbscan_sil_by_k = dict(zip(n_clusters_pca_dbscan[1:4],
                           np.round(sil_scores_pca_dbscan[1:4], 2)))
sil_results_df['log, PCA, DBSCAN'] = [dbscan_sil_by_k.get(k, np.nan)
                                      for k in sil_results_df.index]
sil_results_df.head(10)

In [None]:
# Summary table of Calinski-Harabasz scores: one row per number of clusters
# (2-11), one column per method, rounded to two decimals.
# `np.round` is used instead of `x.round(2)` so the cell works whether the
# metric functions returned numpy scalars or plain Python floats.
ch_results_df = pd.DataFrame(index=pd.Index(range(2, 12), name='n_clusters'))
ch_results_df['raw k-means'] = np.round(ch_scores_raw_kmeans, 2)
ch_results_df['log, k-means'] = np.round(ch_scores_log_kmeans, 2)
ch_results_df['log, BIRCH'] = np.round(ch_scores_log_birch, 2)
ch_results_df['log, PCA, k-means'] = np.round(ch_scores_pca_kmeans, 2)
ch_results_df['log, PCA, BIRCH'] = np.round(ch_scores_pca_birch, 2)

# DBSCAN chooses its own number of clusters, so its scores are placed on the
# row matching the label count found for each eps (same [1:4] slice of the eps
# sweep as the plots). This replaces values hand-copied from an earlier run,
# which go stale as soon as the data or the sweep changes. Rows with no
# matching DBSCAN run stay NaN.
dbscan_ch_by_k = dict(zip(n_clusters_pca_dbscan[1:4],
                          np.round(ch_scores_pca_dbscan[1:4], 2)))
ch_results_df['log, PCA, DBSCAN'] = [dbscan_ch_by_k.get(k, np.nan)
                                     for k in ch_results_df.index]
ch_results_df.head(10)

In [None]:
# Summary table of Davies-Bouldin scores: one row per number of clusters
# (2-11), one column per method, rounded to two decimals.
# `np.round` is used instead of `x.round(2)` so the cell works whether the
# metric functions returned numpy scalars or plain Python floats.
db_results_df = pd.DataFrame(index=pd.Index(range(2, 12), name='n_clusters'))
db_results_df['raw k-means'] = np.round(db_scores_raw_kmeans, 2)
db_results_df['log, k-means'] = np.round(db_scores_log_kmeans, 2)
db_results_df['log, BIRCH'] = np.round(db_scores_log_birch, 2)
db_results_df['log, PCA, k-means'] = np.round(db_scores_pca_kmeans, 2)
db_results_df['log, PCA, BIRCH'] = np.round(db_scores_pca_birch, 2)

# DBSCAN chooses its own number of clusters, so its scores are placed on the
# row matching the label count found for each eps (same [1:4] slice of the eps
# sweep as the plots). This replaces values hand-copied from an earlier run,
# which go stale as soon as the data or the sweep changes. Rows with no
# matching DBSCAN run stay NaN.
dbscan_db_by_k = dict(zip(n_clusters_pca_dbscan[1:4],
                          np.round(db_scores_pca_dbscan[1:4], 2)))
db_results_df['log, PCA, DBSCAN'] = [dbscan_db_by_k.get(k, np.nan)
                                     for k in db_results_df.index]
db_results_df.head(10)

In [None]:
# Show the label counts and rounded Davies-Bouldin scores for the DBSCAN runs
# at positions 1-3 of the eps sweep (the same slice the plots use).
dbscan_window = slice(1, 4)
print(n_clusters_pca_dbscan[dbscan_window])
print([score.round(2) for score in db_scores_pca_dbscan[dbscan_window]])

In [None]:
# Collect the three score series of the PCA + k-means method in one table
# (one row per entry of the score lists).
method4_df = pd.DataFrame({
    'sil': sil_scores_pca_kmeans,
    'ch': ch_scores_pca_kmeans,
    'db': db_scores_pca_kmeans,
})
# method4_df.to_csv('graph_m4_scores.csv')

In [None]:
# Final comparison figures: one per metric, lines labelled '1'-'6' by method.
# The three copy-pasted plotting sections are driven from one table.
# A `None` entry means that method is left out of that figure (raw k-means is
# not drawn on the Calinski-Harabasz figure).
line_styles = [('1', 'red'), ('2', 'blue'), ('3', 'green'), ('4', 'yellow'), ('5', 'purple')]

# (title, y-axis label, per-method score lists in line_styles order, DBSCAN scores)
figure_specs = [
    ('Number of Clusters vs. Silhouette Score - Graph Data',
     'Silhouette Score',
     [sil_scores_raw_kmeans, sil_scores_log_kmeans, sil_scores_log_birch,
      sil_scores_pca_kmeans, sil_scores_pca_birch],
     sil_scores_pca_dbscan),
    ('Number of Clusters vs. Calinski-Harabasz Score - Graph Data',
     'Calinski-Harabasz Score',
     [None, ch_scores_log_kmeans, ch_scores_log_birch,
      ch_scores_pca_kmeans, ch_scores_pca_birch],
     ch_scores_pca_dbscan),
    ('Number of Clusters vs. Davies-Bouldin Score - Graph Data',
     'Davies-Bouldin Score',
     [db_scores_raw_kmeans, db_scores_log_kmeans, db_scores_log_birch,
      db_scores_pca_kmeans, db_scores_pca_birch],
     db_scores_pca_dbscan),
]

plt.rc('font', size=16)

for figure_title, y_label, method_scores, dbscan_scores in figure_specs:
    plt.figure(figsize=(10,10))
    for (line_label, line_color), scores in zip(line_styles, method_scores):
        if scores is None:
            continue
        plt.plot(n_clusters, scores, label=line_label, c=line_color)
    # DBSCAN is plotted against the label counts it found (runs 1-3 of the sweep)
    plt.plot(n_clusters_pca_dbscan[1:4], dbscan_scores[1:4], label='6', c='teal')
    plt.title(figure_title)
    plt.xticks(ticks=range(2, 12))
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(visible=True,axis='x')
    plt.grid(visible=True,axis='y')
    plt.show()