In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import seaborn as sn
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
import seaborn as sns


In [9]:
def import_data(dummy, exclude):
    """import prepared dataset with or without dummy variables, 
    dummy: boolean
    exclude: list of variables to be deleted"""
    if dummy is True: 
        data_prep = pd.read_csv("C:\\Users\\Leni\\Documents\\UNI\\Data Challenges\\data\\analysis_dataset_dummy.csv", sep=";")
    else:
        data_prep = pd.read_csv("C:\\Users\\Leni\\Documents\\UNI\\Data Challenges\\data\\analysis_dataset.csv", sep=";")
    
    coin_clusters = data_prep[['coin']].copy()
    del data_prep["coin"]
    del data_prep["findspot"] #empty column
    del data_prep["Unnamed: 0"]
    
    for var in exclude:
        del data_prep[var]
        
    print("/data imported")
    
    return data_prep, coin_clusters


def mat_cat(data):
    """Dummy Variable material zurückführen - TO BE CHECKED"""
    material_col = data[['material_ ae ', 'material_ ar ', 'material_ av ', 'material_ cu ', 'material_ el ','material_ pb ']].idxmax(axis=1)
    #### TODO: missing values !
    material_col = material_col.to_frame(name='material')
    material_col['material'] = material_col['material'].str.replace("material_ ", "")
    data_material_col = data.join(material_col)
    for mat in ['material_ ae ', 'material_ ar ', 'material_ av ', 'material_ cu ', 'material_ el ','material_ pb ']:
        del data_material_col[mat]

    # cat encoding for material
    data_material_col['material'] = data_material_col['material'].astype('category')
    data_material_col[f'mat_cat'] = data_material_col['material'].cat.codes
    #data_material_col.drop(['material'], axis=1)
    del data_material_col['material']
    print('data_material_col')
    
    features = data_material_col.columns #column names
    index = data_material_col.index
    
    return data_material_col, features, index


def imput_it(data):
    """Imputieren von fehlenden Werten mit iterativem Imputer (gleich dbscan)"""
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp_data = data.copy()
    head = data.columns
    for col in data.columns:
        imp.fit(np.array(data[col]).reshape(-1,1))
        imp_data[col] = imp.transform(np.array(data[col]).reshape(-1,1))
    imp_data = pd.DataFrame(imp_data, columns=head)

    print("/data imputed")
    return imp_data

def imput_it_test(data, no_cat=False):
    """Imputieren von fehlenden Werten mit iterativem Imputer (gleich dbscan)"""
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp_data = data.copy()
    head = data.columns
    for col in data.columns:
        if data[col].dtype != 'category':
            print('test')
        imp.fit(np.array(data[col]).reshape(-1,1))
        imp_data[col] = imp.transform(np.array(data[col]).reshape(-1,1))
    imp_data = pd.DataFrame(imp_data, columns=head)

    print("/data imputed")
    return imp_data


def scale_st(data, features, index):
    """Daten skalieren"""
    # scale numeric data, MW 0 Std 1
    scaler = StandardScaler()
    scal_data = pd.DataFrame(scaler.fit_transform(data), columns=features, index=index) #ndarray, nan 

    # TODO: handle categorical data?

    print("/data scaled")
    
    return scal_data

def apply_kmeans(data, number_clusters):
    """kmeans algorithm, standard settings, only for nummeric data, use scaled data"""
    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,}

    kmeans = KMeans(n_clusters=n, **kmeans_kwargs)
    kmeans.fit(scal_imp_data)
    #data_clusters['labels_kmeans'] = kmeans.labels_
    #clusters_out = coins['labels_kmeans'] = kmeans.labels_
    print('/data kMeans clustered')
    print("min SSE:", kmeans.inertia_ )
    print("#iterations: ", kmeans.n_iter_)
    
    #print("cluster centers: ", kmeans.cluster_centers_)
    
    return kmeans.labels_

def apply_kmodes(data, number_clusters, cat_columns):
    """cat_columns : list e.g. [5,6,7,8]"""
    kproto = KPrototypes(n_clusters=number_clusters, init='Cao')
    clusters = kproto.fit_predict(data, categorical=cat_columns) #column indices that are categorical 
    
    print('/data kModes clustered')
    print("min SSE:") #?
    print("#iterations: ", kproto.n_iter_)
    #print("cluster centers: ", kmeans.cluster_centers_)
    
    # TODO: The k-prototypes also accepts np.NaN values as missing values for the categorical variables, but doesnot accept missing values for the numerical values.
    
    return kproto.labels_

def plot_feat_matrix(data, col_labels, title, all=False):
    if all is True:
        features_anal1 = data.columns
        features_anal2 = data.columns
    else:
        features_anal1 = ['denom_cat', 'maxdiam', 'weight','enddate', 'startdate', 'axis', ]#,'denom_cat', 'mint_cat', 'collection_cat' ]
        features_anal2 = ['weight', 'startdate', 'denom_cat', 'maxdiam']
    fig, axs = plt.subplots(len(features_anal1), len(features_anal2), sharex=False, sharey=False, figsize=(20, 20), constrained_layout=True)
    y, x = 0, 0

    for feature1 in features_anal1:
        for feature2 in features_anal2:
            if not feature1 == feature2:
                try:
                    axs[x, y].scatter(data[feature1], data[feature2], c=data[col_labels], alpha=0.6)
                    axs[x, y].set_xlabel(feature1)
                    axs[x, y].set_ylabel(feature2)
                except:
                    next
            y += 1
        y = 0
        x += 1

    fig.suptitle(title, fontsize=16)
    plt.show()
    plt.savefig(f'{col_labels}_plot.png')
    print('/plot saved')
    
    

def describe_feat_agg(data, col_labels):
    """mean and count per variable per cluster"""
    data_agg = data.groupby([col_labels]).agg(["mean", "count"])
    print('results per cluster')
    print(data_agg)

In [61]:
variant = 'kmeans_allvars'

# import and impute, scale data
exclude_var = []
data_prep, coin_clusters = import_data(dummy=False, exclude=exclude_var)
scal_imp_data = scale_st(imput_it(data_prep), features=data_prep.columns, index=data_prep.index)

#cluster data
n = 8 #number of clusters
labels = apply_kmeans(data=scal_imp_data, number_clusters=n)
coin_clusters[variant] = labels

#save results (coinsIDs + cluster labels)
coin_clusters.to_csv(f"clusters_{variant}.csv", sep=";")

# explore results
results = pd.concat([data_prep, coin_clusters], axis=1)
describe_feat_agg(data=results, col_labels=variant)
plot_feat_matrix(data=results, col_labels=variant, title="kMeans Clustering with all Variables, stand/imputet, no dummy")

/data imported
/data imputed


/data scaled


/data kMeans clustered
min SSE: 189958.34120381786
#iterations:  18


results per cluster
                  maxdiam          mindiam           weight           enddate  \
                     mean count       mean count       mean count        mean   
kmeans_allvars                                                                  
0               13.586995  4443  13.700294  2649   2.805155  5065 -357.426135   
1               22.285049  3995  21.735390  2449   7.011694  5714  160.326185   
2               22.917754  5235  21.797542  3673   7.617776  5152  184.068290   
3               27.670623  2407  26.620000  1035  14.778812  3367 -240.849183   
4               15.214088  1162  15.931760   233   8.688426   896 -410.452114   
5               21.496699  4387  20.271709  2639   6.672134  5910  155.413422   
6               15.836723  5307  15.890056  2158   3.816252  5598 -219.529085   
7               32.443036  3986  31.288425  3510  20.478835  3664  201.790597   

                       startdate        ... material_cat        denom_cat  \
           

/plot saved


In [62]:
variant = 'kmeans_noCollection'

# import and impute, scale data
exclude_var = ['collection_cat']
data_prep, coin_clusters = import_data(dummy=False, exclude=exclude_var)
scal_imp_data = scale_st(imput_it(data_prep), features=data_prep.columns, index=data_prep.index)

#cluster data
n = 8 #number of clusters
labels = apply_kmeans(data=scal_imp_data, number_clusters=n)
coin_clusters[variant] = labels

#save results (coinsIDs + cluster labels)
coin_clusters.to_csv(f"clusters_{variant}.csv", sep=";")

# explore results
results = pd.concat([data_prep, coin_clusters], axis=1)
describe_feat_agg(data=results, col_labels=variant)
plot_feat_matrix(data=results, col_labels=variant, title="kMeans Clustering with all Variables, stand/imputet, no dummy")

/data imported
/data imputed
/data scaled


/data kMeans clustered
min SSE: 151467.6229559213
#iterations:  26


results per cluster
                       maxdiam          mindiam           weight        \
                          mean count       mean count       mean count   
kmeans_noCollection                                                      
0                    15.318875  5714  15.543559  2492   3.652083  6064   
1                    13.788958  3617  13.716399  2291   2.835444  4146   
2                    32.166566  4281  30.990718  3749  20.097854  3863   
3                    15.214228  1069  16.495035   141   9.002646   805   
4                    19.736265  1826  20.191817   798   5.862431  1859   
5                    21.758167  7156  20.504195  4880   6.757835  8749   
6                    22.917091  4833  22.273006  2967   7.370159  6491   
7                    27.802016  2426  26.782101  1028  14.826073  3389   

                        enddate         startdate             axis        \
                           mean count        mean count       mean count   
kmeans_noColl

/plot saved


In [72]:
variant = 'kmeans_excl_corrVars'
#corr(mindiam, maxdiam)= 0,99, corr(weight, mindiam)=0.9, corr(weight, maxdiam)=0,88
#fillratio weight 86%, maxdiam 75%, mindiam 44% 
#corr(startdate, enddate)= 0,99, equal fillratios 

# import and impute, scale data
exclude_var = ['mindiam', 'startdate', 'maxdiam', 'collection_cat']
data_prep, coin_clusters = import_data(dummy=False, exclude=exclude_var)
scal_imp_data = scale_st(imput_it(data_prep), features=data_prep.columns, index=data_prep.index)

#cluster data
n = 8 #number of clusters
labels = apply_kmeans(data=scal_imp_data, number_clusters=n)
coin_clusters[variant] = labels

#save results (coinsIDs + cluster labels)
coin_clusters.to_csv(f"clusters_{variant}.csv", sep=";")

# explore results
results = pd.concat([data_prep, coin_clusters], axis=1)
describe_feat_agg(data=results, col_labels=variant)
sns.pairplot(results, hue=variant)
plt.show()
#plot_feat_matrix(all=True, data=results, col_labels=variant, title="kMeans Clustering with Variables excl highly correlated variables, stand/imputet, no dummy")

/data imported
/data imputed
/data scaled


/data kMeans clustered
min SSE: 86881.87876657107
#iterations:  39


results per cluster
                         weight            enddate              axis        \
                           mean  count        mean  count       mean count   
kmeans_excl_corrVars                                                         
0                     11.998474   3119  -66.473928   2589  10.542502  1047   
1                      4.249747   5139 -373.882126   5175   7.795154  1362   
2                      5.542562   4157  -60.834145   3491  11.799188  4188   
3                      7.329809  10396  121.293765  10842   6.520490  2367   
4                      6.709898   7325   38.802973   6390   6.581979  2153   
5                      7.607613   2371    2.955492   2112   1.574583  2400   
6                     26.720045   2016  153.097833   1615   7.033829   739   
7                      8.702100    843 -414.675749   1101  10.000000     3   

                     material_cat         denom_cat          mint_cat         \
                             mean  count 

In [10]:
variant = 'kmodes_noCollection'
# import and impute, scale data
exclude_var = ['collection_cat']
data_prep, coin_clusters = import_data(dummy=False, exclude=exclude_var)
scal_imp_data = scale_st(imput_it_test(data_prep), features=data_prep.columns, index=data_prep.index)
#TODO: impute only numeric values, handle cat nan as own category

#cluster data
n = 8 #number of clusters
labels = apply_kmodes(data=scal_imp_data, number_clusters=n, cat_columns=[5, 6, 7, 8])
coin_clusters[variant] = labels

#save results (coinsIDs + cluster labels)
coin_clusters.to_csv(f"clusters_{variant}.csv", sep=";")

# explore results



/data imported


TypeError: data type "category" not understood

In [None]:
#'maxdiam', 'weight', 'enddate', 'startdate', 'axis',
#'material_ ae ','material_ ar ', 'material_ av ', 'material_ cu ', 'material_ el ', #'material_ pb ', 
#'denom_cat', 'mint_cat', 'collection_cat']


    
#data = pd.concat([data_raw, data_clusters], axis=1)
#print(pd.DataFrame(kproto.labels_)[0].value_counts())
##algo = 'kmodes'
#algo = 'kmeans'

#data_agg = data.groupby(f"labels_{algo}").agg(["mean", "count"])
#print(data_agg)
#plot_feat_matrix(data, algo, "test")



In [None]:
max_clusters = 15
scal_imp_data = scale_st(imput_it(data_prep), features_prep, index_prep) #imputieren, skalieren

sse = []
for k in range(1, max_clusters):
    kmeans.fit(scal_imp_data)
    sse.append(kmeans.inertia_)

plt.plot(range(1, max_clusters), sse)
plt.xticks(range(1, max_clusters))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()


#elbow point
kl = KneeLocator(
    range(1, max_clusters), sse, curve="convex", direction="decreasing")
print("elbow point KneeLocator: ", kl.elbow)


