In [2]:
import pandas as pd

data = pd.read_csv('data/data.csv')
data.describe()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28
count,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,...,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0
mean,48999.5,0.00122,0.00558,-0.001042,-0.0007,-0.003522,-0.001612,-0.003042,5.545918,6.763061,...,-0.004513,-0.000515,-0.00167,-0.038752,-0.220002,0.166434,-0.064309,-0.06254,0.098472,-0.23091
std,28290.307527,1.002801,1.000742,1.001373,1.000422,1.003061,1.000532,0.997434,3.69184,4.152348,...,1.004372,1.002962,0.999703,1.477858,1.494836,1.543014,1.576086,1.428055,1.305407,1.528476
min,0.0,-4.732235,-4.202795,-4.377021,-4.010826,-4.535903,-4.300767,-4.894525,0.0,0.0,...,-4.894525,-4.732235,-4.43813,-6.873999,-8.234305,-7.792363,-6.593842,-7.375719,-7.335556,-6.954151
25%,24499.75,-0.675226,-0.670985,-0.672779,-0.67254,-0.68251,-0.675066,-0.680421,3.0,4.0,...,-0.678773,-0.679777,-0.675147,-1.022964,-1.203204,-0.903385,-1.128966,-0.97568,-0.746489,-1.262606
50%,48999.5,0.002022,0.00665,-0.000324,-0.003185,-0.003307,0.001024,-0.002053,5.0,6.0,...,-0.000587,-0.000806,0.000819,-0.056687,-0.219046,0.167074,-0.099221,-0.070852,0.08223,-0.271319
75%,73499.25,0.677271,0.677746,0.677086,0.672097,0.677589,0.673344,0.668112,8.0,9.0,...,0.672149,0.675437,0.676881,0.930158,0.76469,1.217432,0.987684,0.843212,0.925306,0.770516
max,97999.0,4.490521,4.324974,4.560247,4.399373,4.050549,4.710316,3.998595,32.0,30.0,...,4.560247,4.399373,4.135419,6.517721,6.054831,7.527271,7.544731,7.005608,7.205971,6.97715


## Preprocessing

In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer

data = data.drop('id', axis = 1)
categorical_cols = tuple(f'f_0{i}' if i < 10 else f'f_{i}' for i in range(7, 14))
numerical_cols = tuple(col for col in data.columns if col not in categorical_cols)
scale = make_column_transformer((StandardScaler(), numerical_cols), (OneHotEncoder(), categorical_cols))
data_norm = pd.DataFrame(scale.fit_transform(data).todense())
minMax = MinMaxScaler()
data_norm = pd.DataFrame(minMax.fit_transform(data_norm))
data_norm.describe()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,253,254,255,256,257,258,259,260,261,262
count,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,...,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0
mean,0.513236,0.493491,0.489633,0.476817,0.527853,0.477096,0.55003,0.493985,0.514757,0.502128,...,0.00148,0.001296,0.000633,0.000337,0.000163,0.000133,8.2e-05,5.1e-05,1e-05,2e-05
std,0.108731,0.117351,0.112045,0.118953,0.116819,0.111033,0.112158,0.112651,0.113675,0.1203,...,0.038437,0.035976,0.025145,0.018347,0.012777,0.011517,0.009035,0.007143,0.003194,0.004518
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.439891,0.414154,0.414471,0.396933,0.448776,0.40236,0.473861,0.41784,0.438097,0.421016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.513323,0.493616,0.489713,0.476522,0.527878,0.477389,0.550141,0.493999,0.514568,0.502264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.586539,0.572312,0.565509,0.556815,0.607177,0.551999,0.625499,0.570135,0.591574,0.583126,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Clustering

In [43]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data_norm)
cluster_range = range(2, 25)
db_index_scores = []
sil_scores = []

for k in cluster_range:
    model = KMeans(n_clusters=k).fit(X_train)
    res = model.predict(X_test)
    sil_score = silhouette_score(X_test, res, metric='cosine')
    db_index_score = davies_bouldin_score(X_test, res)
    sil_scores.append(sil_score)
    db_index_scores.append(db_index_score)
    print(f'Sil: {sil_score}, db: {db_index_score}, k: {k}')



Sil: 0.023245606133770837, db: 7.720677734853492, k: 2




Sil: 0.032949646991291485, db: 7.201494304312504, k: 3




Sil: 0.04496926596927746, db: 6.261272041519499, k: 4




Sil: 0.0666322406677773, db: 5.623551863012979, k: 5




Sil: 0.07447809378712658, db: 5.240924948160374, k: 6




Sil: 0.09012690333664736, db: 4.675535566552578, k: 7




Sil: 0.09972514347174288, db: 4.644439793838076, k: 8




Sil: 0.10882138968770795, db: 4.853593916892523, k: 9




Sil: 0.1169685732135741, db: 4.55356006470746, k: 10




Sil: 0.09636763429025642, db: 4.552832308092414, k: 11




Sil: 0.09004745491653589, db: 4.54222165948555, k: 12




Sil: 0.08619612175338981, db: 4.6704599582809125, k: 13




Sil: 0.08079919051346617, db: 4.484424830274853, k: 14




Sil: 0.07930857750164023, db: 4.399789636737025, k: 15




Sil: 0.07662995205331614, db: 4.428500015183609, k: 16




Sil: 0.06860271751589454, db: 4.567414328561959, k: 17




Sil: 0.06027964891292347, db: 4.4168125309707875, k: 18




Sil: 0.06322985285475462, db: 4.378129941278371, k: 19




Sil: 0.05977335021201827, db: 4.334864636333968, k: 20




Sil: 0.057273390731543056, db: 4.3214291631461625, k: 21




Sil: 0.053526350619805436, db: 4.26596471563242, k: 22




Sil: 0.04703000722221388, db: 4.545130593851255, k: 23




Sil: 0.045335135480828674, db: 4.951634165803399, k: 24


In [44]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.clf()
plt.plot(cluster_range, db_index_scores)
plt.plot(cluster_range, sil_scores)
plt.savefig('clusters_metrics.png')

# Cluster numbers worth trying: 7, 8, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22