In [1]:
import cv2
import pandas as pd
import numpy as np

def preprocessing(val=True):
    """Load the training features and labels, optionally edge-filtering the features.

    Both tables are read from CSV files under ``data/``.

    Parameters
    ----------
    val : bool, default True
        When truthy, every feature row is cast to ``uint8``, passed through
        Canny edge detection (thresholds 10 and 30) and then a 5x5 Gaussian
        blur; the flattened results are collected into a new DataFrame.
        When falsy, the feature table is returned exactly as read.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``labels`` is the label table as read
        from its CSV file.
    """
    features = pd.read_csv('data/x_train_gr_smpl.csv')
    labels = pd.read_csv('data/y_train_smpl.csv')

    if not val:
        return features, labels

    def _edge_filtered(position):
        # NOTE(review): the row is handed to OpenCV as a 1-D vector, not
        # reshaped into a 2-D image first - confirm this is intended.
        pixels = np.uint8(features.iloc[position])
        edges = cv2.Canny(pixels, 10, 30)
        blurred = cv2.GaussianBlur(edges, (5, 5), 0)
        # Flatten whatever shape OpenCV returned back into one feature row.
        return blurred.reshape((1, -1))[0]

    filtered_rows = [_edge_filtered(position) for position in range(features.shape[0])]
    return pd.DataFrame(filtered_rows), labels

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score

In [3]:
# NO labels NO preprocessing
# Cluster the features exactly as read from disk and score the clusters
# against the true labels.
data, labels = preprocessing(False)
# Keep the (1, n_samples) layout used below; ndarray.reshape avoids the
# `newshape=` keyword of np.reshape, which recent NumPy releases deprecate.
labels = labels.values.reshape(1, -1)

# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
estimator = KMeans(n_clusters=10, max_iter=50000, init='random', random_state=1).fit(data)
print(homogeneity_score(labels[0], estimator.labels_))

0.10029982093600802


In [4]:
# YES labels NO preprocessing
data, labels = preprocessing(False)
# "YES labels": the true label is appended as an extra column and stays in the
# frames passed to KMeans below.
# NOTE(review): presumably deliberate given the heading - confirm.
data['label'] = labels
# Seeded shuffle so the train/test split (and the printed score) is the same on
# every run; the score saved with this notebook came from an unseeded shuffle
# and will therefore differ slightly.
data = data.sample(frac=1, random_state=1)

# First 10000 shuffled rows train the model, the remainder is held out.
X_training = data.iloc[:10000]
X_test = data.iloc[10000:]
y_test = X_test['label']

# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
estimator = KMeans(n_clusters=10, max_iter=50000, random_state=1).fit(X_training)
e = estimator.predict(X_test)
print(homogeneity_score(y_test.values, e))

0.09738081664029646


In [5]:
# NO labels YES preprocessing
# Cluster the edge-filtered features and score the clusters against the true
# labels.
data, labels = preprocessing()
# Keep the (1, n_samples) layout used below; ndarray.reshape avoids the
# `newshape=` keyword of np.reshape, which recent NumPy releases deprecate.
labels = labels.values.reshape(1, -1)

# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
estimator = KMeans(n_clusters=10, max_iter=50000, init='random', random_state=1).fit(data)
print(homogeneity_score(labels[0], estimator.labels_))

0.4924864156309917


In [6]:
# YES labels YES preprocessing
data, labels = preprocessing()
# "YES labels": the true label is appended as an extra column and stays in the
# frames passed to KMeans below.
# NOTE(review): presumably deliberate given the heading - confirm.
data['label'] = labels
# Seeded shuffle so the train/test split (and the printed score) is the same on
# every run; the score saved with this notebook came from an unseeded shuffle
# and will therefore differ slightly.
data = data.sample(frac=1, random_state=1)

# First 10000 shuffled rows train the model, the remainder is held out.
X_training = data.iloc[:10000]
X_test = data.iloc[10000:]
y_test = X_test['label']

# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
estimator = KMeans(n_clusters=10, max_iter=50000, random_state=1).fit(X_training)
e = estimator.predict(X_test)
print(homogeneity_score(y_test.values, e))

0.5040984761555782


In [12]:
# Testing with the top 10 correlated features for each label. (NO labels YES preprocessing)

# Run the costly preprocessing once. The feature frame is never mutated below,
# so it does not have to be rebuilt before clustering.
data, labels = preprocessing()

# Positional indices of the selected feature columns (a set, despite the name,
# so a feature picked for several label files is kept only once).
top_10_array = set()

for i in range(10):
    # First column of the i-th per-label file; rows are matched to `data` by index.
    class_labels = pd.read_csv(f'data/y_train_smpl_{i}.csv').iloc[:, 0]

    # Absolute Pearson correlation of every feature column with that label
    # column, re-indexed by column position.
    abs_corr = data.corrwith(class_labels).abs().reset_index(drop=True)

    # Constant columns have an undefined (NaN) correlation; drop them so they
    # can neither be selected nor disturb the ranking.
    strongest = abs_corr.dropna().nlargest(10)
    top_10_array.update(int(position) for position in strongest.index)

# Sorted positions give the same column order on every run.
data_top_10 = data.iloc[:, sorted(top_10_array)].copy(deep=True)

# Keep the (1, n_samples) layout used below; ndarray.reshape avoids the
# `newshape=` keyword of np.reshape, which recent NumPy releases deprecate.
labels = labels.values.reshape(1, -1)
# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
estimator = KMeans(n_clusters=10, max_iter=50000, init='random', random_state=1).fit(data_top_10)
print(homogeneity_score(labels[0], estimator.labels_))

0.5795515321252357
