In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform


In [2]:
# Load the data
features_raw = pd.read_csv('../dataset/x_train_T9QMMVq.csv',index_col=False).drop('ID',axis=1)
labels_raw = pd.read_csv('../dataset/y_train_R0MqWmu.csv').drop('ID', axis=1)

In [4]:
print(features_raw.shape)
print(labels_raw.shape)

(202933, 13)
(202933, 23)


## Seeking structure on the input

#### Study the input correlation matrix

In [5]:
corr_features = features_raw.corr()

In [None]:
f,ax = plt.subplots(1,2,figsize=(10,5))
sns.heatmap(corr_features,ax=ax[0])#, annot=True)
sns.heatmap(np.log10(corr_features),ax=ax[1])#, annot=True)
ax[0].set_title('Feature correlation')
ax[1].set_title('Feature log correlation')
f.tight_layout()

for subplot_ax in ax:
    for spine in subplot_ax.spines.values():
        spine.set_visible(True)

#f.savefig('new_corr_plots.png',bbox_inches='tight')

#### Analysis
It seems that there is block structure in the input, that may be useful to diagonalize to extract reduced dimensionality features.

We do a hierarchical clustering to be sure that the blocks that we see are the best blocks possible

In [7]:
features_subset = features_raw.sample(n=10000, random_state=12) #We take a random subset of the dataset to be able to do clustering
features_subset.shape
subset_correlation = features_subset.corr()
distance_matrix = np.ones(np.shape(subset_correlation))-subset_correlation

In [8]:
silhouette_values = []
range_nclusters = range(2,10)
features_subset_np = features_subset.to_numpy().T
for i in range_nclusters:
    hie_clusterer = AgglomerativeClustering(n_clusters=i, metric='precomputed',linkage='complete')
    labels = hie_clusterer.fit_predict(distance_matrix)
    sil = silhouette_score(features_subset_np, labels)
    print(labels)
    silhouette_values.append((i,sil))


silhouette_values = np.array(silhouette_values)

[0 1 1 1 1 0 0 0 0 1 1 1 1]
[2 0 0 0 0 1 1 1 1 0 0 0 0]
[2 3 3 3 3 1 1 1 1 0 0 0 0]
[2 3 3 3 3 1 1 1 1 0 0 0 4]
[5 3 3 3 3 0 0 0 0 1 1 2 4]
[5 3 3 3 3 1 1 2 2 0 0 6 4]
[5 1 1 1 1 0 0 2 2 7 3 6 4]
[5 0 0 0 0 8 4 2 2 7 3 6 1]


In [9]:
silhouette_values

array([[ 2.00000000e+00,  1.73405107e-01],
       [ 3.00000000e+00, -1.48040598e-01],
       [ 4.00000000e+00,  4.76326722e-02],
       [ 5.00000000e+00,  7.78797145e-03],
       [ 6.00000000e+00,  1.74120332e-03],
       [ 7.00000000e+00,  9.37109353e-02],
       [ 8.00000000e+00,  1.62788013e-01],
       [ 9.00000000e+00,  1.40394738e-01]])

In [None]:
plt.plot(silhouette_values[:,0],silhouette_values[:,1])

In [11]:
dist_squareform = squareform(distance_matrix)
linked = linkage(dist_squareform,method='complete')

In [None]:
f = plt.figure(figsize=(10, 8))
ax = f.add_axes((0.1,0.1,0.8,0.8))
dendrogram(linked,
            orientation='top',
            labels=subset_correlation.columns.to_list(),
            distance_sort='descending',
            show_leaf_counts=True,
            ax=ax)
ax.set_title('Dendrogram for feature clustering', fontsize=16)
ax.set_ylabel('Distance', fontsize=12)
ax.set_xlabel('Features', fontsize=12)
ax.grid(axis='y')
f.tight_layout()
#f.savefig('dendogram_hierarchical.png')


In [13]:
f_np = features_raw.to_numpy()

In [14]:
f_np

array([[ 9.81599658e-02, -1.75981481e-01, -8.64688889e-02, ...,
         1.01300691e+00,  1.00056313e+00,  9.99396645e-01],
       [ 3.07296818e-04, -6.64161682e-02,  3.60711506e-02, ...,
         9.96735437e-01,  1.00222580e+00,  1.01306334e+00],
       [ 3.88056025e-04,  1.90942530e-01,  1.87539757e-01, ...,
         9.05275348e-01,  9.53599555e-01,  9.86346505e-01],
       ...,
       [ 9.73470629e-02, -2.28548485e-02,  2.44727273e-04, ...,
         1.00406630e+00,  1.00093355e+00,  1.00078816e+00],
       [ 3.37148531e-04, -7.68181451e+00, -7.37704766e+00, ...,
         2.71207186e-01,  9.95494955e-01,  1.00985212e+00],
       [ 5.29399125e-01,  7.15875424e-01,  1.83415254e-02, ...,
         2.30007092e-01,  3.17572598e-01,  8.86747051e-01]],
      shape=(202933, 13))

In [20]:
from sklearn.model_selection import KFold

In [21]:
kfolder = KFold(n_splits=4)


In [36]:
x_data = 10+ np.arange(0,12)
y_data = x_data ** 2

In [41]:
for i,(train_idxs, test_idxs) in enumerate(kfolder.split(x_data)):
    print(i)
    print(train_idxs, test_idxs)
    print('train set:', x_data[train_idxs])
    print('test set:', x_data[test_idxs])

0
[ 3  4  5  6  7  8  9 10 11] [0 1 2]
train set: [13 14 15 16 17 18 19 20 21]
test set: [10 11 12]
1
[ 0  1  2  6  7  8  9 10 11] [3 4 5]
train set: [10 11 12 16 17 18 19 20 21]
test set: [13 14 15]
2
[ 0  1  2  3  4  5  9 10 11] [6 7 8]
train set: [10 11 12 13 14 15 19 20 21]
test set: [16 17 18]
3
[0 1 2 3 4 5 6 7 8] [ 9 10 11]
train set: [10 11 12 13 14 15 16 17 18]
test set: [19 20 21]


In [60]:
kfolder = KFold(n_splits=4)
kf_split = kfolder.split(y_data)

In [64]:
kf_split.__next__()

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([ 9, 10, 11]))

In [None]:
#playing with a possbile input format for the config

SEARCH_SPACE_CONFIG = {
    'lgbm': {
        'n_estimators': {'type': 'int', 'range': [100, 1000]},
        'learning_rate': {'type': 'float', 'range': [0.01, 0.3], 'kwargs': {'log': True}},
        'max_depth': {'type': 'int', 'range': [3, 10]},
        'num_leaves': {'type': 'int', 'range': [20, 300]}
    },
    'svr': {
        'kernel': {'type': 'categorical', 'choices': ['linear', 'rbf', 'poly']},
        'C': {'type': 'float', 'range': [1e-4, 1e4], 'kwargs': {'log': True}},
        'gamma': {'type': 'float', 'range': [1e-4, 1e2], 'kwargs': {'log': True}}
    }
}


In [21]:
SEARCH_SPACE_CONFIG['lgbm'].items()

dict_items([('n_estimators', {'type': 'int', 'range': [100, 1000]}), ('learning_rate', {'type': 'float', 'range': [0.01, 0.3], 'kwargs': {'log': True}}), ('max_depth', {'type': 'int', 'range': [3, 10]}), ('num_leaves', {'type': 'int', 'range': [20, 300]})])

In [16]:
a

dict_items([('n_estimators', {'type': 'int', 'range': [100, 1000]}), ('learning_rate', {'type': 'float', 'range': [0.01, 0.3], 'kwargs': {'log': True}}), ('max_depth', {'type': 'int', 'range': [3, 10]}), ('num_leaves', {'type': 'int', 'range': [20, 300]})])