# How to cluster the datasets?
According to some kernels like [here](https://www.kaggle.com/erikgarcia/data-is-an-hypersphere), the target of a transaction depends on the Euclidean distance. I want to find out whether there are any significant clusters in the 200 dimensions.
Because of the size of the data, there are only a few clustering methods you can use. <br>
At the end of this kernel the new features are saved and can be used for training a model.

## In this kernel I will use an efficient implementation of [DBSCAN](https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html). 

In [None]:
!pip install hdbscan

In [None]:
import hdbscan

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, DBSCAN, KMeans, FeatureAgglomeration
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import gc
import time
gc.enable()
from numba import jit

In [None]:
%%time
# Load the raw competition files.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Keep the test IDs and the training labels as plain arrays before the
# frames are narrowed down to the feature columns.
X_test_ID = test["ID_code"].values
y = train["target"].values

# Feature columns are the ones whose name contains "var".
cols = [name for name in test.columns if "var" in name]
train = train[[*cols, "target"]]
test = test[cols]
train.shape

In [None]:
# Stack train on top of test so both are standardised with the same
# statistics. Only the `cols` features are scaled; any other column of
# `df` is left untouched.
df = pd.concat([train, test], axis=0)
scaler = StandardScaler()
scaler.fit(df[cols])
df[cols] = scaler.transform(df[cols])

# KMeans

In [None]:
%%time
# Fit KMeans on the feature columns of `df` for several cluster counts and
# store the resulting labels as new features kmeans_1 ... kmeans_4.
n_train = train.shape[0]  # `df` stacks train first, so its first n_train rows are train

# NOTE: the label columns below are only valid when the sample spans every
# row of `df`; a smaller sample would yield too few labels for train/test.
for sample_size in [df.shape[0]]:

    for m, n in enumerate([5, 10, 20, 40]):

        n_clusters = n
        start = time.time()
        # precompute_distances, n_jobs and algorithm="auto" are intentionally
        # not passed: current scikit-learn releases reject them, and the first
        # and last were only spelling out the defaults anyway.
        cluster = KMeans(n_clusters=n_clusters, init="k-means++", n_init=1, max_iter=200, tol=0.0001,
                         verbose=0, random_state=None, copy_x=True)

        cluster.fit(df.head(sample_size)[cols])
        end = time.time()
        # Scale the ratio by 100 so the number agrees with the "%" in the message.
        print("{:.2f} Seconds | Clusters: {} | Sample size: {:.2f}% of total.".format(end-start, n_clusters, 100*sample_size/df.shape[0]))
        y_pred = cluster.labels_

        # Split the labels back into the train part and the test part.
        train[f"kmeans_{m+1}"] = y_pred[:n_train]
        test[f"kmeans_{m+1}"] = y_pred[n_train:]
        del cluster
        gc.collect()

## KMeans simply splits the data into n clusters of almost equal size, but the frequency of the target is higher in some clusters.

In [None]:
# Histogram of the kmeans_3 cluster labels, split by target value, to see
# whether some clusters contain relatively more positives.
# The two series get descriptive names so that the notebook-level `y`
# (the training targets) is not overwritten by plotting data.
kmdf = train[["kmeans_3", "target"]].sort_values(by="target")
plt.figure(figsize=(12,8))
labels_target0 = kmdf[kmdf.target==0].kmeans_3
labels_target1 = kmdf[kmdf.target==1].kmeans_3
plt.hist([labels_target0, labels_target1], bins=60, label=["target = 0", "target = 1"]);
plt.legend(loc='upper right')
plt.show()

In [None]:
# Pair plots over the first three features: first only the rows whose
# kmeans_3 label is 18, coloured by target; then all training rows,
# coloured by their kmeans_1 label.
kmdf = train.loc[train["kmeans_3"] == 18]
head_vars = cols[:3]

sns.pairplot(kmdf[head_vars + ["target"]], hue="target", vars=head_vars);
sns.pairplot(train[head_vars + ["kmeans_1"]], hue="kmeans_1", vars=head_vars);

# Now GMM
## GMM works like KMeans but takes the density into account
The computation time increases with the number of clusters.

In [None]:
%%time
# Fit a Gaussian mixture on the feature columns of `df` for several component
# counts and store the predicted component per row as gmm_1 ... gmm_3.
n_train = train.shape[0]  # `df` stacks train first, so its first n_train rows are train

# NOTE: the label columns below are only valid when the sample spans every
# row of `df`; a smaller sample would yield too few labels for train/test.
for sample_size in [int(df.shape[0]*1)]:
    for m, n in enumerate([5, 10, 20]):

        n_clusters = n
        start = time.time()
        cluster = GaussianMixture(n_components=n_clusters, covariance_type="full", tol=0.001, reg_covar=1e-06,
                                  max_iter=100, n_init=1, init_params="kmeans", weights_init=None,
                                  means_init=None, precisions_init=None, random_state=None,
                                  warm_start=False, verbose=0, verbose_interval=10)

        # fit_predict returns the component label of every fitted row.
        y_pred = cluster.fit_predict(df.head(sample_size)[cols])
        end = time.time()
        # Scale the ratio by 100 so the number agrees with the "%" in the message.
        print("{:.2f} Seconds | Clusters: {} | Sample size: {:.2f}% of total.".format(end-start, n_clusters, 100*sample_size/df.shape[0]))

        # Split the labels back into the train part and the test part.
        train[f"gmm_{m+1}"] = y_pred[:n_train]
        test[f"gmm_{m+1}"] = y_pred[n_train:]
        del cluster
        gc.collect()

In [None]:
# Histogram of the gmm_3 component labels, split by target value.
# The two series get descriptive names so that the notebook-level `y`
# (the training targets) is not overwritten by plotting data.
kmdf = train[["gmm_3", "target"]].sort_values(by="target")
plt.figure(figsize=(12,8))
labels_target0 = kmdf[kmdf.target==0].gmm_3
labels_target1 = kmdf[kmdf.target==1].gmm_3
plt.hist([labels_target0, labels_target1], bins=30, label=["target = 0", "target = 1"]);
plt.legend(loc='upper right')
plt.show()

In [None]:
# Pair plots over the first three features: first only the rows whose
# gmm_1 label is 3, coloured by target; then all training rows, coloured
# by their gmm_1 label.
kmdf = train.loc[train["gmm_1"] == 3]
head_vars = cols[:3]

sns.pairplot(kmdf[head_vars + ["target"]], hue="target", vars=head_vars);
sns.pairplot(train[head_vars + ["gmm_1"]], hue="gmm_1", vars=head_vars);

# FeatureAgglomeration and hdbscan

In [None]:
%%time
# Merge the feature columns of `df` into 4 agglomerated features and store
# them on train and test as fagg_1 ... fagg_4.
# `affinity` is intentionally not passed: recent scikit-learn releases removed
# that argument (renamed to `metric`), and euclidean is the default distance
# under either name.
agglo = FeatureAgglomeration(compute_full_tree='auto',
                             connectivity=None, linkage='complete', memory=None, n_clusters=4)
agglo.fit(df[cols])
reduced = agglo.transform(df[cols])

n_train = train.shape[0]  # `df` stacks train first, so its first n_train rows are train
for m in range(reduced.shape[-1]):

    # One new column per agglomerated feature, split back into train/test rows.
    train[f"fagg_{m+1}"] = reduced[:n_train, m]
    test[f"fagg_{m+1}"] = reduced[n_train:, m]

In [None]:
# Pair plot of every agglomerated ("fagg") feature in train, coloured by target.
fagg_cols = [name for name in train.columns if "fagg" in name]

sns.pairplot(train[[*fagg_cols, "target"]], hue="target", vars=fagg_cols);

In [None]:
# Persist both frames, including the new cluster features, for later training.
# NOTE(review): despite "scaled" in the file names, the scaler in this notebook
# is applied to `df` only; `train`/`test` appear to keep the raw feature
# values — confirm this is intended.
for frame, path in ((train, "train_scaled_clustered.pkl"),
                    (test, "test_scaled_clustered.pkl")):
    frame.to_pickle(path)

## TODO hdbscan
DBSCAN needs more carefully selected parameters to get useful results

<font color=blue>You are welcome to take the code or the transformed datasets. If you find something interesting about the created features or you know a fast clustering method I should test, let me know.</font>

In [None]:
# Disabled draft for the hdbscan TODO: a factory that would build an
# hdbscan.HDBSCAN clusterer with a configurable `alpha` and every other
# setting fixed as listed below. Kept commented out; nothing here runs.
# def create_clusterer(alpha=0.5):
#     clusterer = hdbscan.HDBSCAN(algorithm='prims_kdtree', allow_single_cluster=False, alpha=alpha,
#                             approx_min_span_tree=True, cluster_selection_method='eom',
#                             core_dist_n_jobs=-1, gen_min_span_tree=False, leaf_size=40,
#                             match_reference_implementation=False, metric='euclidean', 
#                             min_cluster_size=7, min_samples=None, p=None,
#                             prediction_data=False)
#     return clusterer

In [None]:
# Disabled draft for the hdbscan TODO: would fit the clusterer on `reduced`
# for each alpha / sample size, time the fit, print the label distribution
# and store the labels as hdbscan_* columns. Kept commented out.
# NOTE(review): before enabling, `m` is never assigned in this draft (it would
# reuse whatever an earlier loop left behind), and the train/test split is
# hard-coded to 200000 rows.
# t = []
# sizes = [reduced.shape[0]] # 
# for alpha in [0.5]:

#     for sample_size in sizes:
#         clusterer = create_clusterer(alpha)
#         gc.collect()
#         start = time.time()
#         clusterer.fit(reduced[:sample_size])
#         end = time.time()
#         t.append(end-start)
#         dist = np.unique(clusterer.labels_, return_counts=True)
#         dist = dist[-1]/dist[-1].sum()
#         print("Sample Size: {} | Time: {:.2f} Seconds| N_Clusters: {} | alpha: {}".format(sample_size, (end-start), dist, alpha))
        
#         y_pred = clusterer.labels_
#         train[f"hdbscan_{m+1}"] = y_pred[:200000]
#         test[f"hdbscan_{m+1}"] = y_pred[200000:]
