# Clusterization methods
<h3> Plan </h3>

In [None]:
import os
import time
import warnings
import datetime
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import cross_val_score

import pylab

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap


from IPython.display import Image, SVG

%matplotlib inline


**Questions**
* What is clusterization?
* What are the main steps of K-Means?

## k-means demo
<a href='https://www.naftaliharris.com/blog/visualizing-k-means-clustering/'> k-means </a>

## DBSCAN demo
<a href='https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/'> DBSCAN </a>


---------
<h1 align="center"> K-Means </h1>

**k-means steps:**
 - 1. Update cluster assignments (reassign each object to its nearest center):
     ## $$ y_i := \arg\min\limits_{y\in Y} \rho (x_i ; \mu_y),~~i = 1,\dots,\ell;$$
 - 2. Update cluster centers (each center becomes the mean of the objects assigned to it):
##    $$ \mu_{yj} := \frac{\sum_{i=1}^\ell [y_i = y]\cdot f_j(x_i)} {\sum_{i=1}^\ell[y_i = y]}$$



In [None]:
class KMeans():
    '''
    Small K-Means implementation used for the 2-D demo plots.

    Attributes:
    K - number of clusters
    N - number of samples
    X - samples, indexed as X[:, 0] / X[:, 1] when plotting
    mu - list of current cluster centers (None until centers are chosen)
    oldmu - centers from the previous iteration (None before the first one)
    clusters - dict {cluster index: list of samples assigned to it}
    method - name of the initialization used by the last find_centers call
    '''
    def __init__(self, K, X=None, N=0):
        '''
        K - number of clusters
        X - dataset (if X is None then X is generated from gauss distribution)
        N - a number of samples to generate if X is None

        Raises ValueError when neither X nor a non-zero N is given.
        '''
        self.K = K
        if X is None:
            if N == 0:
                # Adjacent literals instead of a backslash continuation, so the
                # message does not contain the source indentation.
                raise ValueError("If no data is provided, "
                                 "a parameter N (number of points) is needed")
            self.N = N
            self.X = self._init_board_gauss(N, K)
        else:
            self.X = X
            self.N = len(X)

        # initialization
        self.mu = None # a list of centers of clusters
        self.oldmu = None # centers of clusters from the previous iteration
        self.clusters = None # labels of samples
        self.method = None # method for sampling initial centers of clusters

    def _init_board_gauss(self, N, k):
        '''
        N - a number of samples to generate
        k - a number of clusters

        Returns an array of N points drawn from k gaussian blobs whose
        centers are uniform in [-1, 1] x [-1, 1].
        '''
        per_cluster = float(N) / k
        X = []
        for _ in range(k):
            c = (random.uniform(-1, 1), random.uniform(-1, 1))
            s = random.uniform(0.05, 0.15)
            x = []
            while len(x) < per_cluster:
                a = np.random.normal(c[0], s)
                b = np.random.normal(c[1], s)
                # Keep only points whose BOTH coordinates lie inside (-1, 1);
                # `abs(a) and abs(b) < 1` never bounded the first coordinate.
                if abs(a) < 1 and abs(b) < 1:
                    x.append([a, b])
            X.extend(x)
        # Each blob may overshoot by a fraction of a point; trim to exactly N.
        return np.array(X)[:N]

    def plot_board(self, fig_size=(10,7) ):
        '''
        Plot the samples (coloured by cluster once centers and clusters
        exist, plain otherwise) and save the figure as
        kpp_N<N>_K<K>.png in the working directory.

        fig_size - matplotlib figure size
        '''
        X = self.X
        plt.figure(figsize = fig_size)
        plt.xlim(-1,1)
        plt.ylim(-1,1)
        if self.mu is not None and self.clusters:
            # plt.get_cmap is used because plt.cm.get_cmap is gone from
            # recent matplotlib releases.
            cmap = plt.get_cmap("Spectral")
            for m, members in self.clusters.items():
                cs = cmap(1.*m/self.K)
                # A single '*' format string: star marker, no connecting line
                # (the old call passed both 'o' and marker='*').
                plt.plot(self.mu[m][0], self.mu[m][1], '*', \
                         markersize=20, color=cs)
                plt.plot([x[0] for x in members], [x[1] for x in members], '.', \
                         markersize=8, color=cs, alpha=0.5)
        else:
            plt.plot(X[:,0], X[:,1], '.', alpha=0.5)
        if self.method == '++':
            tit = 'K-means++'
        else:
            tit = 'K-means with random initialization'
        pars = 'N=%s, K=%s' % (str(self.N), str(self.K))
        plt.title('\n'.join([pars, tit]), fontsize=16)
        plt.savefig('kpp_N%s_K%s.png' % (str(self.N), str(self.K)), \
                    bbox_inches='tight', dpi=200)

    def _reevaluate_centers(self):
        '''
        Maximization step in Kmeans: every center becomes the mean of the
        samples assigned to it; a cluster with no samples keeps its center.
        '''
        newmu = []
        for k in sorted(self.clusters.keys()):
            members = self.clusters[k]
            if len(members) > 0:
                newmu.append(np.mean(members, axis = 0))
            else:
                newmu.append(self.mu[k]) # Not update cluster
        self.mu = newmu

    def _cluster_points(self):
        '''
        expectation step in Kmeans: assign every sample to the cluster whose
        center is nearest in euclidean distance (ties go to the lowest index)
        '''
        mu = self.mu
        # Pre-create every key so empty clusters are present (as empty lists)
        # and cluster indices always line up with the indices of self.mu.
        clusters = {k: [] for k in range(len(mu))}
        for x in self.X:
            bestmukey = min(range(len(mu)),
                            key=lambda i: np.linalg.norm(np.subtract(x, mu[i])))
            clusters[bestmukey].append(x)
        self.clusters = clusters

    def find_centers(self, method='random', max_iter=300):
        '''
        Run K-Means until the centers stop changing.

        method - 'random' draws K distinct samples as initial centers;
                 '++' keeps the centers already stored in self.mu
        max_iter - upper bound on the number of iterations; protects against
                   an endless loop when two centers coincide

        Raises ValueError if method is '++' and no centers were initialized.
        '''
        self.method = method
        if method != '++':
            # Initialize to K random centers
            self.mu = random.sample(list(self.X), self.K)
        elif self.mu is None:
            raise ValueError("method='++' needs initial centers in self.mu "
                             "(call init_centers() first)")
        # Always perform at least one assignment/update pass, so self.clusters
        # is populated even if the initial centers happen to be a fixed point.
        for _ in range(max_iter):
            self.oldmu = self.mu # remember previous cluster centers
            # Assign all points in X to clusters
            self._cluster_points()
            # Reevaluate centers
            self._reevaluate_centers()
            if self._has_converged():
                break

    def _has_converged(self):
        '''
        condition of convergence of cluster points: the set of centers did
        not change during the last iteration and all centers are distinct
        '''
        if self.oldmu is None or self.mu is None:
            return False
        K = len(self.oldmu)
        previous = set(tuple(a) for a in self.oldmu)
        current = set(tuple(a) for a in self.mu)
        return previous == current and len(current) == K


In [None]:
# No dataset is passed, so 200 points are generated; cluster them into K=3
# groups with the default (random) initialization and plot the result.
kmeans = KMeans(K=3, N=200)
kmeans.find_centers()
kmeans.plot_board(fig_size=(10,7))

In [None]:
# Re-run on the same points: find_centers() draws new random initial centers,
# so the resulting clusters may differ from the previous cell.
kmeans.find_centers()
kmeans.plot_board(fig_size=(15,4))

### Improve to Kmeans++

In [None]:
class KPlusPlus(KMeans):
    '''
    KMeans with k-means++ seeding: init_centers() picks the first center
    uniformly at random and every following one with probability
    proportional to the squared distance to the nearest chosen center.
    '''
    def _dist_from_centers(self):
        '''
        Store in self.D2 the squared distance from every sample to its
        nearest center among those currently in self.mu.
        '''
        cent = self.mu
        X = self.X
        self.D2 = np.array([min(np.linalg.norm(x - c) ** 2 for c in cent)
                            for x in X])

    def _choose_next_center(self):
        '''
        Draw one sample with probability proportional to self.D2 and
        return it.
        '''
        total = self.D2.sum()
        if total > 0:
            self.probs = self.D2 / total
        else:
            # Every sample coincides with a center already: fall back to a
            # uniform draw instead of dividing by zero.
            self.probs = np.full(len(self.D2), 1.0 / len(self.D2))
        self.cumprobs = self.probs.cumsum()
        # r must cover the whole [0, 1) range; drawing it from [0.5, 1) made
        # samples in the upper half of the cumulative table over-represented.
        r = random.random()
        # side='right' -> first index whose cumulative probability exceeds r,
        # so zero-probability samples (the existing centers) are never picked.
        ind = int(np.searchsorted(self.cumprobs, r, side='right'))
        # Guard against the last cumulative value rounding to just below 1.
        ind = min(ind, len(self.cumprobs) - 1)
        return(self.X[ind])

    def init_centers(self):
        '''
        Fill self.mu with K initial centers chosen by k-means++ seeding.
        '''
        self.mu = random.sample(list(self.X), 1)
        while len(self.mu) < self.K:
            self._dist_from_centers()
            self.mu.append(self._choose_next_center())

    def plot_init_centers(self, fig_size = (10,7)):
        '''
        Plot the samples with the current centers in red and save the
        figure as kpp_init_N<N>_K<K>.png in the working directory.

        fig_size - matplotlib figure size (previously ignored)
        '''
        X = self.X
        plt.figure(figsize=fig_size)
        plt.xlim(-1,1)
        plt.ylim(-1,1)
        plt.plot(X[:,0], X[:,1], '.', alpha=0.5)
        plt.plot([x[0] for x in self.mu], [x[1] for x in self.mu], 'ro')
        plt.savefig('kpp_init_N%s_K%s.png' % (str(self.N),str(self.K)), \
                    bbox_inches='tight', dpi=200)

In [None]:
# 400 generated points, K=5. Nothing has been fitted yet, so plot_board
# draws the raw, uncoloured points.
kplusplus = KPlusPlus(K=5, N=400)
kplusplus.plot_board(fig_size = (15,5))

In [None]:
# Random initialization: cluster and plot
kplusplus.find_centers(method='random')
kplusplus.plot_board(fig_size = (15,3))
# k-means++ initialization: choose the seeds, show them, then run k-means
# starting from those seeds (method='++' keeps the centers set by init_centers)
kplusplus.init_centers()
kplusplus.plot_init_centers(fig_size=(10,3))
kplusplus.find_centers(method='++')
kplusplus.plot_board(fig_size = (15,3))

-------
<h1 align="center">Text clusterization</h1> 

## Sample

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the 'train' subset of 20 newsgroups and list its category names.
train_all = fetch_20newsgroups(subset='train')
print (train_all.target_names)

In [None]:
# "Simple" dataset: three topically unrelated categories
# (Mac hardware, Christianity, hockey).
simple_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'soc.religion.christian', 'rec.sport.hockey'])

In [None]:
# Raw text of the first document.
print (simple_dataset.data[0])

In [None]:
# Raw text of the last document.
print (simple_dataset.data[-1])

In [None]:
# Raw text of the second-to-last document.
print (simple_dataset.data[-2])

In [None]:
# Number of documents in the three-category subset.
print (len(simple_dataset.data))

### Extract features from text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the documents into a TF-IDF matrix (one row per document); terms are
# filtered by the max_df=500 / min_df=10 document-frequency limits.
vectorizer = TfidfVectorizer(max_df=500, min_df=10)
matrix = vectorizer.fit_transform(simple_dataset.data)
# Displayed as the cell output: (number of rows, number of columns).
matrix.shape

## AgglomerativeClustering, Neighbour joining

In [None]:
# Import from the public package: the private module path
# sklearn.cluster.hierarchical no longer exists in recent scikit-learn.
from sklearn.cluster import AgglomerativeClustering

# The distance keyword was renamed from `affinity` to `metric`; try the new
# name first and fall back for older scikit-learn releases that reject it.
try:
    model = AgglomerativeClustering(n_clusters=3, metric='cosine', linkage='complete')
except TypeError:
    model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
# Fit on the densified matrix and get one cluster id per document.
preds = model.fit_predict(matrix.toarray())

In [None]:
# Cluster ids assigned to the first five documents.
print(list(preds)[:5])

In [None]:
# Feature row of the first document.
print(matrix[0])

In [None]:
# Vocabulary terms, one per matrix column. get_feature_names() was removed in
# newer scikit-learn, so prefer get_feature_names_out() when it exists.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, 'get_feature_names_out')
 else vectorizer.get_feature_names())

In [None]:
# Term stored at column 877. get_feature_names() was removed in newer
# scikit-learn, so prefer get_feature_names_out() when it exists.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, 'get_feature_names_out')
 else vectorizer.get_feature_names())[877]

In [None]:
# First document again, displayed as the cell output (repr, escapes visible).
simple_dataset.data[0]

In [None]:
# Ground-truth category index of every document.
simple_dataset.target

In [None]:
# Cluster id assigned to every document by the clustering model.
preds

In [None]:
# Assessment: cluster ids are arbitrary, so relabel them with one hand-picked
# cluster -> class mapping before measuring accuracy against the true labels.
mapping = {0: 0, 1: 2, 2: 1}
mapped_preds = [mapping[cluster_id] for cluster_id in preds]
accuracy = acc(mapped_preds, simple_dataset.target)
print(accuracy)

In [None]:
import itertools
def validate_with_mappings(preds, target, labels=(0, 1, 2)):
    '''
    Print the accuracy of preds against target for every possible
    relabeling of the cluster ids.

    Cluster ids are arbitrary, so each permutation of labels is tried as a
    cluster -> class mapping; one accuracy (fraction of equal entries) is
    printed per permutation, in itertools.permutations order.

    preds - predicted cluster id for every sample
    target - true class id for every sample
    labels - ids used by both preds and target; the default keeps the
             original three-cluster behaviour
    '''
    preds = np.asarray(preds).tolist()
    target = np.asarray(target)
    # The i-th element of a permutation is assigned to the i-th label counted
    # from the end, i.e. {2: a, 1: b, 0: c} for the default labels.
    keys = tuple(reversed(labels))
    for permutation in itertools.permutations(labels):
        mapping = dict(zip(keys, permutation))
        mapped_preds = np.array([mapping[pred] for pred in preds])
        print(float(np.mean(mapped_preds == target)))
# Print one accuracy per cluster-id relabeling for the current predictions.
validate_with_mappings(preds, simple_dataset.target)

## KMeans

In [None]:
# NOTE: this import rebinds the name KMeans and shadows the hand-written
# KMeans class defined earlier in this notebook; re-run that cell before
# using the hand-written class again.
from sklearn.cluster import KMeans

# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, which would change the result and emit warnings.
model = KMeans(n_clusters=3, n_init=10, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (simple_dataset.target)
validate_with_mappings(preds, simple_dataset.target)

In [None]:
# Compare with a supervised baseline: logistic regression, scored by the
# mean of its cross-validation scores on the same features.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
print (cross_val_score(clf, matrix, simple_dataset.target).mean())

**Question:** K-Means reaches very high quality here, almost as high as the supervised algorithm. Why?

## More complex dataset

In [None]:
# Harder dataset: three computer-related categories.
noteasy_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'comp.graphics'])
# Re-fit the same vectorizer on the new documents; `matrix` is overwritten.
matrix = vectorizer.fit_transform(noteasy_dataset.data)

In [None]:
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, which would change the result and emit warnings.
model = KMeans(n_clusters=3, n_init=10, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (noteasy_dataset.target)
validate_with_mappings(preds, noteasy_dataset.target)

In [None]:
# Supervised baseline on the harder dataset: mean cross-validation score.
clf = LogisticRegression()
print (cross_val_score(clf, matrix, noteasy_dataset.target).mean())

## SVD + KMeans

In [None]:
from sklearn.decomposition import TruncatedSVD

# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, which would change the result and emit warnings.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
# Reduce the TF-IDF matrix to 1000 components before clustering.
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

In [None]:
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, which would change the result and emit warnings.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
# Same pipeline with a stronger reduction: 200 components.
svd = TruncatedSVD(n_components=200, random_state=321)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

# Quality of clusterization
--------

### Homogeneity:  each cluster contains only members of a single class

### Completeness: all members of a given class are assigned to the same cluster

### V-measure:
### $$v = 2 \cdot \frac{(homogeneity \cdot completeness)}{ (homogeneity + completeness)}$$

In [None]:
from sklearn.metrics.cluster import completeness_score, homogeneity_score, v_measure_score

# Printed in this order: completeness, homogeneity, V-measure; each compares
# the true classes of the harder dataset with the latest cluster ids.
for score_fn in (completeness_score, homogeneity_score, v_measure_score):
    print(score_fn(noteasy_dataset.target, preds))

### Results

1. Good results for both text datasets
2. On easy data clusterization methods work well