## Clustering each segment using metafeatures in VSM

In [1]:
from __future__ import division
%matplotlib inline
%pylab inline
import csv
import sys
import os
import traceback
import json
import pickle
import numpy as np
import pandas as pd
import scipy.io as sio
import seaborn as sns
import itertools
import scipy.fftpack as fft
from scipy.stats import norm, skew, kurtosis
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from collections import defaultdict, Counter, OrderedDict
from helper_functions import getListOfFiles, getCSV, getStatistics, remap_interval
from operator import itemgetter

Populating the interactive namespace from numpy and matplotlib


## Auxiliary functions

In [2]:
class MetafeatureVector():
    """Upon call to store, organize the data in 
    a dictionary. Convenient for using with csv.DictWriter."""
    def __init__(self):
        self.rows = defaultdict(dict)

    def store(self,d,func):
        """Each new value of a function is stored in a list
        for the given attibute."""
        for k,v in d.items():
            self.rows[k][func] = float("{:.2f}".format(v))
        
    def getData(self):
        """Return dictionary where each key is the atribute
        and its value a dictionary containing the metafeatures
        calculated"""
        return self.rows

In [3]:
def flatten_dict(dd, separator='.', prefix=''):
    """This function collapses a dictionary into a list, by appending
    the keys' values to themselves. That is, parents(keys) are joined together
    with children (values) by the separator variable.
    dd  :   dictionary to be flattened
    separator   :   the character used to join values to their keys.
    prefix      :   the character used in place of the value.
    """
    return {prefix + separator + k if prefix else k: v
            for kk, vv in dd.items()
            for k, v in flatten_dict(vv, separator, kk).items()
            } if isinstance(dd, dict) else {prefix: dd}

In [4]:
def find_subsequence(seq, subseq):
    target = np.dot(subseq, subseq)
    candidates = np.where(np.correlate(seq,
                                       subseq, mode='valid') == target)[0]
    # some of the candidates entries may be false positives, double check
    check = candidates[:, np.newaxis] + np.arange(len(subseq))
    mask = np.all((np.take(seq, check) == subseq), axis=-1)
    return candidates[mask]

## Meta-features function definition

In [5]:
def mean(data):
    """Calculates mean of the data"""
    return np.mean(data)

def std(data):
    """Calculates the standard deviation"""
    return np.std(data)


def max_value(data):
    """ Calculates Largest value in array"""
    return np.max(data)
    
def min_value(data):
    """Calculates smallest value in array"""
    return np.min(data)

def mad(data, axis=None):
    """ 
    Calculates the median absolute deviation: a "Robust" version of standard deviation.
        Indices variabililty of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation 
    """
    return np.median(np.absolute(data - np.median(data, axis)), axis)


def sma(data):
    """Computes Signal magnitude area.
    http://dsp.stackexchange.com/questions/18649/signal-magnitude-area
    """
    accumulator = np.sum([np.abs(x) for x in data])
    return accumulator / float(len(data))


def energy(data):
    """Energy measure. Sum of the squares divided by the number of values."""
    return np.sum([x**2 for x in data]) / float(len(data))


def iqr(data):
    """Calculates the interquartile range
    http://stackoverflow.com/questions/23228244/how-do-you-find-the-iqr-in-numpy
    """
    return np.subtract(*np.percentile(data, [75, 25]))

def maxInds(data, n_bins=200):
    """Returns the index of the frequency component with largest magnitude"""
    
    mean_sig = np.ones_like(data)*np.mean(data)
    # remove mean of the signal, for better results.
    sig = data - mean_sig
    freqsig = fft.fft(sig,n=n_bins) 
    half_freq_domain = freqsig[:int(n_bins/2)]
    #get max index in the freq domain
    return np.where(np.abs(half_freq_domain)==(max(np.abs(half_freq_domain))))[0][0]

def meanFreq(data, n_bins=200):
    """
    Weighted average of the frequency components to obtain a mean frequency
    http://luscinia.sourceforge.net/page26/page35/page35.html
    """
    mean_sig = np.ones_like(data)*np.mean(data)
    # remove mean of the signal, for better results.
    sig = data - mean_sig
    freqsig = fft.fft(sig,n=n_bins) 
    half_freq_domain = freqsig[:int(n_bins/2)]
    return np.sum(np.abs(half_freq_domain) * range(len(half_freq_domain))) / sum(np.abs(half_freq_domain))

def skewness(data):
    return skew(data)

def kurtos(data):
    return kurtosis(data)

def freq_skewness(data, n_bins=200): 
    """skewness of the frequency domain signal"""
    mean_sig = np.ones_like(data)*np.mean(data)
    # remove mean of the signal, for better results.
    sig = data - mean_sig
    freqsig = fft.fft(sig,n=n_bins) 
    half_freq_domain = freqsig[:int(n_bins/2)]
    return skew(np.abs(half_freq_domain))

def freq_kurtos(data, n_bins=200):
    """kurtosis of the frequency domain signal"""
    mean_sig = np.ones_like(data)*np.mean(data)
    # remove mean of the signal, for better results.
    sig = data - mean_sig
    freqsig = fft.fft(sig,n=n_bins) 
    half_freq_domain = freqsig[:int(n_bins/2)]
    return kurtosis(np.abs(half_freq_domain))

In [6]:
function_dispatcher = {
    "mean"     : mean,
    "std"      : std,
    "max"      : max_value,
    "min"      : min_value,
    "mad"      : mad,
    "sma"      : sma,
    "iqr"      : iqr,
    "energy"   : energy,
    "maxInds"  : maxInds,
    "meanFreq" : meanFreq,
    "skewness" : skewness,
    "kurtosis" : kurtos,
    "freq_skewness" : freq_skewness,
    "freq_kurtosis" : freq_kurtos
}

In [7]:
def get_metafeat_vector(segment, mf=["mean","std","max","min","mad","sma",
                                        "iqr","energy","maxInds","meanFreq","skewness","kurtosis"]):
    """
        Compute metafeatures from segment data.
    
        segment : the time series segment
        mf      : list of metafeatures functions to be computed on the segment data.
    """
    
    meta_vector = OrderedDict()

    for f in mf:
        try:
            func = function_dispatcher[f]          # retrieve function
            meta_vector[f] = func(segment)    # compute function on segment.
        except KeyError:
            raise ValueError('Invalid function: {}'.format(f))

    return meta_vector
    

### Load symbolized segments

In [30]:
mat_dir = './z'

In [31]:
sym_dir = os.path.join(mat_dir, 'symbolization')
files = getListOfFiles(sym_dir, ".mat")
print ">> {} mat Files found!".format(len(files))

>> 29 mat Files found!


In [32]:
sym_segments = []
sym_file_map = []
count = 0
min_length = float('inf')
for f in files:
    mat_content = sio.loadmat(os.path.join(sym_dir, f))
    #print mat_content
    data = mat_content['sym']
    #print data.tolist()[0]
    for d in data.tolist()[0]:
        if d[0].size == 0:
            pass
        else:
            count += 1
            sample = [str(i[0]) for i in d.tolist()]
            if len(sample) < min_length:
                min_length = len(sample)
            sample = ' '.join(map(str, sample)) # separate caracters by space
            sym_segments.append(sample)
            sym_file_map.append('.'.join(f.split('.')[0:2]))
            

assert (len(sym_segments) == count)
print 'N. of segments {}'.format(count)
print 'Min. sample size of {}'.format(min_length)

N. of segments 623
Min. sample size of 50


## Load original recomposed signal

In [158]:
original_files = getListOfFiles(mat_dir, ".mat")
print ">> {} mat Files found!".format(len(files))
originalX = {}
originalY = {}
originalZ = {}
for f in original_files:
    mat_content = sio.loadmat(os.path.join(mat_dir, f))
    originalX[f] = mat_content['x_axis'].tolist()[0]
    originalY[f] = mat_content['y_axis'].tolist()[0]
    originalZ[f] = mat_content['z_axis'].tolist()[0]

original = originalZ 
print '>> N. recovered signals {}'.format(len(originalX))

>> 29 mat Files found!
>> N. recovered signals 29


## Load signal segments files

In [159]:
seg_dir = os.path.join(mat_dir, 'segments')
signal_files = getListOfFiles(seg_dir, ".mat")
print ">> {} mat Files found!".format(len(files))
original_files = []
for f in files:
    original_files.append('.'.join(f.split('.')[0:2]))

>> 29 mat Files found!


## Realign segments

In [160]:
sig_segments = []
sig_file_map = []
succ_alignments = []
segment_fft = []
min_length = float('inf')
N_fft = 50
for f in files:
    mat_content = sio.loadmat(os.path.join(seg_dir, f))
    data = mat_content['seg']
    data = data.tolist()[0]
    file_segments = []
    for d in data:
        if d[0].size == 0:
            pass
        else:
            sample = [i[0] for i in d.tolist()]
            file_segments.append(sample)
            sample_fft = fft.fft(sample, n=N_fft)
            segment_fft.append(np.abs(sample_fft[:len(sample_fft)/2]))
            if len(sample) < min_length:
                min_length = len(sample)
    
    ### aligning ###
    alignment = []
    for i,s in enumerate(file_segments):
        alignment.append((i,find_subsequence(original[f],s)[0]))
    print alignment
    seq_align_indexes = [x[1] for x in alignment]
    print seq_align_indexes
    alignment.sort(key=lambda tup: tup[1])
    print 
    print alignment
    ### testing the realignment ###
    print '-- Checking alignment for sequence...'
    restored = [] 
    for l,i in alignment:
        restored += file_segments[l]
    try:
        assert original[f] == restored
        print '$$ Segments ALIGNED!'
    except AssertionError:
        print '%%%%% File {} FAILED to assert equality for aligned sequence %%%%%'.format(f)
        break
    
    ### saving ###
    for s in range(len(file_segments)):
        sig_segments.append(file_segments[s])
        sig_file_map.append('.'.join(f.split('.')[0:2]))
        succ_alignments.append(seq_align_indexes[s])
    
    
   
    assert len(sig_segments) == len(sig_file_map) == len(succ_alignments)
print 'Files list number: {}'.format(len(sig_segments))
print 'Min. sample size of {}'.format(min_length)

[(0, 863), (1, 1092), (2, 618), (3, 736), (4, 1662), (5, 1591), (6, 1186), (7, 0), (8, 133), (9, 195), (10, 1385), (11, 1454), (12, 1526), (13, 1276), (14, 1332), (15, 268), (16, 358)]
[863, 1092, 618, 736, 1662, 1591, 1186, 0, 133, 195, 1385, 1454, 1526, 1276, 1332, 268, 358]

[(7, 0), (8, 133), (9, 195), (15, 268), (16, 358), (2, 618), (3, 736), (0, 863), (1, 1092), (6, 1186), (13, 1276), (14, 1332), (10, 1385), (11, 1454), (12, 1526), (5, 1591), (4, 1662)]
-- Checking alignment for sequence...
$$ Segments ALIGNED!
[(0, 1189), (1, 1250), (2, 153), (3, 714), (4, 1637), (5, 1510), (6, 1940), (7, 2033), (8, 291), (9, 391), (10, 495), (11, 0), (12, 77), (13, 1342), (14, 1421), (15, 549), (16, 664), (17, 1127), (18, 800), (19, 903)]
[1189, 1250, 153, 714, 1637, 1510, 1940, 2033, 291, 391, 495, 0, 77, 1342, 1421, 549, 664, 1127, 800, 903]

[(11, 0), (12, 77), (2, 153), (8, 291), (9, 391), (10, 495), (15, 549), (16, 664), (3, 714), (18, 800), (19, 903), (17, 1127), (0, 1189), (1, 1250), (13



In [174]:
print succ_alignments[-1]

2137


### Plot signals

In [161]:
def gridOfPlots(data, suptitle="Grid of plot"):
    """Plots the data in a grid of plots.
    Args:
        data (list): the list of data to be used.
        title (str): grid title.
        columnToPlot: the column in the data to be plotted.
    """
    
    grid_side_size = int(round(np.sqrt(len(data))))
    fig, axes = plt.subplots(grid_side_size, grid_side_size, figsize=(18,12))

    count = 0
    for i, row in enumerate(axes):
        for j in range(grid_side_size):
            if count >= len(data):
                fig.delaxes(row[j])
            else:
                row[j].set_title("Plot {}".format(count), fontsize=8, fontweight="bold")
                row[j].set_ylabel('g\'s (9.8 m/s^2)')
                row[j].plot(data[count])
                row[j].grid()
                count += 1

    fig.suptitle(suptitle, fontsize=21)
    fig.subplots_adjust(hspace=0.5, wspace= 0.4)
    plt.draw()

In [162]:
#gridOfPlots(sig_segments[:50])
#gridOfFFT(sig_segments[:50], 50)

## Vectorize segments using metafeatures

In [163]:
## metafeatures list to be computed
#to_compute = ["mean","std","max","min","mad","sma","iqr","maxInds","skewness","kurtosis","freq_skewness","freq_kurtosis"]
#to_compute = ["mean","std","max","min","mad","sma","iqr","skewness","kurtosis"]
to_compute = ["sma", "energy",'iqr']
meta_vectors = []
for s in sig_segments:
    meta_vectors.append(get_metafeat_vector(s, mf=to_compute).values())
#X = np.matrix(meta_vectors)
#X = np.matrix([i + [j] for i,j in zip(segment_fft,[max_value(x)-min_value(x) for x in sig_segments])])
#max_min = np.array([max_value(x)-min_value(x) for x in sig_segments])
#A = np.column_stack((np.array(segment_fft), max_min))
#A = np.column_stack((np.array(A), meta_vectors))
#A = np.column_stack((np.array(segment_fft), meta_vectors))
X=np.matrix(meta_vectors)
#X = np.matrix(A)
print X
print X.shape

[[ 0.12731454  0.02404791  0.1161499 ]
 [ 0.24158104  0.0800407   0.36096191]
 [ 0.22977978  0.07670215  0.29614258]
 ..., 
 [ 0.02743816  0.00082736  0.00915527]
 [ 0.07537607  0.00800643  0.1259613 ]
 [ 0.05595584  0.0049847   0.0748291 ]]
(623, 3)


#### Standard score normalization or Z-normalization:

In [164]:
X_normed = (X - X.mean(axis=0)) / X.std(axis=0)
#print X_normed
X = X_normed

## Perform the Hierarchical Clustering

In [166]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

Z = None
# Check the Cophenetic Correlation Coefficient of the clustering 
# with help of the cophenet() function. This (very very briefly)
# compares (correlates) the actual pairwise distances of all samples 
# to those implied by the hierarchical clustering. The closer the value
# is to 1, the better the clustering preserves the original distances.
for t in ['ward','single', 'complete', 'average']:
    # generate the linkage matrix
    Z = linkage(X, t)
    c, coph_dists = cophenet(Z, pdist(X, 'cosine'))
    print 'Linkage {} has a Cophenetic Correlation Coefficient of {}'.format(t,c)
Z = linkage(X,'ward')

Linkage ward has a Cophenetic Correlation Coefficient of 0.816585932022
Linkage single has a Cophenetic Correlation Coefficient of 0.422826185228
Linkage complete has a Cophenetic Correlation Coefficient of 0.536013708854
Linkage average has a Cophenetic Correlation Coefficient of 0.65008162575


In [167]:
def fancy_dendrogram(*args, **kwargs):
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (FFT[200Fs,50bins],"mean","std","max","min","mad","sma","iqr","skewness","kurtosis")')
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata

In [168]:
fancy_dendrogram(
    Z,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,  # useful in small plots so annotations don't overlap
)
plt.show()

## Get clusters by distance

In [169]:
from scipy.cluster.hierarchy import fcluster
max_d = 6
clusters = fcluster(Z, max_d, criterion='distance')
print Counter(clusters)

Counter({1: 209, 3: 163, 7: 83, 2: 53, 5: 52, 6: 48, 4: 15})


## Get clusters by max number

In [170]:
#from scipy.cluster.hierarchy import fcluster
#k=6
#result = fcluster(Z, k, criterion='maxclust')
#print Counter(result)

## Framming corpus

In [171]:
clusters = clusters.tolist()
ranks = [i for i in range(len(sig_segments))]

df = {'sym_segments':sym_segments,
      'cluster':clusters,
      'rank':ranks,
      'sig_segments':sig_segments,
      'sig_file_map':sig_file_map,
      'sym_file_map':sym_file_map,
      'alignment': succ_alignments
     }

frame = pd.DataFrame(df, index = [clusters] , columns = ['sym_segments','cluster','rank','sig_segments',
                                                         'sig_file_map','sym_file_map', 'alignment'])
assert frame['sig_file_map'].equals(frame['sym_file_map'])
print 'Cluster counts: '
frame['cluster'].value_counts()

Cluster counts: 


1    209
3    163
7     83
2     53
5     52
6     48
4     15
Name: cluster, dtype: int64

In [172]:
%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)

ax.scatter(X[:,0].tolist(), X[:,1].tolist(), X[:,2].tolist(), c=clusters, cmap='prism')  # plot points with cluster dependent colors
plt.show()



## Save to pickle

In [151]:
# open the file for writing
file_name = 'cluster_info_matrix.pkl'
file_object = open(file_name,'wb')
pickle.dump(frame,file_object)
file_object.close()
print 'pickle DONE!'

pickle DONE!


In [95]:
grouped = frame['rank'].groupby(frame['cluster'])
grouped.mean()

cluster
1    404.750000
2    193.550000
3    266.087912
4    328.452830
5    313.370787
6    351.142857
7    447.866667
8    323.500000
9    312.000000
Name: rank, dtype: float64

## Multidimensional scaling

In [57]:
from sklearn.manifold import MDS

MDS()
# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
xs, ys = pos[:, 0], pos[:, 1]

TypeError: float() argument must be a string or a number

## Visualizing document clusters

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'Cluster 1', 
                 1: 'Cluster 2', 
                 2: 'Cluster 3', 
                 3: 'Cluster 4', 
                 4: 'Cluster 5'}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=range(len(sym_segments)))) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

## Save to pickle

In [None]:
# open the file for writing
file_name = 'cluster_info_matrix.pkl'
file_object = open(file_name,'wb')
pickle.dump(frame,file_object)
file_object.close()
print 'pickle DONE!'

## Cluster's members

In [None]:
grid_side_size = 5
fig, axes = plt.subplots(4, grid_side_size, figsize=(20,9))

first_50 = defaultdict(list)
for ind,c in enumerate(clusters):
    axes[0][c].plot(sig_segments[ind])
    axes[0][c].set_title('Cluster {}'.format(c))
    first_50[c].append(sig_segments[ind][:50])
    
for c,l in first_50.iteritems():
    sig = np.matrix(l)
    mean_of_sig = np.mean(sig,axis=0).tolist()[0]
    std_of_sig  = np.std(sig,axis=0).tolist()[0]
    axes[1][c].errorbar(range(len(mean_of_sig)),mean_of_sig,yerr=std_of_sig, fmt='-o', ecolor='r')
    axes[1][c].set_title('Mean of first 50 pts - Cluster {}'.format(c))
    axes[1][c].plot(mean_of_sig, c='m')
    
N_fft = 100 # FFT number of bins
Fs = 100 # Frequence range we are interested

for c,l in first_50.iteritems():
    # getting the signal (the mean value of each 50-length segment in the cluster). Assumed to be the
    # mean of the cluster.
    sig = np.mean(np.matrix(l),axis=0).tolist()[0]
    mean_sig = np.ones_like(sig)*np.mean(sig)
    # remove mean of the signal, for better results.
    sig = sig - mean_sig
    ### FFT
    freqsig = fft.fft(sig, n=N_fft)
    freq_axis = np.arange(0, Fs, Fs / N_fft)
    axes[2][c].plot(freq_axis, np.abs(freqsig), lw=2.0, c='b')
    p = plt.Rectangle((Fs/2, 0), Fs/2, ax.get_ylim()[1], facecolor="grey", fill=True, alpha=0.75, hatch="/", zorder=3)
    axes[2][c].add_patch(p)
    axes[2][c].set_xlim((-2,Fs))
    axes[2][c].set_xlim((-2,Fs))
    axes[2][c].set_title("FFT - Cluster{}".format(c), fontsize=10)
    axes[2][c].set_ylabel('Power')
    axes[2][c].set_xlabel('Frequency (Hz)')
    axes[2][c].legend((p,), ('excluded',))
    axes[2][c].grid()

for cc in range(len(cluster_centers)):
    axes[3][cc].plot(cluster_centers[cc])
    axes[3][cc].set_title('Centroid of cluster {}'.format(cc))
    
fig.tight_layout()
fig.subplots_adjust(hspace=0.8, wspace=0.5)
plt.draw()

## Hierarchical document clustering

Now that I was successfuly able to cluster and plot the documents using k-means, I wanted to try another clustering algorithm. I chose the [Ward clustering algorithm](http://en.wikipedia.org/wiki/Ward%27s_method) because it offers hierarchical clustering. Ward clustering is an agglomerative clustering method, meaning that at each stage, the pair of clusters with minimum between-cluster distance are merged. I used the precomputed cosine distance matrix (*dist*) to calculate a linkage_matrix, which I then plot as a dendrogram. 

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(12, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=range(len(segments)))

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout

#uncomment below to save figure
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

## T-sne

In [None]:
%%time

from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 

ncomps = 10
svd = TruncatedSVD(n_components=ncomps)
svd_fit = svd.fit(smatrix.todense())
Y = svd.fit_transform(smatrix.todense()) 
ax = pd.Series(svd_fit.explained_variance_ratio_.cumsum()).plot(kind='line', figsize=(10,3)).set_ylim([0,1.1])
print('Variance preserved by first 50 components == {:.2%}'.format(
        svd_fit.explained_variance_ratio_.cumsum()[-1]))

In [None]:
dfsvd = pd.DataFrame(Y, columns=['c{}'.format(c) for c in range(ncomps)], index=frame.index)
print(dfsvd.shape)
dfsvd.head()

In [None]:
plotdims = 5
ploteorows = 1
svdcols = [c for c in dfsvd.columns if c[0] == 'c']
dfsvdplot = dfsvd[svdcols].iloc[:,:plotdims]
dfsvdplot['cluster'] = frame['cluster']
ax = sns.pairplot(dfsvdplot.iloc[::ploteorows,:], hue='cluster', size=2.5)


In [None]:
Z = TSNE().fit_transform(dfsvd[svdcols])
dftsne = pd.DataFrame(Z, columns=['x','y'], index=dfsvd.index)
ax = sns.lmplot('x', 'y', dftsne, fit_reg=False, size=8
                ,scatter_kws={'alpha':0.7,'s':60})

In [None]:
dftsne['cluster'] = frame['cluster']
g = sns.lmplot('x', 'y', dftsne, hue='cluster', fit_reg=False, size=8
                ,scatter_kws={'alpha':0.7,'s':60})
g.axes.flat[0].set_title('Scatterplot of a 20D dataset reduced to 2D using t-SNE')

## With DBSCAN

In [None]:
from sklearn.cluster import dbscan
db_a = dbscan(tfidf_matrix,eps=0.60, min_samples=3)
clusters = db_a[1].tolist()
ranks = [i for i in range(len(segments))]
ndf = {'indexes' : range(len(segments)), 'segments': segments,'cluster': clusters, 'rank': ranks}
nframe = pd.DataFrame(ndf, index = [clusters] , columns = ['indexes','segments','cluster','rank'])
nframe['cluster'].value_counts()

In [None]:
ncomps = 10
svd = TruncatedSVD(n_components=ncomps)
svd_fit = svd.fit(smatrix.todense())
Y = svd.fit_transform(smatrix.todense()) 
ax = pd.Series(svd_fit.explained_variance_ratio_.cumsum()).plot(kind='line', figsize=(10,3)).set_ylim([0,1.1])
print('Variance preserved by first 50 components == {:.2%}'.format(
        svd_fit.explained_variance_ratio_.cumsum()[-1]))

In [None]:
dfsvd = pd.DataFrame(Y, columns=['c{}'.format(c) for c in range(ncomps)], index=nframe.index)
print(dfsvd.shape)
dfsvd.head()

In [None]:
plotdims = 5
ploteorows = 1
dfsvdplot = dfsvd[svdcols].iloc[:,:plotdims]
dfsvdplot['cluster'] = nframe['cluster']
ax = sns.pairplot(dfsvdplot.iloc[::ploteorows,:], hue='cluster', size=2.5)

In [None]:
svdcols = [c for c in dfsvd.columns if c[0] == 'c']
Z = TSNE().fit_transform(dfsvd[svdcols])
dftsne = pd.DataFrame(Z, columns=['x','y'], index=dfsvd.index)
ax = sns.lmplot('x', 'y', dftsne, fit_reg=False, size=8
                ,scatter_kws={'alpha':0.7,'s':60})

In [None]:
dftsne['cluster'] = nframe['cluster']
g = sns.lmplot('x', 'y', dftsne, hue='cluster', fit_reg=False, size=8
                ,scatter_kws={'alpha':0.7,'s':60})
g.axes.flat[0].set_title('Scatterplot of a 20D dataset reduced to 2D using t-SNE')