In [94]:
import numpy as np
import scipy as sp
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from scipy.sparse import csr_matrix

In [40]:
def csr_read(filename):
    
    with open(filename) as f:
        lines = f.readlines()
    
    nrows = len(lines)
    nnz = 0 
    for i in range(nrows):
        p = lines[i].split()
        if len(p) % 2 != 0:
            raise ValueError("Invalid data")
        nnz += len(p)//2
        
    val = np.zeros(nnz, dtype=np.float)
    ind = np.zeros(nnz, dtype=np.int)
    ptr = np.zeros(nrows+1, dtype=np.long)
    n = 0 
    for i in range(nrows):
        p = lines[i].split()
        for j in range(0, len(p), 2): 
            ind[n] = int(p[j]) - 1
            val[n] = float(p[j+1])
            n += 1
        ptr[i+1] = n 
    
    return csr_matrix((val, ind, ptr), dtype=np.float)

In [41]:
csr_mat = csr_read("../data/raw/train.dat")

In [42]:
csr_mat

<8580x126355 sparse matrix of type '<class 'numpy.float64'>'
	with 1107980 stored elements in Compressed Sparse Row format>

In [66]:
docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"], ["aditya", "cruel", "world", "hello"]]
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in docs:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

csr_mat = csr_matrix((data, indices, indptr), dtype=int)

In [67]:
csr_mat

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [68]:
matrix = csr_mat.todense()

In [69]:
clusters = list()
    
initialcluster = list()
for i in range(matrix.shape[0]):
    initialcluster.append(i)
    
clusters.append(initialcluster)

In [70]:
clusters

[[0, 1, 2]]

In [71]:
SSE_list = list()
SSE_array = []

for cluster in clusters:
    members = matrix[cluster,:]
    SSE = np.sum(np.square(members - np.mean(members)))
    SSE_list.append(SSE)

SSE_array = np.asarray(SSE_list)
dropClusterIndex = np.argsort(SSE_array)[-1]

In [72]:
SSE_list

[5.333333333333332]

In [73]:
members

matrix([[2, 1, 0, 0, 0],
        [0, 1, 1, 1, 0],
        [1, 1, 0, 1, 1]])

In [63]:
SSE

3.5

In [64]:
dropClusterIndex

0

In [74]:
droppedCluster = clusters[dropClusterIndex]

In [75]:
droppedCluster

[0, 1, 2]

In [76]:
matrix[droppedCluster,:]

matrix([[2, 1, 0, 0, 0],
        [0, 1, 1, 1, 0],
        [1, 1, 0, 1, 1]])

In [77]:
matrix

matrix([[2, 1, 0, 0, 0],
        [0, 1, 1, 1, 0],
        [1, 1, 0, 1, 1]])

In [84]:
np.argsort(np.array([2,4,1,3]))

array([2, 0, 3, 1])

In [86]:
from sklearn.utils import shuffle

In [89]:
matrixShuffled = shuffle(matrix, random_state=0)
matrixShuffled[:2,:]

matrix([[1, 1, 0, 1, 1],
        [0, 1, 1, 1, 0]])

In [95]:
from src.data import csr_read

ModuleNotFoundError: No module named 'src'

In [96]:
!which python

/Users/amangal/anaconda3/bin/python


In [97]:
!conda list |grep src

src                       0.1.0                     <pip>


In [98]:
import src

ModuleNotFoundError: No module named 'src'

In [99]:
import os

In [102]:
os.getcwd()

'/Users/amangal/Desktop/machine-learning/cmpe255/assignment2/notebooks'

In [103]:
import sys
sys.path.append('..')

In [104]:
import src

In [118]:
from src.data import read_transform

In [110]:
!conda list |grep src

In [119]:
temp = read_transform.csr_read('../data/raw/train.dat')

In [120]:
temp

<8580x126355 sparse matrix of type '<class 'numpy.float64'>'
	with 1107980 stored elements in Compressed Sparse Row format>

In [122]:
csrIDF = read_transform.csr_idf(temp, copy=True)

In [123]:
csrL2Normalized = read_transform.csr_l2normalize(csrIDF, copy=True)

In [124]:
from src.models import train_model

In [130]:
denseMatrix = csrL2Normalized.toarray()

In [131]:
labels = train_model.bisecting_kmeans(denseMatrix, 3, 3)

In [132]:
from sklearn.metrics import calinski_harabaz_score

In [133]:
calinski_harabaz_score(denseMatrix, labels)

48.28809580175584