In [63]:
# hack to import local classes
import sys
sys.path.append('..')

%load_ext autoreload
%autoreload 2

from src.models import train_model
from src.data import read_transform
from sklearn.metrics import calinski_harabaz_score

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, save_npz, load_npz

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="whitegrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
# Load the raw training data and derive the document representations that the
# later sections of this notebook start from.

# Read CSR matrix from the input file.
csrMatrix = read_transform.csr_read('../data/raw/train.dat')

# Scale the CSR matrix by idf (Inverse Document Frequency).
# copy=True presumably leaves csrMatrix itself untouched — confirm in read_transform.
csrIDF = read_transform.csr_idf(csrMatrix, copy=True)

# Normalize the rows of a CSR matrix by their L-2 norm.
csrL2Normalized = read_transform.csr_l2normalize(csrIDF, copy=True)

# Obtain a dense ndarray representation of the CSR matrix.
# NOTE(review): the shape reported further down is (8580, 126355), i.e. about
# 1.08 billion cells once densified — make sure the kernel has the memory.
denseMatrix = csrL2Normalized.toarray()

In [45]:
csrL2Normalized.shape

(8580, 126355)

In [9]:
pd.DataFrame(denseMatrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126345,126346,126347,126348,126349,126350,126351,126352,126353,126354
0,0.0,0.0,0.0,0.110212,0.0,0.003722,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.034005,0.008336,0.0,0.0,0.057396,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.007764,0.0,0.0,0.004277,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.004907,0.0,0.0,0.067576,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.02532,0.030651,0.050668,0.00621,0.0,0.0,0.034208,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Baseline: bisecting k-means on the full-vocabulary dense matrix.
# The counts printed below show two labels, so the second argument looks like
# the number of clusters; what the third argument (10) controls is not visible
# in this notebook — TODO confirm against train_model.bisecting_kmeans.
labels = train_model.bisecting_kmeans(denseMatrix, 2, 10)

In [7]:
pd.DataFrame(labels)[0].value_counts()

1    4752
2    3828
Name: 0, dtype: int64

In [8]:
# Score the 2-way split on the same matrix that was passed to the clustering call.
calinski_harabaz_score(denseMatrix, labels)

76.27600389036283

# LSA - Latent Semantic Analysis

In [10]:
from sklearn.decomposition import TruncatedSVD

In [73]:
# Configure the LSA projection: keep 5000 components using the ARPACK solver.
# NOTE(review): scikit-learn documents n_iter as a randomized-solver setting,
# so it is probably ignored with algorithm='arpack' — confirm for the
# installed version before spending time tuning it.
svd = TruncatedSVD(n_components=5000, n_iter=10, random_state=10, algorithm='arpack')

In [74]:
# Fit the SVD on the IDF-scaled, L2-normalized sparse matrix and project the
# documents onto the retained components. Despite the "csr" prefix the result
# appears to be a plain dense array at this point; it is only wrapped in
# csr_matrix in a later cell.
csrL2Normalized_svd = svd.fit_transform(csrL2Normalized)

In [75]:
# Share of the total variance captured by the retained components, in percent.
100 * svd.explained_variance_ratio_.sum()

96.12863835303189

In [76]:
pd.DataFrame(csrL2Normalized_svd).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.133729,-0.031597,0.019048,0.001199,-0.007876,0.047543,-0.09697,-0.05919,-0.034899,-0.037521,...,-0.009368,-0.001655,0.001102,-0.002816,0.003566,-0.00409,0.003332,-0.004951,-0.003348,0.008441
1,0.15667,0.062818,0.018526,0.013523,0.007401,0.020389,-0.080919,0.015974,-0.07419,0.055908,...,0.003735,-0.005269,0.006351,0.007306,-0.001349,-0.006252,-0.003279,0.005156,0.002852,-0.006525
2,0.074912,-0.012769,-0.001249,0.005575,0.001644,0.010845,-0.041064,-0.01912,-0.012199,0.009542,...,-0.00095,-0.000795,0.001003,0.000142,0.001128,0.000106,-1.3e-05,0.001234,-0.001573,0.000696
3,0.180085,-0.052568,0.008716,0.010358,-0.006335,0.063469,-0.103017,-0.052676,-0.02418,-0.041249,...,0.002952,-0.014552,-0.001137,0.004211,0.006335,0.000585,-0.006573,-0.004551,-0.005729,0.003687
4,0.154468,-0.007684,0.009544,0.006078,-0.006014,0.040344,-0.092864,-0.058367,-0.027375,-0.02532,...,-0.004343,0.005205,0.001742,0.006523,-0.010088,0.000477,0.001375,0.006387,-0.002781,-0.011714


In [77]:
# Wrap the projection in a CSR container so read_transform.csr_l2normalize can
# be applied to it in a later cell.
# NOTE(review): the repr below reports 42,900,000 stored elements, which is
# every cell of the 8580x5000 matrix — CSR adds index overhead here without
# saving any space.
csrL2Normalized_svd = csr_matrix(csrL2Normalized_svd)

In [78]:
csrL2Normalized_svd

<8580x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 42900000 stored elements in Compressed Sparse Row format>

In [79]:
# Re-normalize every projected document with the project's L2 row normalizer.
# The preview below differs slightly from the pre-normalization preview above,
# so the projected rows were evidently not unit length any more.
# This rebinds the same name, so the cell is only meaningful when run once
# after the csr_matrix conversion.
csrL2Normalized_svd = read_transform.csr_l2normalize(csrL2Normalized_svd, copy=True)

In [80]:
pd.DataFrame(csrL2Normalized_svd.toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.135742,-0.032073,0.019335,0.001217,-0.007995,0.048259,-0.09843,-0.060081,-0.035424,-0.038086,...,-0.009509,-0.00168,0.001118,-0.002859,0.003619,-0.004152,0.003383,-0.005025,-0.003399,0.008568
1,0.158398,0.063511,0.01873,0.013672,0.007483,0.020614,-0.081812,0.01615,-0.075008,0.056525,...,0.003776,-0.005327,0.006421,0.007387,-0.001364,-0.006321,-0.003316,0.005213,0.002884,-0.006597
2,0.074945,-0.012774,-0.001249,0.005577,0.001645,0.01085,-0.041082,-0.019128,-0.012204,0.009546,...,-0.00095,-0.000795,0.001003,0.000142,0.001129,0.000107,-1.3e-05,0.001235,-0.001574,0.000696
3,0.184147,-0.053754,0.008912,0.010591,-0.006478,0.0649,-0.10534,-0.053864,-0.024726,-0.042179,...,0.003018,-0.01488,-0.001162,0.004306,0.006478,0.000598,-0.006721,-0.004653,-0.005858,0.003771
4,0.156999,-0.00781,0.0097,0.006177,-0.006112,0.041005,-0.094385,-0.059323,-0.027823,-0.025735,...,-0.004414,0.005291,0.001771,0.006629,-0.010254,0.000485,0.001398,0.006492,-0.002826,-0.011906


In [89]:
# Cache the normalized 5000-component projection on disk so a later session
# can load_npz it instead of recomputing the SVD.
save_npz('../data/interim/csr_svd_normalized_5000-arpack.npz', csrL2Normalized_svd)

# Bisect on SVD

In [81]:
denseMatrix = csrL2Normalized_svd.toarray()

In [82]:
labels = train_model.bisecting_kmeans(denseMatrix, 2, 10)

In [83]:
pd.DataFrame(labels)[0].value_counts()

1    4787
2    3793
Name: 0, dtype: int64

In [84]:
# Score the 2-way split in the SVD feature space it was clustered in.
# NOTE(review): the earlier baseline score was computed on the full-vocabulary
# matrix, so the two numbers come from different feature spaces and are not
# directly comparable.
calinski_harabaz_score(denseMatrix, labels)

80.2996914517128

# Submission

In [104]:
denseMatrix = csrL2Normalized_svd.toarray()

In [105]:
# Cluster the SVD features for the submission. The counts printed below show
# seven labels, so the second argument looks like the number of clusters; the
# meaning of the third argument (10) is not visible here — TODO confirm.
labels = train_model.bisecting_kmeans(denseMatrix, 7, 10)

In [106]:
pd.DataFrame(labels)[0].value_counts()

5    2058
1    1657
6    1516
4    1072
7     928
3     918
2     431
Name: 0, dtype: int64

In [107]:
# Score the 7-way labels against the original full-vocabulary vectors rather
# than the SVD features they were clustered on — presumably so the number is
# comparable across representations (confirm intent).
# NOTE(review): toarray() re-densifies the full matrix on every call.
calinski_harabaz_score(csrL2Normalized.toarray(), labels)

44.18652047965448

In [88]:
read_transform.write_predictions(labels, '../models/predictions/1.5-am-lsa-arpack.dat')

In [69]:
denseMatrix = csrL2Normalized.toarray()

In [70]:
labels = train_model.bisecting_kmeans(denseMatrix, 7, 10)

In [71]:
pd.DataFrame(labels)[0].value_counts()

5    2035
3    1963
1    1427
6    1075
7    1039
2     675
4     366
Name: 0, dtype: int64

In [72]:
# Score the baseline 7-way clustering in the original feature space.
# denseMatrix was assigned csrL2Normalized.toarray() a few cells above and is
# the matrix the labels were computed from, so reuse it instead of
# materializing a second full dense copy of the 8580 x 126355 matrix.
calinski_harabaz_score(denseMatrix, labels)

46.035432919341474

# Non-Negative Matrix Factorization (NMF)

In [3]:
from sklearn.decomposition import NMF

In [28]:
# Configure a 550-component NMF with a fixed seed. alpha and l1_ratio are the
# estimator's regularization settings — check the installed scikit-learn
# version's docs for their exact meaning, since these parameters have changed
# across releases.
# NOTE(review): verbose=True is presumably what prints the long
# "violation: ..." log under the fit cell below.
nmf = NMF(n_components=550, random_state=10, alpha=.1, l1_ratio=.5, verbose=True)

In [29]:
csrL2Normalized_nmf = nmf.fit_transform(csrL2Normalized)

violation: 1.0
violation: 0.007381502004166063
violation: 0.006559960891150529
violation: 0.003928112329562171
violation: 0.0025834742644676506
violation: 0.0018327749861842082
violation: 0.0013509157618990767
violation: 0.0009910607203590502
violation: 0.0007753259203580676
violation: 0.0006189964680479151
violation: 0.0004975888464482741
violation: 0.000409449091168145
violation: 0.00034712471504103856
violation: 0.00030310845303377766
violation: 0.00026775325872107735
violation: 0.0002418054809956908
violation: 0.0002250083152105965
violation: 0.000214720340370855
violation: 0.00020683478220220988
violation: 0.00019641318267255475
violation: 0.0001820108464820063
violation: 0.0001697237926184946
violation: 0.0001603272899390658
violation: 0.00015421006783794964
violation: 0.00014886539511119634
violation: 0.00014370965060167047
violation: 0.0001383410885760771
violation: 0.00013342096964458818
violation: 0.00012913063888743645
violation: 0.0001255220079701261
violation: 0.0001223994

In [30]:
pd.DataFrame(csrL2Normalized_nmf).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,540,541,542,543,544,545,546,547,548,549
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021037,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004375,0.0,0.0,...,0.006787,0.0,0.000647,0.001012,0.0,0.0,0.0,0.026102,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016328,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011974,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033765,0.0,0.0


In [31]:
# Store the NMF activations in CSR form. The repr in the next cell reports
# 59,146 stored values for an 8580x550 matrix, so this representation is
# genuinely sparse and CSR is a good fit.
csrL2Normalized_nmf = csr_matrix(csrL2Normalized_nmf)

In [41]:
csrL2Normalized_nmf

<8580x550 sparse matrix of type '<class 'numpy.float64'>'
	with 59146 stored elements in Compressed Sparse Row format>

In [33]:
csrL2Normalized_nmf = read_transform.csr_l2normalize(csrL2Normalized_nmf, copy=True)

In [34]:
pd.DataFrame(csrL2Normalized_nmf.toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,540,541,542,543,544,545,546,547,548,549
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275199,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097274,0.0,0.0,...,0.150926,0.0,0.014391,0.022506,0.0,0.0,0.0,0.580426,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.827395,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105748,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.403336,0.0,0.0


In [35]:
save_npz('../data/interim/csr_nmf_normalized_550.npz', csrL2Normalized_nmf)

# Bisect on NMF 500

In [65]:
# Load a previously cached 500-component NMF representation.
# NOTE(review): no cell visible in this notebook writes
# csr_nmf_normalized_500.npz (the only NMF file saved above is the
# 550-component one), so this section cannot run from a fresh checkout until
# that file has been produced some other way.
csrL2Normalized_nmf = load_npz('../data/interim/csr_nmf_normalized_500.npz')

In [66]:
csrL2Normalized_nmf

<8580x500 sparse matrix of type '<class 'numpy.float64'>'
	with 61668 stored elements in Compressed Sparse Row format>

In [67]:
denseMatrix = csrL2Normalized_nmf.toarray()

In [68]:
labels = train_model.bisecting_kmeans(denseMatrix, 7, 10)

In [69]:
pd.DataFrame(labels)[0].value_counts()

2    1799
7    1668
5    1619
4    1259
3    1151
1     755
6     329
Name: 0, dtype: int64

In [70]:
# Score the NMF-500 labels against the original full-vocabulary vectors, not
# the NMF features used for clustering. toarray() rebuilds the full dense
# matrix on each call.
calinski_harabaz_score(csrL2Normalized.toarray(), labels)

48.47681059943502

In [71]:
read_transform.write_predictions(labels, '../models/predictions/1.5-am-lsa-nmf-500-euclid.dat')

# Bisect on NMF 550

In [49]:
csrL2Normalized_nmf = load_npz('../data/interim/csr_nmf_normalized_550.npz')

In [50]:
csrL2Normalized_nmf

<8580x550 sparse matrix of type '<class 'numpy.float64'>'
	with 59146 stored elements in Compressed Sparse Row format>

In [51]:
denseMatrix = csrL2Normalized_nmf.toarray()

In [52]:
labels = train_model.bisecting_kmeans(denseMatrix, 7, 10)

In [53]:
pd.DataFrame(labels)[0].value_counts()

7    1900
1    1882
3    1854
4    1186
2     852
6     455
5     451
Name: 0, dtype: int64

In [54]:
# Score the NMF-550 labels against the original full-vocabulary vectors, not
# the NMF features used for clustering. toarray() rebuilds the full dense
# matrix on each call.
calinski_harabaz_score(csrL2Normalized.toarray(), labels)

50.473313630436046

In [40]:
read_transform.write_predictions(labels, '../models/predictions/1.5-am-lsa-nmf-550.dat')

# Bisect on NMF 1000

In [42]:
# Load a previously cached 1000-component NMF representation.
# NOTE(review): no cell visible in this notebook writes
# csr_nmf_normalized_1000.npz, so this section depends on a file produced
# outside the visible cells.
csrL2Normalized_nmf = load_npz('../data/interim/csr_nmf_normalized_1000.npz')

In [43]:
csrL2Normalized_nmf

<8580x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 49377 stored elements in Compressed Sparse Row format>

In [44]:
denseMatrix = csrL2Normalized_nmf.toarray()

In [45]:
labels = train_model.bisecting_kmeans(denseMatrix, 7, 10)

In [46]:
pd.DataFrame(labels)[0].value_counts()

7    2186
1    2064
5    1470
4    1352
2     564
6     494
3     450
Name: 0, dtype: int64

In [47]:
# Score the NMF-1000 labels against the original full-vocabulary vectors, not
# the NMF features used for clustering. toarray() rebuilds the full dense
# matrix on each call.
calinski_harabaz_score(csrL2Normalized.toarray(), labels)

42.05549637019858

In [48]:
read_transform.write_predictions(labels, '../models/predictions/1.5-am-lsa-nmf-1000.dat')