#### In this example, we illustrate various unsupervised learning techniques (Clustering, PCA, SVD) using an example term-document matrix as the data. 

In [None]:
import numpy as np
import pylab as pl
import pandas as pd
from sklearn.cluster import KMeans 

In [None]:
# Load the raw term-document matrix. header=None makes pandas treat the first
# line of the file as data and assign integer column labels (0, 1, 2, ...).
Data = pd.read_csv("../data/term-doc-mat.csv", header=None)

In [None]:
Data

In [None]:
# Everything from column 1 onward is kept as the term-document matrix
# (column 0 is extracted separately as the term labels).
TD = Data.iloc[:,1:]
TD

In [None]:
# Column 0 of the file: one label per row of TD.
terms = Data.iloc[:,0]
terms

#### First, we want to do some document clustering. Since the data is in term-document format, we need to obtain the transpose of the TD matrix.

In [None]:
DT = TD.T

#### Now we have a document-term matrix:

In [None]:
DT

In [None]:
numTerms=len(terms)
numTerms

#### Next, we will transform the data to TFxIDF weights (this is not necessary for clustering, but we will do it here for illustration purposes):

In [None]:
# Find document frequencies for each term: count, per column of DT,
# the rows (documents) in which the entry is non-zero.
termPresent = (DT != 0)
# Wrapping the per-term counts in a list yields a single-row 2-D array.
DF = np.array([termPresent.sum(axis=0)])
print (DF)

In [None]:
# Number of documents = number of rows of the document-term matrix.
# .shape[0] gives the same value as len(DT[0]) while DT is a DataFrame, and it
# stays correct after DT is converted to a NumPy array further down, where
# DT[0] would be a row (length = number of terms) rather than a column.
NDocs = DT.shape[0]
print (NDocs)

In [None]:
# Float matrix with the same shape as DT in which every entry equals NDocs
NMatrix = np.full(np.shape(DT), NDocs, dtype=float)

In [None]:
# IDF(term) = log2(NDocs / DF(term)).
# DF is a single row, so it is broadcast against every row of NMatrix; IDF
# depends only on the term, hence all rows of the result are identical.
DivM = NMatrix / DF
IDF = np.log2(DivM)

In [None]:
# Print options are global to NumPy: 2 decimals, no scientific notation.
np.set_printoptions(precision=2,suppress=True)
# Show only the first two rows (all rows of IDF are the same).
print (IDF[0:2,])

In [None]:
# Finally compute the TFxIDF values for each document-term entry
# (element-wise product; IDF was built with the same shape as DT).
DT_tfidf = DT * IDF

In [None]:
DT_tfidf

#### Now we are ready for clustering. We'll use the kMeans module of the MLA book from Ch. 10.

In [None]:
cd "../data"

In [None]:
import kMeans
import importlib

In [None]:
importlib.reload(kMeans)

In [None]:
# Hand the local kMeans module a plain NumPy array instead of a DataFrame.
DT_tfidf = np.array(DT_tfidf)
# Request 3 clusters using the module's distEclud distance and randCent
# initialiser. NOTE(review): presumably Euclidean distance with random initial
# centroids (so results may vary between runs) -- confirm against kMeans.py.
centroids_tfidf, clusters_tfidf = kMeans.kMeans(DT_tfidf, 3, kMeans.distEclud, kMeans.randCent)

#### Let's take a look at the cluster centroids

In [None]:
centroids_tfidf = np.array(centroids_tfidf)
clusters_tfidf = np.array(clusters_tfidf)
print ("\t\tCluster0\tCluster1\tCluster2")
# One output row per term: the term label followed by that term's weight
# in each of the three cluster centroids.
for termIdx, term in enumerate(terms):
    rowValues = (term, centroids_tfidf[0][termIdx], centroids_tfidf[1][termIdx], centroids_tfidf[2][termIdx])
    print ("%10s\t%.4f\t\t%.4f\t\t%.4f" % rowValues)

#### Because the centroids are based on TFxIDF weights, they are not as descriptive as raw term frequencies or binary occurrence data. Let's redo the clustering with the original raw term frequencies.

In [None]:
# From this point on DT is a plain NumPy array rather than a DataFrame.
DT = np.array(DT)
# Same clustering call as for the TFxIDF data, but on the raw counts and with
# kMeans.distCosine as the distance function.
centroids, clusters = kMeans.kMeans(DT, 3, kMeans.distCosine, kMeans.randCent)

In [None]:
centroids = np.array(centroids)
clusters = np.array(clusters)
print ("\t\tCluster0\tCluster1\tCluster2")
# One output row per term: the term label followed by that term's value
# in each of the three cluster centroids.
for termIdx, term in enumerate(terms):
    rowValues = (term, centroids[0][termIdx], centroids[1][termIdx], centroids[2][termIdx])
    print ("%10s\t%.4f\t\t%.4f\t\t%.4f" % rowValues)

#### The cluster centroids reveal some general patterns in the data as well as unique characteristics of each cluster. For example, it's clear that Cluster 0 is dominated by documents related to SQL databases while Cluster 1 contains documents primarily related to linear regression, etc.

In [None]:
# Let's look at cluster assignments for each of the instances in the data.
print (clusters)

In [None]:
print (centroids)

#### Next, let's use principal component analysis to reduce the dimensionality of the data:

In [None]:
from sklearn import decomposition

#### We'll perform PCA to obtain the top 5 components and then transform the DT matrix into the lower dimensional space of 5 components:

In [None]:
# Fit a 5-component PCA on DT, then project the same data onto those components.
pca = decomposition.PCA(n_components=5)
DTtrans = pca.fit(DT).transform(DT)

In [None]:
np.set_printoptions(precision=2,suppress=True)
print (DTtrans)

In [None]:
print(pca.explained_variance_ratio_)

#### Looking at the above, it can be observed that the first 5 components capture (explain) 95% of the variance in the data.

#### Now, we can redo the clustering, but this time in the lower dimensional space:

In [None]:
# Same k-means call as for the raw counts (3 clusters, kMeans.distCosine,
# kMeans.randCent), now applied to the PCA-transformed data DTtrans.
centroids_pca, clusters_pca = kMeans.kMeans(DTtrans, 3, kMeans.distCosine, kMeans.randCent)

In [None]:
print (clusters_pca)

#### Next, let's actually derive the principal components manually using linear algebra rather than relying on the PCA package from sklearn:

#### First step is to obtain the covariance matrix:

In [None]:
# Centre every column (term) on zero; meanRemoved is reused later when the
# data is projected onto the selected eigenvectors.
meanVals = DT.mean(axis=0)
meanRemoved = DT - meanVals
# rowvar=False: columns are the variables and rows the observations.
covMat = np.cov(meanRemoved, rowvar=False)

np.set_printoptions(precision=2, suppress=True, linewidth=100)
print (covMat)

In [None]:
import numpy.linalg as la
# covMat is a real symmetric matrix, so the symmetric solver is used: eigh
# returns real eigenvalues (in ascending order) and the matching eigenvectors
# as the columns of eigVects. The previous la.eig(np.mat(covMat)) depended on
# np.mat, which was removed in NumPy 2.0, and the general-purpose solver can
# return complex values because of round-off.
eigVals, eigVects = la.eigh(covMat)

In [None]:
print (eigVals)

In [None]:
print (eigVects)

In [None]:
# argsort orders smallest to largest; reversing puts the largest eigenvalue first
eigValInd = np.argsort(eigVals)[::-1]
sortedEigVals = eigVals[eigValInd]
print (sortedEigVals)
# Share of the total variance attributed to each component, in percent
total = sum(sortedEigVals)
varPercentage = sortedEigVals/total*100
print (varPercentage)

#### We can plot the principal components based on the percentage of variance they capture:

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, 11), varPercentage[:10], marker='^')
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.show()

In [None]:
# Number of principal components to keep
topNfeat = 5
topEigValInd = eigValInd[:topNfeat]  # indices of the topNfeat largest eigenvalues, largest first
reducedEigVects = eigVects[:,topEigValInd]   # matching eigenvector columns, in the same order
reducedDT = np.dot(meanRemoved, reducedEigVects)    # project the mean-centred data onto those eigenvectors
print (reducedDT)

#### Next, let's look at an application of Singular Value Decomposition. This time, we'll focus on the term-document matrix in order to find themes based on combinations of terms.

In [None]:
# Reduced SVD of the term-document matrix: TD = u . diag(s) . vt.
# s holds the singular values as a 1-D array.
u, s, vt = np.linalg.svd(TD, full_matrices=False)

In [None]:
# Flip the sign of every entry of u with a single vectorised negation.
# vt is negated as well in the following cell, so the product
# u . diag(s) . vt is unaffected: (-u) . diag(s) . (-vt) = u . diag(s) . vt.
u = -u
print (u)

In [None]:
# Flip the sign of every entry of vt with a single vectorised negation.
# u is negated in the preceding cell, so the product u . diag(s) . vt
# is unaffected: (-u) . diag(s) . (-vt) = u . diag(s) . vt.
vt = -vt
print (vt)

In [None]:
print (s)

In [None]:
print (np.diag(s))

In [None]:
# Multiplying the full factors back together, U . Sigma . Vt, reproduces the
# original matrix (up to floating-point error).

originalTD = u @ (np.diag(s) @ vt)
print (originalTD)

In [None]:
# But, the goal of SVD is to use a smaller number of dimensions, each of which
# represents a latent variable capturing some combination of features associated
# with the data (e.g., general themes in the documents).

numDimensions = 3
# Keep the first numDimensions columns of u, singular values, and rows of vt.
u_ld = u[:, :numDimensions]
sigma = np.diag(s[:numDimensions])
vt_ld = vt[:numDimensions, :]
lowRankTD = u_ld @ (sigma @ vt_ld)

In [None]:
# The U.Sigma.Vt in the lower dimensional space gives an approximation of the original matrix

np.set_printoptions(precision=2,suppress=True,linewidth=120)
print (lowRankTD)

#### The VT matrix can be viewed as the new representation of documents in the lower dimensional space.

In [None]:
print (vt_ld)

#### In information retrieval, a query is compared to documents using vector-space similarity between the query vector and document vectors. In the lower dim. space, this can be achieved by first mapping the query to lower dim. space, and then comparing it to docs in the lower dim. space.

In [None]:
# Query expressed as one count per term, in the same order as the rows of TD.
queryVector = np.array([0,0,1,5,4,0,6,0,0,2])
# Map the query into the reduced space: sigma^-1 . u_ld^T . q
lowDimQuery = la.inv(sigma) @ (u_ld.T @ queryVector)
print (lowDimQuery)


In [None]:
# Compute Cosine sim between the query and docs in the lower dimensional space

# Scale the query to unit length so that a dot product with a unit-length
# document vector is their cosine similarity.
qNorm = lowDimQuery / la.norm(lowDimQuery)

In [None]:
# Each column of vt_ld is one document in the reduced space; scale it to unit
# length. The result has one row per document.
docNorm = np.array([docVec / la.norm(docVec) for docVec in vt_ld.T])
print (docNorm)

In [None]:
sims = np.dot(qNorm, docNorm.T)
# Indices of the docs in descending order of similarity to the query
simInds = np.argsort(sims)[::-1]
for docIdx in simInds:
    print ("Cosine similarity between Document %d and the query is: %.4f" %(docIdx,sims[docIdx]))

In [None]:
# Cluster the documents in the reduced SVD space: vt_ld.T has one row per
# document and numDimensions columns. Same k-means settings as before
# (3 clusters, kMeans.distCosine, kMeans.randCent).
centroids_svd, clusters_svd = kMeans.kMeans(vt_ld.T, 3, kMeans.distCosine, kMeans.randCent)

In [None]:
print (clusters_svd)

In [None]:
centroids_svd = np.array(centroids_svd)
clusters_svd = np.array(clusters_svd)
print ("\t\tCluster0\tCluster1\tCluster2")
# One output row per latent dimension ("theme"): its value in each of the
# three cluster centroids.
for theme in range(numDimensions):
    rowValues = (theme, centroids_svd[0][theme], centroids_svd[1][theme], centroids_svd[2][theme])
    print ("Theme %d\t\t%.4f\t\t%.4f\t\t%.4f" % rowValues)