# Singular Value Decomposition - I

In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn.cluster import KMeans



#import matplotlib as mpl
import seaborn as sns
%matplotlib inline

## Generating low rank data

In [None]:
# Synthetic 100 x 50 matrix produced by scikit-learn's low-rank generator.
# No seed is fixed (random_state=None), so each run yields a different matrix.
data = sk_data.make_low_rank_matrix(
    n_samples=100,
    n_features=50,
    effective_rank=10,
    tail_strength=0.1,
    random_state=None,
)

# Show the raw entries as a heatmap with tick labels and cell borders turned off.
sns.heatmap(
    data,
    xticklabels=False,
    yticklabels=False,
    linewidths=0,
)

## NumPy SVD: https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html

In [None]:
# Full SVD of the dense matrix. `s` holds the singular values as a 1-D array.
# NOTE: the third value returned by np.linalg.svd is the *transposed* matrix of
# right singular vectors (called `Vh` in the NumPy docs), so `V` here is V^T.
U, s, V = np.linalg.svd(data)
print(U.shape, s.shape, V.shape)

# Plot the singular-value spectrum. These are singular values of `data`,
# not eigenvalues, so the axes are labelled accordingly.
plt.plot(s)
plt.ylabel('singular value')
plt.xlabel('singular value index')

In [None]:
# Show the singular values; print() as a function call works on Python 3.
print(s)

In [None]:
# Reconstruction error as a function of how many trailing singular values are
# discarded: errors[i] is the norm (np.linalg.norm default) of
# data - U * S_i * V, where S_i has the last i entries of `s` set to zero.
num_singular = len(s)
errors = np.zeros(num_singular)
for i in range(num_singular):
    # Work on a copy. Zeroing `s` in place would destroy the spectrum, so
    # re-running this cell would no longer start from the full decomposition.
    s_kept = s.copy()
    s_kept[num_singular - i:] = 0.0

    # Embed the kept values in a rectangular "diagonal" matrix whose shape
    # conforms with U (columns) and V (rows), instead of hard-coding 50.
    S = np.zeros((U.shape[1], V.shape[0]))
    S[:num_singular, :num_singular] = np.diag(s_kept)

    approx_d = np.dot(U, np.dot(S, V))
    errors[i] = np.linalg.norm(data - approx_d)
print(errors)

In [None]:
# Draw the truncation-error curve on the current axes and label both axes.
error_axes = plt.gca()
error_axes.plot(errors)
error_axes.set_ylabel('Error')
error_axes.set_xlabel('# of ignored singular values')

## Using real data (the 20 Newsgroups dataset)

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Only these three newsgroups are requested from the training split.
categories = [
    'talk.religion.misc',
    'sci.space',
    'rec.sport.baseball',
]
news_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of the posts with English stop words removed.
# NOTE(review): a float min_df is presumably a document-frequency fraction
# (terms in fewer than 20% of posts are dropped) - confirm against the
# scikit-learn documentation.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=0.2,
)
vectors = vectorizer.fit_transform(news_data.data)

In [None]:
# Inspect the container type and dimensions of the TF-IDF matrix;
# print() as a function call works on Python 3.
print(type(vectors), vectors.shape)

In [None]:
# Load the submodule explicitly: `import scipy as sp` on its own does not
# guarantee that `sp.sparse.linalg` is available as an attribute.
import scipy.sparse.linalg

# Truncated SVD of the TF-IDF matrix, requesting k=10 singular triplets.
U, s, V = sp.sparse.linalg.svds(vectors, k=10)


### Sparse SVD: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.svds.html

In [None]:
import scipy.sparse.linalg as linalg

# Truncated SVD with k=10 singular triplets; which='LM' asks svds for the
# largest-magnitude singular values.
U, s, V = linalg.svds(vectors, k=10, which='LM')

# print() as a function call works on Python 3.
print(U.shape, V.shape, s.shape)
print(s)

In [None]:
# Plot the values in `s` in reversed order against a 1-based index.
# NOTE(review): the reversal presumably turns an ascending svds result into a
# descending spectrum - confirm the ordering svds returns.
# The quantities are singular values, not eigenvalues, so label them as such.
plt.plot(range(1, len(s) + 1), s[::-1])
plt.ylabel('singular value')
plt.xlabel('singular value index')

In [None]:
# Code for setting the style of the notebook
from IPython.core.display import HTML


def css_styling():
    """Return the notebook's custom stylesheet as an IPython HTML object.

    Reads ``../theme/custom.css`` (resolved against the current working
    directory) and passes its text to ``HTML``.

    Raises:
        IOError/OSError: if the stylesheet cannot be opened.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()