In [2]:
###SVD
# Singular-value decomposition
from numpy import array
from scipy.linalg import svd
# define a matrix
A = array([[1, 2], [3, 4], [5, 6]])
print(A)
# SVD
U, s, VT = svd(A)
print("Matrix U - Left-singular Vectors of A:")
print(U)
print("Sigma: Diagonal Matrix:")
print(s)
print("V^T: Right-singular vectors of A:")
print(VT) 


[[1 2]
 [3 4]
 [5 6]]
Matrix U - Left-singular Vectors of A:
[[-0.2298477   0.88346102  0.40824829]
 [-0.52474482  0.24078249 -0.81649658]
 [-0.81964194 -0.40189603  0.40824829]]
Sigma: Diagonal Matrix:
[9.52551809 0.51430058]
V^T: Right-singular vectors of A:
[[-0.61962948 -0.78489445]
 [-0.78489445  0.61962948]]


In [3]:
### Reconstructing 

from numpy import array
from numpy import diag
from numpy import dot
from numpy import zeros
from scipy.linalg import svd
# define a matrix
A = array([[1, 2], [3, 4], [5, 6]])
print(A)
# Singular-value decomposition
U, s, VT = svd(A)
# create m x n Sigma matrix
Sigma = zeros((A.shape[0], A.shape[1]))
# populate Sigma with n x n diagonal matrix
Sigma[:A.shape[1], :A.shape[1]] = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

[[1 2]
 [3 4]
 [5 6]]
[[1. 2.]
 [3. 4.]
 [5. 6.]]


### Document insights - Collocations

In [56]:
from nltk.corpus import gutenberg
import nltk
from operator import itemgetter
import re
import string
from unicodedata import normalize
from numpy import array
from nltk.corpus import stopwords

In [57]:
stops = set(stopwords.words('english'))

In [40]:
alice = gutenberg.sents(fileids="carroll-alice.txt")
alice = [' '.join(ts) for ts in alice]
alice[0]

"[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ]"

In [41]:
def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])

In [58]:
# clean a list of lines
def clean(corpus):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for line in corpus:
        clean_pair = list()
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lowercase
        line = [word.lower() for word in line if word not in stops]
        # remove punctuation from each token
        line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
        #cleaned.append(line)
    return cleaned

In [59]:
cleaned = clean(alice)

In [60]:
cleaned

['alice adventures wonderland lewis carroll',
 'chapter i',
 'down rabbit hole',
 'alice beginning get tired sitting sister bank nothing twice peeped book sister reading pictures conversations use book thought alice without pictures conversation',
 'so considering mind well could hot day made feel sleepy stupid whether pleasure making daisy chain would worth trouble getting picking daisies suddenly white rabbit pink eyes ran close',
 'there nothing very remarkable alice think very much way hear rabbit say oh dear',
 'oh dear',
 'i shall late',
 'thought afterwards occurred ought wondered time seemed quite natural rabbit actually took a watch out of its waistcoat pocket looked hurried alice started feet flashed across mind never seen rabbit either waistcoat pocket watch take burning curiosity ran across field fortunately time see pop large rabbit hole hedge',
 'in another moment went alice never considering world get',
 'the rabbit hole went straight like tunnel way dipped suddenly sudd

In [61]:
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

In [62]:
finder = BigramCollocationFinder.from_documents([item.split() for item in cleaned])

In [63]:
bigram_measures = BigramAssocMeasures()

In [64]:
#raw freqs
finder.nbest(bigram_measures.raw_freq, 10)

[('said', 'alice'),
 ('mock', 'turtle'),
 ('march', 'hare'),
 ('said', 'king'),
 ('i', 'think'),
 ('thought', 'alice'),
 ('i', 'know'),
 ('said', 'hatter'),
 ('white', 'rabbit'),
 ('said', 'mock')]

In [65]:
#pointwise mutual information
finder.nbest(bigram_measures.pmi, 10)

[('abide', 'figures'),
 ('acceptance', 'elegant'),
 ('accounting', 'tastes'),
 ('accustomed', 'usurpation'),
 ('act', 'crawling'),
 ('adjourn', 'immediate'),
 ('adoption', 'energetic'),
 ('agony', 'terror'),
 ('alarmed', 'proposal'),
 ('ambition', 'distraction')]

In [66]:
#trigrams 
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents([item.split() for item in cleaned])

In [67]:
trigram_measures = TrigramAssocMeasures()

In [68]:
finder.nbest(trigram_measures.raw_freq, 10)

[('said', 'mock', 'turtle'),
 ('said', 'march', 'hare'),
 ('said', 'alice', 'i'),
 ('i', 'beg', 'pardon'),
 ('i', 'wish', 'i'),
 ('poor', 'little', 'thing'),
 ('i', 'think', 'i'),
 ('little', 'golden', 'key'),
 ('march', 'hare', 'said'),
 ('mock', 'turtle', 'said')]

In [69]:
finder.nbest(trigram_measures.pmi, 10)

[('accustomed', 'usurpation', 'conquest'),
 ('adjourn', 'immediate', 'adoption'),
 ('adoption', 'energetic', 'remedies'),
 ('ancient', 'modern', 'seaography'),
 ('apple', 'roast', 'turkey'),
 ('arithmetic', 'ambition', 'distraction'),
 ('brother', 'latin', 'grammar'),
 ('canvas', 'bag', 'tied'),
 ('cherry', 'tart', 'custard'),
 ('circle', 'exact', 'shape')]