In [1]:
# For wide monitors: stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import numpy as np

In [3]:
# 3x2 example matrix used for the SVD / eigen-decomposition walkthrough.
A = np.array([
    [3, 7],
    [5, 2],
    [1, 2],
])

In [4]:
# Display the example matrix.
A

array([[3, 7],
       [5, 2],
       [1, 2]])

In [5]:
# Shape of A as (rows, columns).
A.shape

(3, 2)

In [6]:
# Transpose of A (rows and columns swapped).
A.T

array([[3, 5, 1],
       [7, 2, 2]])

In [7]:
# Taking A.T does not modify A itself: its shape is unchanged.
A.shape

(3, 2)

In [8]:
# Gram matrix A · Aᵀ (3x3, symmetric).
AAT = A @ A.T

In [9]:
# Display A · Aᵀ.
AAT

array([[58, 29, 17],
       [29, 29,  9],
       [17,  9,  5]])

In [10]:
# Reduced SVD of A: U holds the left singular vectors, D the singular values (1-D).
# NOTE: the third value returned by np.linalg.svd is the already-transposed factor Vh,
# so `V` here holds Vh (rows are the right singular vectors), not V itself.
U,D,V=np.linalg.svd(A, full_matrices=False)

In [11]:
# Left singular vectors of A (one column per singular value).
U

array([[-0.82696754,  0.49546393],
       [-0.50578662, -0.86201864],
       [-0.2455699 ,  0.10695492]])

In [12]:
# `V` holds Vh from np.linalg.svd, so V.T has the right singular vectors as columns.
V.T

array([[-0.58471028, -0.81124219],
       [-0.81124219,  0.58471028]])

In [13]:
# np.linalg.svd returns the singular values as a 1-D vector; expand them into a
# diagonal matrix. The ndim guard keeps this cell idempotent: np.diag applied to a
# 2-D array extracts its diagonal, so an unguarded re-run would turn D back into a vector.
if D.ndim == 1:
    D = np.diag(D)

In [14]:
# Diagonal matrix of singular values.
D

array([[8.98805064, 0.        ],
       [0.        , 3.34887231]])

In [15]:
# Reconstruct A = U · diag(s) · Vh. The third value returned by np.linalg.svd (bound
# to `V`) is already the transposed factor Vh, so it is used as-is. Transposing it
# again only appeared to work because Vh happens to be symmetric for this A.
np.dot(np.dot(U, D), V)

array([[3., 7.],
       [5., 2.],
       [1., 2.]])

In [16]:
# Eigen-decomposition of A · Aᵀ. The non-zero eigenvalues are the squared singular
# values of A, and the matching eigenvectors agree with the columns of U up to sign.
np.linalg.eig(AAT)

(array([8.07850543e+01, 1.12149457e+01, 8.87848878e-16]),
 array([[-0.82696754, -0.49546393, -0.2657822 ],
        [-0.50578662,  0.86201864, -0.03322277],
        [-0.2455699 , -0.10695492,  0.96346047]]))

In [17]:
# Gram matrix Aᵀ · A (2x2, symmetric).
ATA = A.T @ A

In [18]:
# Display Aᵀ · A.
ATA

array([[35, 33],
       [33, 57]])

In [19]:
# Eigen-decomposition of Aᵀ · A: eigenvalues in `values`, eigenvectors as the columns
# of V1. np.linalg.eig does not order the eigenvalues, so the column order need not
# match the (descending) order used by the SVD.
values, V1 = np.linalg.eig(ATA)

In [20]:
# Eigenvectors of Aᵀ · A (as columns).
V1

array([[-0.81124219, -0.58471028],
       [ 0.58471028, -0.81124219]])

In [21]:
# Vh from the SVD of A, shown for comparison with the eigenvectors in V1.
V

array([[-0.58471028, -0.81124219],
       [-0.81124219,  0.58471028]])

In [22]:
# Eigenvectors of Aᵀ · A as rows, for a side-by-side comparison with V.
V1.T

array([[-0.81124219,  0.58471028],
       [-0.58471028, -0.81124219]])

In [23]:
# Eigenvalues of Aᵀ · A.
values

array([11.21494574, 80.78505426])

In [24]:
# Singular values of A on the diagonal; their squares match the eigenvalues in `values`.
D

array([[8.98805064, 0.        ],
       [0.        , 3.34887231]])

In [25]:
# One long whitespace-separated token stream: fruit words first, vegetable words after.
# NOTE(review): 'caret' is presumably a typo for 'carrot' (CORPUS spells it 'carrot');
# kept unchanged so the recorded outputs remain valid.
text = (
    'apple banana apple banana orange '
    'apple orange banana orange '
    'orange apple apple banana apple '
    'caret spinach eggplant caret '
    'spinach caret potato spinach '
    'caret potato eggplant eggplant'
)

In [26]:
# Display the concatenated text.
text

'apple banana apple banana orange apple orange banana orange orange apple apple banana apple caret spinach eggplant caret spinach caret potato spinach caret potato eggplant eggplant'

In [28]:
# Tokenize the text on whitespace.
words = text.split()

In [29]:
# Unique tokens in alphabetical order (the vocabulary of `text`).
sorted(set(words))

['apple', 'banana', 'caret', 'eggplant', 'orange', 'potato', 'spinach']

In [30]:
# The class is named `CountVectorizer`; the misspelled `CountVectorizerctorizer`
# raises ImportError and leaves `tf_extractor` without its vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
def tf_extractor(corpus, min_df=1, ngram_range=(1, 1)):
    """Build a frequency-based document-term matrix (DTM) from raw documents.

    Parameters
    ----------
    corpus : iterable of str
        Raw text documents, one string per document.
    min_df : int or float, default 1
        Minimum document-frequency threshold passed to ``CountVectorizer``.
        With ``min_df=1`` every term used in at least one document is kept.
    ngram_range : tuple of (int, int), default (1, 1)
        Range of n-gram sizes to extract. ``(1, 1)`` keeps unigrams only;
        ``(1, 3)`` also considers bigrams and trigrams.

    Returns
    -------
    vectorizer : CountVectorizer
        The fitted vectorizer, which holds the learned vocabulary.
    features : sparse matrix
        Term counts with one row per document and one column per term.
    """
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    # Learn the vocabulary and transform the texts into a frequency matrix in one pass.
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [32]:
# Toy corpus of six short documents: the first three use only fruit words,
# the last three only vegetable words.
CORPUS = [
    # fruit documents
    'apple banana apple banana orange',
    'apple orange banana orange',
    'orange apple apple banana apple',
    # vegetable documents
    'carrot spinach eggplant carrot',
    'spinach carrot potato spinach',
    'carrot potato eggplant eggplant',
]

In [33]:
# Fit the vectorizer on the toy corpus; `features` is the resulting term-count matrix.
vec, features = tf_extractor(CORPUS)

In [35]:
# List the learned vocabulary terms. `get_feature_names()` was removed in
# scikit-learn 1.2, so use its replacement `get_feature_names_out()` when it exists
# and fall back to the old method on older releases. `.tolist()` keeps the result a
# plain list of str, as before.
(vec.get_feature_names_out().tolist()
 if hasattr(vec, 'get_feature_names_out')
 else vec.get_feature_names())

['apple', 'banana', 'carrot', 'eggplant', 'orange', 'potato', 'spinach']

In [36]:
# Densify the sparse count matrix into a plain ndarray (rows: documents, columns: terms).
dtm = np.array(features.todense())

In [37]:
# Display the dense document-term matrix.
dtm

array([[2, 2, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 2, 0, 0],
       [3, 1, 0, 0, 1, 0, 0],
       [0, 0, 2, 1, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 2],
       [0, 0, 1, 2, 0, 1, 0]], dtype=int64)

In [38]:
from sklearn.decomposition import TruncatedSVD

In [39]:
# Rank-2 truncated SVD (LSA) model. The randomized solver is stochastic, so a fixed
# random_state keeps the decomposition reproducible under Restart & Run All.
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=10, random_state=0)

In [40]:
# Fit the model on the DTM and project every document onto the 2 latent components.
svd_matrix = svd_model.fit_transform(dtm)

In [41]:
# One row per document, one column per latent component.
svd_matrix.shape

(6, 2)

In [42]:
# Document coordinates in the latent space (default float formatting).
svd_matrix

array([[ 2.91700508e+00, -0.00000000e+00],
       [ 2.12791364e+00,  1.06597249e-15],
       [ 3.19187046e+00, -4.44280679e-16],
       [-1.90104100e-16,  2.29412500e+00],
       [ 1.29196961e-17,  1.93410872e+00],
       [-2.27017518e-16,  1.93410872e+00]])

In [43]:
# Print floats in fixed-point notation instead of scientific notation.
# This is a global NumPy setting and affects every array printed afterwards.
np.set_printoptions(formatter=dict(float_kind='{:f}'.format))

In [44]:
# Same matrix again, now rendered with the fixed-point float formatter.
svd_matrix

array([[2.917005, -0.000000],
       [2.127914, 0.000000],
       [3.191870, -0.000000],
       [-0.000000, 2.294125],
       [0.000000, 1.934109],
       [-0.000000, 1.934109]])

In [45]:
from sklearn.utils.extmath import randomized_svd

# Rank-2 randomized SVD of the DTM, returning the three factors separately.
# A fixed random_state makes the stochastic solver reproducible across runs.
# NOTE: this rebinds `U`, which previously held the left singular vectors of A.
U, Sigma, VT = randomized_svd(dtm, n_components=2, n_iter=5, random_state=0)

In [46]:
# Left singular vectors from randomized_svd (one row per document).
U

array([[0.605285, -0.000000],
       [0.441546, 0.000000],
       [0.662320, -0.000000],
       [0.000000, 0.642621],
       [0.000000, 0.541774],
       [0.000000, 0.541774]])

In [47]:
# The two retained singular values.
Sigma

array([4.819229, 3.569953])

In [48]:
# Right singular vectors (one row per component, one column per term).
# print() emits the plain-text form rather than the array repr.
print(VT)

[[0.755115 0.480250 0.000000 0.000000 0.446274 0.000000 0.000000]
 [-0.000000 -0.000000 0.663535 0.483527 0.000000 0.303519 0.483527]]


In [49]:
# Full (non-truncated) SVD of the DTM with NumPy, for comparison with randomized_svd.
# NOTE: rebinds `U` and `D` from earlier cells; VT1 is the already-transposed factor Vh.
U, D, VT1 = np.linalg.svd(dtm)

In [50]:
# With the default full_matrices=True, U is square (documents x documents).
U.shape

(6, 6)

In [51]:
# Full set of left singular vectors; the first two columns match randomized_svd up to sign.
U

array([[-0.605285, -0.000000, -0.000000, 0.000000, -0.000000, -0.796009],
       [-0.441546, 0.000000, 0.000000, -0.832050, 0.000000, 0.335751],
       [-0.662320, 0.000000, 0.000000, 0.554700, 0.000000, 0.503627],
       [0.000000, 0.642621, 0.000000, 0.000000, 0.766185, 0.000000],
       [0.000000, 0.541774, -0.707107, 0.000000, -0.454401, 0.000000],
       [0.000000, 0.541774, 0.707107, 0.000000, -0.454401, 0.000000]])

In [52]:
# Transposing Vh gives V: right singular vectors as columns, one row per term.
VT1.T

array([[-0.755115, -0.000000, 0.000000, 0.588348, -0.000000, 0.289218,
        0.000000],
       [-0.480250, -0.000000, 0.000000, -0.196116, -0.000000, -0.854926,
        0.000000],
       [0.000000, 0.663535, 0.000000, 0.000000, 0.556526, 0.000000,
        -0.500000],
       [0.000000, 0.483527, 0.707107, 0.000000, -0.127285, 0.000000,
        0.500000],
       [-0.446274, 0.000000, 0.000000, -0.784465, 0.000000, 0.430645,
        0.000000],
       [0.000000, 0.303519, -0.000000, 0.000000, -0.811096, 0.000000,
        -0.500000],
       [0.000000, 0.483527, -0.707107, 0.000000, -0.127285, 0.000000,
        0.500000]])

In [53]:
from gensim import corpora
from gensim.models import LsiModel

In [54]:
def build_doc_word_matrix(docs):
    """Convert tokenized documents into a gensim bag-of-words corpus.

    Parameters
    ----------
    docs : list of list of str
        Tokenized documents, one list of tokens per document.

    Returns
    -------
    corpus : list
        One ``dictionary.doc2bow(...)`` result per document, in input order.
    dictionary : corpora.Dictionary
        The dictionary built from ``docs``.
    """
    dictionary = corpora.Dictionary(docs)
    # Encode every document against the shared dictionary.
    corpus = [dictionary.doc2bow(tokens) for tokens in docs]
    return corpus, dictionary

In [55]:
# Whitespace-tokenize each document of the toy corpus.
docs_words = [document.split() for document in CORPUS]

In [56]:
# Display the tokenized documents.
docs_words

[['apple', 'banana', 'apple', 'banana', 'orange'],
 ['apple', 'orange', 'banana', 'orange'],
 ['orange', 'apple', 'apple', 'banana', 'apple'],
 ['carrot', 'spinach', 'eggplant', 'carrot'],
 ['spinach', 'carrot', 'potato', 'spinach'],
 ['carrot', 'potato', 'eggplant', 'eggplant']]

In [57]:
# Build the bag-of-words corpus and its dictionary from the tokenized documents.
corpus, dictionary = build_doc_word_matrix(docs_words)

In [58]:
# Latent Semantic Indexing model with 2 topics; the dictionary is passed as id2word.
model = LsiModel(corpus, num_topics=2, id2word=dictionary)

In [59]:
# Bag-of-words corpus: for each document, a list of (token_id, count) pairs.
corpus

[[(0, 2), (1, 2), (2, 1)],
 [(0, 1), (1, 1), (2, 2)],
 [(0, 3), (1, 1), (2, 1)],
 [(3, 2), (4, 1), (5, 1)],
 [(3, 1), (5, 2), (6, 1)],
 [(3, 1), (4, 2), (6, 1)]]

In [60]:
# Number of topics the LSI model was configured with.
model.num_topics

2