In [95]:
import numpy as np

# row:문서, column : 단어인 DTM이라고 가정
X = np.array([[1, 1, 0, 0, 0],
              [0, 1, 0, 0, 0],
              [0, 0, 1, 0, 0],
              [0, 0, 0, 1, 0]])

In [96]:
X.ndim, X.shape

(2, (4, 5))

In [97]:
X[0] # 0번째 문서에 대한 단어의 존재 유무 확인 가능

array([1, 1, 0, 0, 0])

In [98]:
X[:,0] 

array([1, 0, 0, 0])

In [99]:
# 각 문서들 간의 내적을 수행한 후 문서의 길이로 정규화(cosine)
X[0], X[1]
# X[0].dot(X[1]) / (norm(X[0])*X[1]) 을 하고 싶으나 인덱스를 안써야 하는 상황

(array([1, 1, 0, 0, 0]), array([0, 1, 0, 0, 0]))

In [100]:
X

array([[1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]])

In [101]:
X.T

array([[1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0]])

In [102]:
X[0], X.T[:, 0] # T를 했으므로 column에 대해 0으로 호출되어야 함. 

(array([1, 1, 0, 0, 0]), array([1, 1, 0, 0, 0]))

In [103]:
X.shape, X.T.shape

((4, 5), (5, 4))

In [104]:
_X = X.dot(X.T) # TDM이나 DTM할때는 innerproduct라는 변수에 저장했었음. 

In [105]:
# _X / X row마다 normalize해주고, X.T column마다 normalize해줘야 함. 
np.linalg.norm(X, axis=1) # linear algebra에 있는 norm함수 사용. 열에 대해서 수행하므로 axis=1

array([1.41421356, 1.        , 1.        , 1.        ])

In [106]:
from math import sqrt
sqrt(1+1) # 위의 norm이 잘 되었음을 확인할 수 있음.

1.4142135623730951

In [107]:
print(np.linalg.norm(X, axis=1).shape)
length = np.linalg.norm(X, axis=1)

(4,)


In [108]:
length * length.T

array([2., 1., 1., 1.])

In [109]:
length.shape, length.T.reshape(1,4).shape

((4,), (1, 4))

In [110]:
length.reshape(4,1) * length.T.reshape(1,4) # 0번째가 아닌 문서들은 길이가 1이라 1이 출력됨.

array([[2.        , 1.41421356, 1.41421356, 1.41421356],
       [1.41421356, 1.        , 1.        , 1.        ],
       [1.41421356, 1.        , 1.        , 1.        ],
       [1.41421356, 1.        , 1.        , 1.        ]])

In [111]:
_X / (length.reshape(4,1) * length.T.reshape(1,4)) # cosine simmilarity

array([[1.        , 0.70710678, 0.        , 0.        ],
       [0.70710678, 1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [112]:
C = np.array([[1, 0, 1, 0, 0, 0],
              [0, 1, 0, 0, 0, 0],
              [1, 1, 0, 0, 0, 0],
              [1, 0, 0, 1, 1, 0],
              [0, 0, 0, 1, 0, 1]])

In [113]:
import pandas as pd

V = ['ship', 'boat', 'ocean', 'wood', 'tree']
_C = pd.DataFrame(C, index=V)
_C

Unnamed: 0,0,1,2,3,4,5
ship,1,0,1,0,0,0
boat,0,1,0,0,0,0
ocean,1,1,0,0,0,0
wood,1,0,0,1,1,0
tree,0,0,0,1,0,1


In [114]:
U, sigma, V = np.linalg.svd(C) # singular vector decomposition (U, sigma, V 세 개의 행렬로 쪼개져서 나오므로 리턴값은 3개)

In [115]:
U.shape, sigma.shape, V.shape 

((5, 5), (5,), (6, 6))

In [116]:
pd.DataFrame(V) # V.shape가 (6,6)으로 나오는 이유는 np.linalg.svd의 인자에서 full_matrices가 True이기 때문. 

Unnamed: 0,0,1,2,3,4,5
0,0.748623,0.2797116,0.2036288,0.4465631,0.325096,0.121467
1,-0.286454,-0.5284591,-0.1857612,0.6255207,0.21988,0.405641
2,-0.2797116,0.748623,-0.4465631,0.2036288,-0.121467,0.325096
3,-3.885781e-16,-1.110223e-16,0.5773503,4.996004e-16,-0.57735,0.57735
4,0.5284591,-0.286454,-0.6255207,-0.1857612,-0.405641,0.21988
5,-1.956847e-17,1.205047e-16,1.956847e-17,-0.5773503,0.57735,0.57735


In [117]:
U, sigma, V = np.linalg.svd(C, full_matrices=False) 

In [118]:
U.shape, sigma.shape, V.shape # sigma.shape는 중요한 차원을 나열한 것. 

((5, 5), (5,), (5, 6))

In [119]:
pd.DataFrame(V)

Unnamed: 0,0,1,2,3,4,5
0,0.748623,0.2797116,0.203629,0.4465631,0.325096,0.121467
1,-0.286454,-0.5284591,-0.185761,0.6255207,0.21988,0.405641
2,-0.2797116,0.748623,-0.446563,0.2036288,-0.121467,0.325096
3,-3.885781e-16,-1.110223e-16,0.57735,4.996004e-16,-0.57735,0.57735
4,0.5284591,-0.286454,-0.625521,-0.1857612,-0.405641,0.21988


In [120]:
sigma # digonal한 값들을 일렬로 쭉 늘어놓은 형태

array([2.16250096, 1.59438237, 1.27529025, 1.        , 0.39391525])

In [121]:
np.diag(sigma).shape

(5, 5)

In [122]:
np.diag(sigma[:2]).shape # 중요한 차원 2개만 쓰기

(2, 2)

In [123]:
U[:,:2].shape, V[:2].shape

((5, 2), (2, 6))

In [133]:
pd.DataFrame(U.dot(np.diag(sigma)).dot(V)) # 매트릭스를 분해했다가 합쳤음에도 아래 C와 근사한 값이 나온다. 

Unnamed: 0,0,1,2,3,4,5
0,1.0,8.331675e-16,1.0,1.013949e-16,1.986953e-16,-5.219754e-17
1,1.403886e-16,1.0,2.813274e-16,2.442866e-16,2.24209e-16,-2.297226e-16
2,1.0,1.0,-7.621517e-17,5.29094e-16,2.444524e-16,3.484147e-17
3,1.0,4.893808e-16,-8.673407e-16,1.0,1.0,4.119034e-16
4,2.859311e-16,-1.943834e-16,-9.074739000000001e-17,1.0,5.329511e-16,1.0


In [131]:
pd.DataFrame((U[:,:2].dot(np.diag(sigma[:2])).dot(V[:2]))) # 바로 윗 셀보다 차원을 더 줄인 결과
# 차원을 줄인(decomposition) 결과 필요없는 아웃라이어가 제거되고 중요한 값만 부각된다. 

Unnamed: 0,0,1,2,3,4,5
0,0.848146,0.515902,0.281625,0.12986,0.205743,-0.075882
1,0.360778,0.357508,0.155125,-0.205653,-0.025264,-0.180389
2,1.00327,0.718285,0.360778,-0.050529,0.155125,-0.205653
3,0.978006,0.12986,0.205743,1.028534,0.617139,0.411396
4,0.12986,-0.386042,-0.075882,0.898674,0.411396,0.487278


In [132]:
_C # 원본 행렬 C

Unnamed: 0,0,1,2,3,4,5
ship,1,0,1,0,0,0
boat,0,1,0,0,0,0
ocean,1,1,0,0,0,0
wood,1,0,0,1,1,0
tree,0,0,0,1,0,1


In [136]:
Ct = C.T # 원본
print(Ct.dot(Ct.T) / (np.linalg.norm(Ct, axis=1).reshape(Ct.shape[0], 1) * np.linalg.norm(Ct, axis=1).T))

[[1.         0.40824829 0.57735027 0.40824829 0.57735027 0.        ]
 [0.40824829 1.         0.         0.         0.         0.        ]
 [0.57735027 0.         1.         0.         0.         0.        ]
 [0.40824829 0.         0.         1.         0.70710678 0.70710678]
 [0.57735027 0.         0.         0.70710678 1.         0.        ]
 [0.         0.         0.         0.70710678 0.         1.        ]]


In [140]:
CC = U[:,:2].dot(np.diag(sigma[:2])).dot(V[:2]) # 차원축소
print(CC)

[[ 0.8481456   0.51590232  0.28162515  0.12986018  0.20574267 -0.07588249]
 [ 0.36077778  0.35750764  0.15512454 -0.20565325 -0.02526436 -0.18038889]
 [ 1.00327014  0.71828543  0.36077778 -0.05052871  0.15512454 -0.20565325]
 [ 0.97800578  0.12986018  0.20574267  1.0285345   0.61713858  0.41139591]
 [ 0.12986018 -0.38604214 -0.07588249  0.89867432  0.41139591  0.4872784 ]]


In [141]:
C.T.dot(C)

array([[3, 1, 1, 1, 1, 0],
       [1, 2, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 2, 1, 1],
       [1, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 1]])

In [143]:
CCt = CC.T
print(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

[[ 1.          0.78183738  0.9501362   0.47443204  0.74011848  0.11059579]
 [ 0.78183738  1.          0.93727576 -0.17791796  0.15937503 -0.53318971]
 [ 0.9501362   0.93727576  1.          0.17626898  0.49351152 -0.20484118]
 [ 0.47443204 -0.17791796  0.17626898  1.          0.94311169  0.92736214]
 [ 0.74011848  0.15937503  0.49351152  0.94311169  1.          0.75020516]
 [ 0.11059579 -0.53318971 -0.20484118  0.92736214  0.75020516  1.        ]]


In [146]:
# 각 열은 hidden variable인 잠재의미(클러스터)를 나타내고, 그때의 각 행(단어)은 해당 잠재의미에서의 중요도를 나타냄. 
pd.DataFrame(U[:,:2].dot(np.diag(sigma[:2])), index = ['ship', 'boat', 'ocean', 'wood', 'tree'])

Unnamed: 0,0,1
ship,0.952252,-0.472215
boat,0.279712,-0.528459
ocean,1.028335,-0.814913
wood,1.520282,0.558946
tree,0.56803,1.031162


In [147]:
data = pd.DataFrame(U[:,:2].dot(np.diag(sigma[:2])), index = ['ship', 'boat', 'ocean', 'wood', 'tree'])

In [152]:
# value를 갖고 sort하되 [0]번째를 기준으로 내림차순 정렬 수행. 이후 dict에 넣어 word_cloud에 넣기 좋은 형태로 반환
data.sort_values(by=[0], ascending=False)[0].to_dict() 

{'wood': 1.520282114638552,
 'ocean': 1.0283346514510263,
 'ship': 0.9522518499797589,
 'tree': 0.5680302646587932,
 'boat': 0.2797116034997002}

In [153]:
cluster1 = data.sort_values(by=[1], ascending = False)
cluster1[cluster1[1] > 0][1].to_dict() # 1번째 latent semantic vector에서 두 단어가 관계가 있음을 확인할 수 있다. 

{'tree': 1.031161645026428, 'wood': 0.5589464674814452}

In [154]:
# 문서 레벨에서 확인해보기
data = pd.DataFrame(np.diag(sigma[:3]).dot(V[:3]))

In [157]:
data # 이것을 시각화하면 K-means와는 또 다른 방식으로 클러스터링을 수행해 볼 수 있음. 

Unnamed: 0,0,1,2,3,4,5
0,1.618898,0.604877,0.440347,0.965693,0.70302,0.262673
1,-0.456717,-0.842566,-0.296174,0.997319,0.350572,0.646747
2,-0.356713,0.954712,-0.569498,0.259686,-0.154906,0.414592


In [205]:
clusterData = np.diag(sigma[:3]).dot(V[:3]) # 바로 윗 셀보다 차원을 더 줄인 결과
# 차원을 줄인(decomposition) 결과 필요없는 아웃라이어가 제거되고 중요한 값만 부각된다. 

In [159]:
CCt = clusterData.T
print(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

[[ 1.          0.42223483  0.78542247  0.41805317  0.75047307 -0.01291529]
 [ 0.42223483  1.         -0.02533697 -0.00414719 -0.01597348  0.00853927]
 [ 0.78542247 -0.02533697  1.         -0.01640023  0.47162423 -0.49365825]
 [ 0.41805317 -0.00414719 -0.01640023  1.          0.87394629  0.87763513]
 [ 0.75047307 -0.01597348  0.47162423  0.87394629  1.          0.53404114]
 [-0.01291529  0.00853927 -0.49365825  0.87763513  0.53404114  1.        ]]


## 다른 예제

1. SVD => U, sigma, Vt (M,N) / (N, M), N개의 문서와 M개의 단어
2. U.sigma, U[:, :K].sigma[:K] => Latent Semantic 차원에서 어느 문서가 중요한지 찾기
3. sigma.Vt, sigma[:K].Vt[:K, :] => Latent Semantic 차원에서 어느 단어가 중요한지
4. 2번으로부터 각 문서가 어느 문서와 유사한지 Latent Semantic Dimensions에서 찾기(내적을 통해 cosine구하기)
5. 3번으로부터 각 단어가 어느 단어와 유사한지 Latent Semantic Dimensions에서 찾기(내적을 통해 cosine구하기)

In [189]:
# DTM 구조로 되어 있음. 
NewC = np.array([[1, 1, 0, 0, 0, 0, 0],
                 [0, 0, 1, 1, 1, 0, 0],
                 [0, 1, 0, 0, 0, 1, 0],
                 [0, 0, 1, 0, 0, 0, 1],
                 [0, 0, 0, 1, 1, 0, 1],
                 [1, 0, 1, 0, 1, 1, 1]])

In [190]:
RowName = ['A', 'B', 'C', 'D', 'E', 'F']
ColumnName = ['cute', 'kitty', 'eat', 'rice', 'cake', 'hamster', 'brad']
_NewC = pd.DataFrame(NewC, index=RowName, columns=ColumnName)
_NewC

Unnamed: 0,cute,kitty,eat,rice,cake,hamster,brad
A,1,1,0,0,0,0,0
B,0,0,1,1,1,0,0
C,0,1,0,0,0,1,0
D,0,0,1,0,0,0,1
E,0,0,0,1,1,0,1
F,1,0,1,0,1,1,1


In [191]:
# 1번과정 수행하기
U, sigma, V = np.linalg.svd(_NewC) # singular vector decomposition (U, sigma, V 세 개의 행렬로 쪼개져서 나오므로 리턴값은 3개)

In [192]:
U.shape, sigma.shape, V.shape

((6, 6), (6,), (7, 7))

In [193]:
pd.DataFrame(V) # V.shape가 (6,6)으로 나오는 이유는 np.linalg.svd의 인자에서 full_matrices가 True이기 때문. 

Unnamed: 0,0,1,2,3,4,5,6
0,0.2706786,0.07887421,0.4901791,0.2966641,0.5279056,0.2706786,0.4901791
1,0.4630926,0.606021,-0.07022424,-0.3810003,-0.2209182,0.4630926,-0.07022424
2,0.03630407,-0.4713829,0.3826723,-0.6078533,-0.3358578,0.03630407,0.3826723
3,-4.710277e-16,4.710277e-16,0.7071068,2.498002e-16,-1.368031e-16,-2.38061e-17,-0.7071068
4,-0.7071068,7.771561e-16,0.0,4.440892e-16,-1.110223e-16,0.7071068,5.551115e-16
5,0.3464841,-0.5598272,-0.3291731,-0.1835952,0.4428025,0.3464841,-0.3291731
6,0.3015113,-0.3015113,-2.220446e-16,0.6030227,-0.6030227,0.3015113,5.5511150000000004e-17


In [194]:
sigma # digonal한 값들을 일렬로 쭉 늘어놓은 형태

array([2.9771718 , 1.87837842, 1.35866397, 1.        , 1.        ,
       0.87302623])

In [195]:
np.diag(sigma).shape

(6, 6)

In [196]:
np.diag(sigma[:3]).shape # 중요한 차원 2개만 쓰기

(3, 3)

In [203]:
print(U[:,:3].shape, V[:2].shape)

# 2번과정 수행해보기
U[:,:3].shape, V[:3].shape

(6, 3) (2, 7)


((6, 3), (3, 7))

In [204]:
pd.DataFrame((U[:,:3].dot(np.diag(sigma[:2])).dot(V[:3])))

Unnamed: 0,0,1,2,3,4,5,6
0,0.57392,0.880565,-0.070227,-0.039169,0.094469,0.57392,-0.070227
1,0.024242,-0.039169,0.476969,0.987155,1.030981,0.024242,0.476969
2,0.57392,0.880565,-0.070227,-0.039169,0.094469,0.57392,-0.070227
3,0.228106,-0.36856,0.78329,-0.120869,0.291517,0.228106,0.78329
4,0.024242,-0.039169,0.476969,0.987155,1.030981,0.024242,0.476969
5,0.83458,0.267275,1.157155,0.087653,0.788595,0.83458,1.157155


In [198]:
pd.DataFrame(U.dot(np.diag(sigma)).dot(V)) # 매트릭스를 분해했다가 합쳤음에도 아래 C와 근사한 값이 나온다. 

ValueError: shapes (6,6) and (7,7) not aligned: 6 (dim 1) != 7 (dim 0)

In [199]:
clusterData = np.diag(sigma[:3]).dot(V[:3])

In [200]:
CCt = clusterData.T
print(CCt.dot(CCt.T) / (np.linalg.norm(CCt, axis=1).reshape(CCt.shape[0], 1) * np.linalg.norm(CCt, axis=1).T))

[[ 1.          0.72879819  0.58904298  0.02907457  0.44070595  1.
   0.58904298]
 [ 0.72879819  1.         -0.06807093 -0.04201136  0.08432534  0.72879819
  -0.06807093]
 [ 0.58904298 -0.06807093  1.          0.43665367  0.80420547  0.58904298
   1.        ]
 [ 0.02907457 -0.04201136  0.43665367  1.          0.86917695  0.02907457
   0.43665367]
 [ 0.44070595  0.08432534  0.80420547  0.86917695  1.          0.44070595
   0.80420547]
 [ 1.          0.72879819  0.58904298  0.02907457  0.44070595  1.
   0.58904298]
 [ 0.58904298 -0.06807093  1.          0.43665367  0.80420547  0.58904298
   1.        ]]
