In [None]:
#Loading the required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## Toy corpus: a list of 6 short documents.
## The irregular spacing inside the strings is kept exactly as authored.
docs = [
    'Romeo  Juliet',
    'Juliet O happy dagger!',
    'Romeo died  dagger',
    'Live free or die  New-Hampshire motto.',
    'Did you know New-Hampshire  New-England.',
    'Romeo in Japan',
]

In [None]:
## Turn the raw text into a structured document-term matrix:
## one row per document, one column per unique word, TF-IDF weighted.
tfidf = TfidfVectorizer()
mat = tfidf.fit_transform(docs)

In [None]:
### `mat` is a sparse matrix, so evaluating it shows only a summary (shape, dtype, stored elements)
mat

<6x18 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [None]:
## Dense view of the TF-IDF matrix, to inspect the actual weights of this small corpus
mat.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.7640961 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.64510243, 0.        ],
        [0.5355058 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.65304446, 0.        , 0.        ,
         0.5355058 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.55902156, 0.        , 0.        , 0.68172171, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.47196441, 0.        ],
        [0.        , 0.        , 0.39699901, 0.        , 0.        ,
         0.39699901, 0.32554487, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.39699901, 0.39699901, 0.32554487,
         0.39699901, 0.        , 0

In [None]:
## View the document-term matrix as a table: the source text goes in the
## first column, followed by one TF-IDF column per vocabulary word.
term_weights = pd.DataFrame(mat.toarray(), columns=tfidf.get_feature_names_out())
Data = pd.concat([pd.Series(docs), term_weights], axis=1)

In [None]:
## Display the document-term table (original text plus one TF-IDF column per word)
Data

Unnamed: 0,0,dagger,did,die,died,england,free,hampshire,happy,in,japan,juliet,know,live,motto,new,or,romeo,you
0,Romeo Juliet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.764096,0.0,0.0,0.0,0.0,0.0,0.645102,0.0
1,Juliet O happy dagger!,0.535506,0.0,0.0,0.0,0.0,0.0,0.0,0.653044,0.0,0.0,0.535506,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Romeo died dagger,0.559022,0.0,0.0,0.681722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471964,0.0
3,Live free or die New-Hampshire motto.,0.0,0.0,0.396999,0.0,0.0,0.396999,0.325545,0.0,0.0,0.0,0.0,0.0,0.396999,0.396999,0.325545,0.396999,0.0,0.0
4,Did you know New-Hampshire New-England.,0.0,0.368552,0.0,0.0,0.368552,0.0,0.302218,0.0,0.0,0.0,0.0,0.368552,0.0,0.0,0.604436,0.0,0.0,0.368552
5,Romeo in Japan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.635091,0.635091,0.0,0.0,0.0,0.0,0.0,0.0,0.439681,0.0


In [None]:
### A test query: the goal is to retrieve the corpus documents most relevant to it.
### It is vectorized with the already-fitted `tfidf`, so it shares the corpus vocabulary.
test = 'die dagger'
t1 = tfidf.transform([test])

In [None]:
## Dense view of the query vector over the corpus vocabulary
t1.todense()

matrix([[0.6340862 , 0.        , 0.77326237, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])

In [None]:
## Cosine similarity between the query and every document in raw TF-IDF space.
## cosine_similarity accepts sparse matrices directly and returns a dense ndarray
## by default, so there is no need to densify the inputs first.
sim1 = cosine_similarity(t1, mat)
sim1

array([[0.        , 0.33955684, 0.35446786, 0.3069844 , 0.        ,
        0.        ]])

In [None]:
## To understand the hidden relations we do matrix factorization (SVD) - we get 3 entities that expose relations:
## 1. a documents-topics matrix, 2. the topic strengths (singular values), 3. a topics-words matrix

## A singular value is the square root of an eigenvalue of (A^T A) and measures how much of the matrix its topic captures;
## small singular values carry little information, so we keep only the top n components (2 in this case).
## The default TruncatedSVD solver is randomized, so random_state is fixed to make re-runs reproducible.
tsvt = TruncatedSVD(n_components=2, random_state=0)

In [None]:
## Show the estimator's repr (it has not been fitted on the corpus at this point)
tsvt

TruncatedSVD()

In [None]:
### Fit the truncated SVD on the document-term matrix and project the documents into topic space.
### `lsa` holds only the documents-to-topics coordinates; the topics-to-words relation
### stays on the fitted model as `tsvt.components_`.
lsa = tsvt.fit_transform(mat)

In [None]:
## The value returned by fit_transform is the documents-to-topics matrix,
## i.e. how strongly each document (row) talks about each topic (column)
lsa

array([[ 7.91975656e-01, -7.43150395e-17],
       [ 6.77369665e-01, -8.99237022e-16],
       [ 6.90765217e-01,  1.85524541e-15],
       [-1.50281571e-16,  8.04722430e-01],
       [-2.53627383e-16,  8.04722430e-01],
       [ 4.69548168e-01, -8.08147584e-16]])

In [None]:
# Import from the public IPython.display module; the IPython.core.display path is deprecated.
from IPython.display import display
## A more readable view of the documents-to-topics matrix: one row per document
## with its original text and its strength on each of the two topics.
Data = pd.DataFrame(lsa, columns=['Topic1', 'Topic2'])
Data['Original'] = docs
display(Data[['Original', 'Topic1', 'Topic2']])

Unnamed: 0,Original,Topic1,Topic2
0,Romeo Juliet,0.7919757,-7.431504000000001e-17
1,Juliet O happy dagger!,0.6773697,-8.99237e-16
2,Romeo died dagger,0.6907652,1.855245e-15
3,Live free or die New-Hampshire motto.,-1.502816e-16,0.8047224
4,Did you know New-Hampshire New-England.,-2.536274e-16,0.8047224
5,Romeo in Japan,0.4695482,-8.081476e-16


- Observe that the documents related to Romeo, Juliet, etc. have a high value for Topic 1, while the New-Hampshire documents have a high value for Topic 2.

In [None]:
## Vocabulary learned by the vectorizer (the column labels of the document-term matrix)
dictionary = tfidf.get_feature_names_out()
dictionary

array(['dagger', 'did', 'die', 'died', 'england', 'free', 'hampshire',
       'happy', 'in', 'japan', 'juliet', 'know', 'live', 'motto', 'new',
       'or', 'romeo', 'you'], dtype=object)

In [None]:
## The vocabulary has 18 words; `components_` gives the topics-to-words relation,
## one row per topic and one column per word.
topic_word = tsvt.components_
print(topic_word)
print(topic_word.shape)

[[ 4.19853910e-01 -7.45982907e-17 -6.95310927e-17  2.64009102e-01
  -1.09771327e-16 -3.27773418e-17 -8.20095743e-17  2.47998930e-01
   1.67184969e-01  1.67184969e-01  5.42629287e-01 -1.09530544e-16
  -3.28576028e-17 -3.28576028e-17 -1.34648473e-16 -3.28576028e-17
   5.84953201e-01 -1.06196271e-16]
 [ 5.30895740e-16  2.28993171e-01  2.46668290e-01  1.75476841e-15
   2.28993171e-01  2.46668290e-01  3.90049203e-01 -1.20127943e-15
  -9.01896346e-16 -9.01896346e-16 -7.45175663e-16  2.28993171e-01
   2.46668290e-01  2.46668290e-01  5.77826879e-01  2.46668290e-01
   7.67429720e-16  2.28993171e-01]]
(2, 18)


In [None]:
### A clearer view of which word loads on which topic:
### one row per vocabulary word, one column per topic.
encoding_matrix = pd.DataFrame(
    tsvt.components_.T,
    index=dictionary,
    columns=['Topic1', 'Topic2'],
)
encoding_matrix

Unnamed: 0,Topic1,Topic2
dagger,0.4198539,5.308957e-16
did,-7.459829000000001e-17,0.2289932
die,-6.953109000000001e-17,0.2466683
died,0.2640091,1.754768e-15
england,-1.097713e-16,0.2289932
free,-3.2777340000000004e-17,0.2466683
hampshire,-8.200957e-17,0.3900492
happy,0.2479989,-1.201279e-15
in,0.167185,-9.018963e-16
japan,0.167185,-9.018963e-16


- Observe that dagger, happy, died, romeo, etc. load on Topic 1, while die, motto, live, etc. load on Topic 2.

In [None]:
## The search query "die dagger" has already been converted into a TF-IDF vector over the corpus vocabulary (`t1`).
t1.todense()

matrix([[0.6340862 , 0.        , 0.77326237, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])

In [None]:
## Shape of the query vector: (number of queries, vocabulary size)
t1.shape

(1, 18)

In [None]:
## To retrieve relevant documents by the topics they contain, the test query must also be projected into LSA space.
## The documents in LSA space have shape 6 x 2: one row per document, one column per topic.
query_vec = tsvt.transform(t1)
query_vec

array([[0.26622357, 0.19073931]])

In [None]:

## A calculation to show how the documents-to-topics matrix is created:
## the document-term matrix is multiplied with the transposed topics-to-words
## matrix (tsvt.components_.T).
## The `@` product of the sparse matrix with a dense ndarray yields a plain
## ndarray rather than the discouraged np.matrix type.
doc_trans = mat @ tsvt.components_.T
doc_trans

matrix([[ 7.91975656e-01, -7.43150395e-17],
        [ 6.77369665e-01, -8.99237022e-16],
        [ 6.90765217e-01,  1.85524541e-15],
        [-1.50281571e-16,  8.04722430e-01],
        [-2.53627383e-16,  8.04722430e-01],
        [ 4.69548168e-01, -8.08147584e-16]])

In [None]:
## Now get the cosine similarity of query to all the documents in the document-topic space
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## Similarity of the query to each document in topic space.
## np.asarray guarantees `doc_trans` is a plain ndarray; `query_vec` already is one.
sim = cosine_similarity(query_vec, np.asarray(doc_trans))

In [None]:
## Flatten the similarity array to one dimension (one score per document)
sim = sim.ravel()
sim

array([0.81289556, 0.81289556, 0.81289556, 0.58240949, 0.58240949,
       0.81289556])

In [None]:
## Show each document's original text next to its LSA-space similarity to the query
pd.concat([Data['Original'], pd.Series(sim)], axis=1)

Unnamed: 0,Original,0
0,Romeo Juliet,0.812896
1,Juliet O happy dagger!,0.812896
2,Romeo died dagger,0.812896
3,Live free or die New-Hampshire motto.,0.582409
4,Did you know New-Hampshire New-England.,0.582409
5,Romeo in Japan,0.812896


- Observe that when 'die dagger' is given as the query, the document "Romeo Juliet" is also found to be similar even though it shares no word with the query, because 'dagger' co-occurs with 'romeo' and 'juliet' in other documents.

In [None]:
# Singular-value decomposition of a small standalone matrix
from numpy import array
from scipy.linalg import svd
# define a 3x2 matrix
A = array([[1,2], [3,4], [5,6]])
print(A)
# Full SVD: U is 3x3, s holds the 2 singular values, VT is 2x2
U, s, VT = svd(A)
print(U)
print(s)
print(VT)
print("<---------------------------->")
# Rebuild A from its factors. Only the first len(s) columns of U contribute;
# multiplying them by s scales each column by its singular value, which is
# the same as U_k @ diag(s). The result should match A up to rounding.
A_reconstructed = (U[:, :len(s)] * s) @ VT
print(A_reconstructed)

[[1 2]
 [3 4]
 [5 6]]
[[-0.2298477   0.88346102  0.40824829]
 [-0.52474482  0.24078249 -0.81649658]
 [-0.81964194 -0.40189603  0.40824829]]
[9.52551809 0.51430058]
[[-0.61962948 -0.78489445]
 [-0.78489445  0.61962948]]
<---------------------------->
