## Practical 5 - Implementing LSA (Latent Semantic Analysis) and Topic Modelling

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# Download the NLTK stop-word list (does nothing if it is already up to date)
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dhrumin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Read only the first 20 rows of the review CSV.
data = pd.read_csv('Homo Deus.csv', nrows=20)
df = pd.DataFrame(data)

# Mirror the raw review text into a 'documents' column, then discard the
# 'Name' and 'Rating' columns, which the rest of the notebook does not use.
df['documents'] = df['Review']
df = df.drop(columns=['Name', 'Rating'])
df

Unnamed: 0,Review,documents
0,Boy am I glad I am living this day and age rat...,Boy am I glad I am living this day and age rat...
1,Enough is said about the book and the author.....,Enough is said about the book and the author.....
2,This is the most important book i read in last...,This is the most important book i read in last...
3,Same as it's predecessor this book also discus...,Same as it's predecessor this book also discus...
4,A must read in a lifetime !I took quite someti...,A must read in a lifetime !I took quite someti...
5,Books : Sapiens and Homo Deus are certainly ey...,Books : Sapiens and Homo Deus are certainly ey...
6,Thought provoking. Amazing way of raising cont...,Thought provoking. Amazing way of raising cont...
7,An excellent account of some of the possible a...,An excellent account of some of the possible a...
8,Brilliant !. Another great book after Sapiens....,Brilliant !. Another great book after Sapiens....
9,Well this book will widen your perspective muc...,Well this book will widen your perspective muc...


In [15]:
# Toy corpus: five short sentences, one document each.
a1 = "He is a good dog."
a2 = "The dog is too lazy."
a3 = "That is a brown cat."
a4 = "The cat is very active."
a5 = "I have brown cat and dog."

# NOTE(review): this rebinds `df`, so when the notebook is run top to bottom
# the frame read from 'Homo Deus.csv' is replaced by this toy corpus.
df = pd.DataFrame({"documents": [a1, a2, a3, a4, a5]})
df.head()


Unnamed: 0,documents
0,He is a good dog.
1,The dog is too lazy.
2,That is a brown cat.
3,The cat is very active.
4,I have brown cat and dog.


In [42]:
# Preprocessing
# 1. Replace every character that is not a letter or '#' with a space.
#    regex=True is required: without it newer pandas versions treat the
#    pattern as a literal string and nothing is replaced.
# 2. Turn missing values into empty strings.
# 3. Keep only words longer than two characters.
# 4. Lower-case the result.
df['clean_documents'] = (
    df['documents']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .fillna('')
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 2))
    .str.lower()
)

df.head()

Unnamed: 0,Review,documents,clean_documents
0,Boy am I glad I am living this day and age rat...,Boy am I glad I am living this day and age rat...,boy glad living this day and age rather than y...
1,Enough is said about the book and the author.....,Enough is said about the book and the author.....,enough said about the book and the author all ...
2,This is the most important book i read in last...,This is the most important book i read in last...,this the most important book read last five ye...
3,Same as it's predecessor this book also discus...,Same as it's predecessor this book also discus...,same predecessor this book also discusses the ...
4,A must read in a lifetime !I took quite someti...,A must read in a lifetime !I took quite someti...,must read lifetime took quite sometime read th...


In [50]:
# Tokenise each cleaned document, drop English stop words, and join the
# remaining tokens back into a single string per document.
# `stopwords` is imported in the first cell of the notebook.

# Build the stop-word set once; membership tests on a set are O(1), and the
# NLTK word list is no longer re-read for every token.
english_stopwords = set(stopwords.words('english'))

# tokenization + stop-word removal
tokenized_doc = df['clean_documents'].fillna('').apply(
    lambda text: [token for token in text.split() if token not in english_stopwords]
)

# de-tokenization: iterate positionally so this does not depend on `df`
# having a default 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_documents'] = detokenized_doc


In [51]:
# Preview the first rows after stop-word removal.
df.head()

Unnamed: 0,Review,documents,clean_documents
0,Boy am I glad I am living this day and age rat...,Boy am I glad I am living this day and age rat...,boy glad living day age rather years coinciden...
1,Enough is said about the book and the author.....,Enough is said about the book and the author.....,enough said book author tell imagination creat...
2,This is the most important book i read in last...,This is the most important book i read in last...,important book read last five years traces con...
3,Same as it's predecessor this book also discus...,Same as it's predecessor this book also discus...,predecessor book also discusses workings logic...
4,A must read in a lifetime !I took quite someti...,A must read in a lifetime !I took quite someti...,must read lifetime took quite sometime read in...


In [52]:
# Build the TF-IDF document-term matrix from the cleaned documents.
vectorizer = TfidfVectorizer(
    smooth_idf=True,
    stop_words='english',
)
X = vectorizer.fit_transform(df['clean_documents'])

# Show the matrix as a plain array.
X.toarray()


array([[0.12901236, 0.        , 0.        , ..., 0.08090004, 0.09190604,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.14305868,
        0.        ],
       ...,
       [0.        , 0.        , 0.11940667, ..., 0.07487658, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.2996879 , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [54]:
X.shape 
# Shape of the TF-IDF matrix built from df['clean_documents'].
# NOTE(review): the note that used to be here read "A56   U(5,5). S()" -
# presumably SVD factor shapes for the 5-sentence toy corpus; confirm or remove.

(20, 247)

In [55]:
# Fit a 2-component truncated SVD on the TF-IDF matrix X and keep the
# transformed matrix as `lsa`.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)
lsa = svd_model.fit_transform(X)


In [56]:
# Documents - Topic vector
# Show floats with 16 decimal places in every DataFrame displayed from here on.
pd.options.display.float_format = '{:,.16f}'.format

# One column per column of `lsa`, named topic_1..topic_k. Deriving the names
# from `lsa` keeps this cell working if n_components is changed.
topic_columns = [f"topic_{i}" for i in range(1, lsa.shape[1] + 1)]

topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)
topic_encoded_df["documents"] = df['clean_documents']
display(topic_encoded_df[["documents", *topic_columns]])


Unnamed: 0,documents,topic_1,topic_2
0,boy glad living day age rather years coinciden...,0.3981256762048849,0.6071657987895401
1,enough said book author tell imagination creat...,0.3046843686696321,-0.3064528505157263
2,important book read last five years traces con...,0.4340933394847125,-0.2452355154812578
3,predecessor book also discusses workings logic...,0.5234837166997298,-0.2959380524831141
4,must read lifetime took quite sometime read in...,0.3508617181115348,-0.16248876057019
5,books sapiens homo deus certainly eye openers ...,0.3109437675653213,0.5586301837503166
6,thought provoking amazing way raising controve...,0.2733668456164287,0.2041781165484859
7,excellent account possible plausible scenarios...,0.4113174200673417,-0.3002215311670962
8,brilliant another great book sapiens excellent...,0.4938932434627802,-0.0831711147592769
9,well book widen perspective much like homo sap...,0.4233403299919939,0.2482020455975403


In [57]:
# Features or words used as features
# `get_feature_names()` was removed from scikit-learn in 1.2; prefer
# `get_feature_names_out()` and fall back only on versions that lack it.
# `.tolist()` keeps `dictionary` a plain list of str, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

In [58]:
# Show the feature names returned by the vectorizer.
dictionary

['accelerating',
 'according',
 'account',
 'added',
 'advances',
 'advise',
 'age',
 'algorithm',
 'amazing',
 'analysis',
 'answer',
 'articles',
 'articulately',
 'artificial',
 'asks',
 'atheist',
 'author',
 'away',
 'babies',
 'based',
 'behavior',
 'better',
 'bible',
 'bio',
 'black',
 'blows',
 'book',
 'books',
 'bow',
 'boy',
 'brain',
 'breadth',
 'brilliant',
 'buy',
 'came',
 'canvas',
 'certainly',
 'challenged',
 'cheers',
 'children',
 'chilled',
 'choose',
 'clarity',
 'clear',
 'clumps',
 'coincidence',
 'combined',
 'coming',
 'completely',
 'concepts',
 'conferences',
 'confused',
 'confusion',
 'contemporary',
 'controversial',
 'convincingly',
 'copy',
 'created',
 'creative',
 'dark',
 'data',
 'day',
 'deathwish',
 'decrbed',
 'definition',
 'delivery',
 'designer',
 'deus',
 'different',
 'discuss',
 'discusses',
 'dissolving',
 'disturbing',
 'dont',
 'driven',
 'easy',
 'end',
 'energy',
 'engineering',
 'enjoyed',
 'enriched',
 'evolution',
 'excellent',
 '

In [59]:
# Term-Topic matrix
# `svd_model.components_` has one row per topic; label the rows
# topic_1..topic_k from its shape so this cell does not break when
# n_components changes, then transpose so terms are rows and topics columns.
topic_labels = [f"topic_{i}" for i in range(1, svd_model.components_.shape[0] + 1)]
encoding_matrix = pd.DataFrame(
    svd_model.components_,
    index=topic_labels,
    columns=dictionary,
).T


In [60]:
# Display the term-topic matrix (terms as rows, topics as columns).
encoding_matrix

Unnamed: 0,topic_1,topic_2
accelerating,0.0320887645936997,0.0681682105192582
according,0.0792611445968919,0.0647318226529338
account,0.0306836622782651,-0.0311970630042577
added,0.0391652019610703,-0.0548726134674279
advances,0.0306836622782651,-0.0311970630042577
...,...,...
work,0.0276126617896151,0.0287285303036194
workings,0.0847435629372862,-0.0667338432917115
world,0.0827310620441177,0.1063978525443479
years,0.0951514753118973,-0.0035769135759157
