PRACTICAL 5

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# Make sure the NLTK stop-word corpus is available locally before importing it.
# When the package is already present, the download call only logs that it is up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE(review): none of the cells visible here read `stop_words`; the
# TfidfVectorizer is configured with stop_words='english' instead. Confirm whether
# this list was meant to be passed to the vectorizer.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Toy corpus: five short sentences, kept under their individual names a1..a5.
a1, a2, a3, a4, a5 = (
    "She is a good girl",
    "He is a good boy",
    "The boy and the girl are good",
    "The boy is intelligent",
    "The girl is good at sports",
)

# One row per sentence, stored in a single "documents" column.
df = pd.DataFrame({"documents": [a1, a2, a3, a4, a5]})
df.head()


Unnamed: 0,documents
0,She is a good girl
1,He is a good boy
2,The boy and the girl are good
3,The boy is intelligent
4,The girl is good at sports


In [None]:
# Preprocessing
# Build `clean_documents` from the raw `documents` column in one pass:
#   1. Replace every character that is not an ASCII letter or '#' with a space.
#      regex=True is passed explicitly: since pandas 2.0, Series.str.replace
#      treats the pattern as a literal string by default, which would turn this
#      step into a silent no-op.
#   2. Treat missing values as empty strings so the token filter can run on them.
#   3. Keep only tokens longer than two characters.
#   4. Lower-case the result.
df['clean_documents'] = (
    df['documents']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .fillna('')
    .apply(lambda text: ' '.join(w for w in text.split() if len(w) > 2))
    .str.lower()
)

df.head()

Unnamed: 0,documents,clean_documents
0,She is a good girl,she good girl
1,He is a good boy,good boy
2,The boy and the girl are good,the boy and the girl are good
3,The boy is intelligent,the boy intelligent
4,The girl is good at sports,the girl good sports


In [None]:
# TF-IDF vector
# Configure the vectorizer: drop English stop words and use smoothed IDF weights.
vectorizer = TfidfVectorizer(
    smooth_idf=True,
    stop_words='english',
)
# Learn the vocabulary from the cleaned sentences and encode them in one step;
# X is sparse, so densify it only for display.
X = vectorizer.fit_transform(df['clean_documents'])
X.toarray()


array([[0.        , 0.76524053, 0.64374446, 0.        , 0.        ],
       [0.76524053, 0.        , 0.64374446, 0.        , 0.        ],
       [0.60771799, 0.60771799, 0.51123153, 0.        , 0.        ],
       [0.55645052, 0.        , 0.        , 0.83088075, 0.        ],
       [0.        , 0.5039682 , 0.42395393, 0.        , 0.75251519]])

In [None]:
X.shape 
# Shape of the TF-IDF matrix: (number of documents, number of vocabulary terms).
# NOTE(review): this comment originally read "A56   U(5,5). S()", which is
# unreadable as written — presumably a sketch of the SVD factorisation
# A = U * S * V^T of this matrix. Confirm the intended dimensions before restoring it.

(5, 5)

In [None]:
# SVD represent documents and terms in vectors
svd_model = TruncatedSVD(
    n_components=2,        # keep two components (labelled topic_1 / topic_2 later on)
    algorithm='randomized',
    n_iter=100,
    random_state=122,      # fixed seed for the randomized solver
)
# Fit the decomposition on the TF-IDF matrix and project every document onto
# the retained components in a single call.
lsa = svd_model.fit_transform(X)


In [None]:
#Documents - Topic vector
# Show floats with 16 decimal places (this changes pandas' display option globally).
pd.set_option('display.float_format', '{:,.16f}'.format)

# One row per document: its coordinates on the two SVD components, plus the
# cleaned text attached (aligned on the row index) for readability.
topic_encoded_df = pd.DataFrame(lsa, columns=["topic_1", "topic_2"]).assign(
    documents=df['clean_documents']
)

# Put the text column first when rendering.
display(topic_encoded_df.loc[:, ["documents", "topic_1", "topic_2"]])


Unnamed: 0,documents,topic_1,topic_2
0,she good girl,0.8366611711811012,-0.4078846681578382
1,good boy,0.7866414212292849,0.4081667993574549
2,the boy and the girl are good,0.9604770446250778,0.0711879205775066
3,the boy intelligent,0.3580019421318089,0.8135264740657662
4,the girl good sports,0.6880582107289787,-0.493327545467781


In [None]:
# Features or words used as features
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so it raises AttributeError on current releases.
# get_feature_names_out() is the replacement; it returns a NumPy array, so
# convert to a plain list to keep `dictionary` the same type it had before.
dictionary = vectorizer.get_feature_names_out().tolist()



In [None]:
# Display the vocabulary terms obtained from the fitted vectorizer.
dictionary

['boy', 'girl', 'good', 'intelligent', 'sports']

In [None]:
# Term-Topic matrix
# svd_model.components_ is laid out as (topic, term); transpose it so that each
# row is a vocabulary term and each column is a topic.
encoding_matrix = pd.DataFrame(
    svd_model.components_.T,
    index=dictionary,
    columns=["topic_1", "topic_2"],
)


In [None]:
# Display the term-topic weights: one row per vocabulary term, one column per topic.
encoding_matrix

Unnamed: 0,topic_1,topic_2
boy,0.4871340755171591,0.6501547104052317
girl,0.5524988175834399,-0.4162439368956839
good,0.642905234086832,-0.1388096400484909
intelligent,0.1046310584548834,0.5436972126335118
sports,0.1821281159769408,-0.2986051947633028
