# imports

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np


In [19]:

# Toy corpus: seven short English sentences touching on a few themes
# (pollution, concerts, cooking, technology).
# NOTE(review): sentence 1 spells "polution" while sentence 7 spells "pollution";
# they are different tokens, so confirm whether the misspelling is intentional.
dataset = [
    "The amount of polution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

In [20]:
# Sanity check: container type and number of sentences in the corpus.
type(dataset), len(dataset)

(list, 7)

In [21]:
# Normalise case so that e.g. "The" and "the" are the same token.
# TfidfVectorizer also lowercases by default (lowercase=True), so this step
# mainly makes the normalised text visible before vectorizing.
dataset = list(map(str.lower, dataset))
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [22]:
# Fit a TF-IDF vectorizer (default settings) on the corpus and transform it in
# one step. X is a sparse matrix with one row per sentence and one column per
# vocabulary term.
tfid_vectorizer = TfidfVectorizer()
X = tfid_vectorizer.fit_transform(dataset)
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 51 stored elements and shape (7, 42)>

In [23]:
# Printing a sparse row lists only its stored (non-zero) entries as
# (row, column) coordinates with their TF-IDF values.
print(X[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 42)>
  Coords	Values
  (0, 34)	0.22786438777524437
  (0, 2)	0.3211483974289088
  (0, 24)	0.22786438777524437
  (0, 26)	0.3211483974289088
  (0, 19)	0.2665807498646048
  (0, 17)	0.3211483974289088
  (0, 9)	0.6422967948578177
  (0, 5)	0.3211483974289088


In [24]:
# Densify the sparse TF-IDF matrix and label each column with its vocabulary
# term, giving a readable documents-by-terms table.
X_df = pd.DataFrame(
    data=X.toarray(),
    columns=tfid_vectorizer.get_feature_names_out(),
)
X_df

Unnamed: 0,ai,all,amount,and,are,by,campaigns,concert,cook,day,...,stop,technology,the,to,today,us,warming,was,we,were
0,0.0,0.0,0.321148,0.0,0.0,0.321148,0.0,0.0,0.0,0.642297,...,0.0,0.0,0.227864,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42133,0.0,0.0,...,0.0,0.0,0.360139,0.0,0.0,0.0,0.0,0.507574,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.0,...,0.0,0.0,0.0,0.348019,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.396717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.356201,0.0,0.0,0.0,0.356201,0.0,0.0,0.0,0.0,0.0,...,0.0,0.295677,0.0,0.0,0.356201,0.0,0.0,0.0,0.0,0.0
5,0.0,0.386452,0.0,0.0,0.0,0.0,0.0,0.320788,0.0,0.0,...,0.0,0.0,0.274199,0.0,0.0,0.386452,0.0,0.0,0.0,0.386452
6,0.0,0.0,0.0,0.321262,0.0,0.0,0.321262,0.0,0.0,0.0,...,0.321262,0.0,0.0,0.266675,0.0,0.0,0.321262,0.0,0.321262,0.0


In [25]:
# Latent Semantic Analysis: truncated SVD of the TF-IDF matrix, keeping 4
# components ("concepts"). random_state is pinned for reproducibility.
lsa = TruncatedSVD(n_components=4, random_state=100)
lsa.fit(X)

In [26]:
# components_ holds the right singular vectors (the V-transpose matrix of the
# SVD): one row per component, one column per vocabulary term. This cell only
# checks its shape.
lsa.components_.shape

(4, 42)

In [27]:
# Tabulate the concept-by-term loadings: one row per LSA component, one column
# per vocabulary term.
# components_ is already a 2-D NumPy array, so it is handed to DataFrame
# directly instead of being converted to a nested Python list first; this is
# the same way X_df is built from X.toarray().
lsa_df = pd.DataFrame(lsa.components_, columns=tfid_vectorizer.get_feature_names_out())
lsa_df

Unnamed: 0,ai,all,amount,and,are,by,campaigns,concert,cook,day,...,stop,technology,the,to,today,us,warming,was,we,were
0,0.124192,0.1782403,0.1144608,1.984614e-16,0.124192,0.1144608,2.150601e-16,0.3449887,1.782018e-16,0.2289216,...,2.14576e-16,0.1838383,0.3760983,3.210458e-16,0.124192,0.1782403,2.137145e-16,0.2373658,2.137145e-16,0.1782403
1,1.847834e-15,-1.002068e-15,-4.41438e-17,0.2173064,2.006211e-15,5.945943e-17,0.2173064,-2.830388e-15,0.2835917,-1.492406e-17,...,0.2173064,3.753966e-15,-2.490585e-15,0.4157884,2.026531e-15,-1.132417e-15,0.2173064,-2.364458e-15,0.2173064,-1.132417e-15
2,0.1138789,-0.1444784,0.07056216,-2.114393e-15,0.1138789,0.07056216,-2.099343e-15,-0.2665524,-2.449009e-15,0.1411243,...,-2.077737e-15,0.3779181,-0.1777742,-3.748073e-15,0.1138789,-0.1444784,-2.07921e-15,-0.1766358,-2.07921e-15,-0.1444784
3,-0.2395087,0.05644665,0.2327134,9.204096e-16,-0.2395087,0.2327134,8.415462e-16,-0.04431029,5.188054e-16,0.4654268,...,8.007607e-16,-0.1793403,0.1272421,1.17353e-15,-0.2395087,0.05644665,8.061436e-16,-0.109827,8.061436e-16,0.05644665


In [28]:
# Vocabulary terms in column order: position j names column j of the TF-IDF
# matrix and of lsa.components_.
features = tfid_vectorizer.get_feature_names_out()
len(features), features

(42,
 array(['ai', 'all', 'amount', 'and', 'are', 'by', 'campaigns', 'concert',
        'cook', 'day', 'examples', 'global', 'google', 'gordon', 'great',
        'have', 'in', 'increasing', 'introducing', 'is', 'just', 'launch',
        'love', 'new', 'of', 'pollution', 'polution', 'present', 'ramsay',
        'robots', 'see', 'singing', 'stop', 'technology', 'the', 'to',
        'today', 'us', 'warming', 'was', 'we', 'were'], dtype=object))

In [29]:
# Maps a concept label ("concept-<i>") to its top-ranked (term, weight) pairs;
# populated by the loop in the next cell.
concepts_terms_map = {}

In [30]:
# For every latent concept (one row of lsa.components_), pair each vocabulary
# term with its loading and keep the ten terms with the largest loadings.
# Ranking uses the signed value, so strongly negative loadings end up last.
for i, component in enumerate(lsa.components_):
    term_weights = list(zip(features, component.tolist()))
    # Stable in-place sort, highest weight first; ties keep vocabulary order.
    term_weights.sort(key=lambda pair: pair[1], reverse=True)
    concepts_terms_map[f"concept-{i}"] = term_weights[:10]


In [None]:
# Show the ten highest-weighted (term, weight) pairs for every concept.
concepts_terms_map

{'concept-0': [('the', 0.37609829529263766),
  ('concert', 0.3449887392330661),
  ('great', 0.3001240258948738),
  ('of', 0.29579806095266686),
  ('just', 0.2373658292979123),
  ('was', 0.2373658292979123),
  ('day', 0.2289215954150452),
  ('technology', 0.18383834567413412),
  ('all', 0.17824025175628985),
  ('in', 0.17824025175628982)],
 'concept-1': [('to', 0.41578844396700665),
  ('cook', 0.28359165793510666),
  ('gordon', 0.28359165793510666),
  ('love', 0.28359165793510666),
  ('ramsay', 0.28359165793510666),
  ('see', 0.28359165793510666),
  ('and', 0.21730644711292507),
  ('campaigns', 0.21730644711292504),
  ('global', 0.21730644711292502),
  ('have', 0.21730644711292502)],
 'concept-2': [('technology', 0.37791806767144015),
  ('is', 0.3419614380631989),
  ('google', 0.34139694419097505),
  ('introducing', 0.34139694419097505),
  ('new', 0.34139694419097505),
  ('day', 0.1411243268099471),
  ('are', 0.11387892195372991),
  ('today', 0.1138789219537299),
  ('examples', 0.113878

In [32]:
# Report the entry for the term "the" among concept-0's top terms.
# The comparison is an exact match on the term (first tuple element), not a
# membership test on the whole (term, weight) tuple.
for word_with_score in concepts_terms_map["concept-0"]:
    term = word_with_score[0]
    if term != "the":
        continue
    print(f"found: {word_with_score}")

found: ('the', 0.37609829529263766)
