# imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np


In [2]:

# Toy corpus: seven short sentences used as the documents for TF-IDF + LSA.
# NOTE(review): "polution" in the first sentence is misspelled, so it ends up
# as a vocabulary term separate from "pollution" in the last sentence. Fix the
# spelling here (and re-run everything) if the two should share one term.
dataset = [
    "The amount of polution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

In [3]:
# type(dataset), len(dataset)

In [4]:
# Lower-case every sentence so that the whitespace-split words used in the
# document-scoring cell compare equal to the (lower-case) vocabulary terms.
dataset = list(map(str.lower, dataset))
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [5]:
# Fit TF-IDF on the corpus and vectorize it in the same call.
# X is a sparse document-term matrix: one row per sentence and one column per
# vocabulary term (7 x 42 for this corpus).
tfid_vectorizer = TfidfVectorizer()
X = tfid_vectorizer.fit_transform(dataset)
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 51 stored elements and shape (7, 42)>

In [6]:
# print() instead of a bare expression: printing a sparse row lists the
# (row, column) coordinates and stored weights, whereas the plain repr only
# shows the summary line.
print(X[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 42)>
  Coords	Values
  (0, 34)	0.22786438777524437
  (0, 2)	0.3211483974289088
  (0, 24)	0.22786438777524437
  (0, 26)	0.3211483974289088
  (0, 19)	0.2665807498646048
  (0, 17)	0.3211483974289088
  (0, 9)	0.6422967948578177
  (0, 5)	0.3211483974289088


In [7]:
# Dense, labelled view of the TF-IDF matrix: rows are documents, columns are
# vocabulary terms. Densifying is fine for a corpus this small.
X_df = pd.DataFrame(
    data=X.toarray(),
    columns=tfid_vectorizer.get_feature_names_out(),
)
X_df

Unnamed: 0,ai,all,amount,and,are,by,campaigns,concert,cook,day,...,stop,technology,the,to,today,us,warming,was,we,were
0,0.0,0.0,0.321148,0.0,0.0,0.321148,0.0,0.0,0.0,0.642297,...,0.0,0.0,0.227864,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42133,0.0,0.0,...,0.0,0.0,0.360139,0.0,0.0,0.0,0.0,0.507574,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.0,...,0.0,0.0,0.0,0.348019,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.396717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.356201,0.0,0.0,0.0,0.356201,0.0,0.0,0.0,0.0,0.0,...,0.0,0.295677,0.0,0.0,0.356201,0.0,0.0,0.0,0.0,0.0
5,0.0,0.386452,0.0,0.0,0.0,0.0,0.0,0.320788,0.0,0.0,...,0.0,0.0,0.274199,0.0,0.0,0.386452,0.0,0.0,0.0,0.386452
6,0.0,0.0,0.0,0.321262,0.0,0.0,0.321262,0.0,0.0,0.0,...,0.321262,0.0,0.0,0.266675,0.0,0.0,0.321262,0.0,0.321262,0.0


In [8]:
# Latent Semantic Analysis: factorize the TF-IDF matrix with a truncated SVD.
# n_components=4 -> four latent "concepts"; random_state is fixed so that
# re-running the notebook reproduces the same components.
lsa = TruncatedSVD(n_components=4, random_state=100)
lsa.fit(X)

In [17]:
# components_ is the V^T factor of the SVD: one row per latent concept and one
# column per vocabulary term, hence (n_components, n_terms) = (4, 42).
lsa.components_.shape

(4, 42)

In [10]:
# Term loadings per concept as a labelled table: one row per latent concept,
# one column per vocabulary term.
lsa_df = pd.DataFrame(
    data=lsa.components_.tolist(),
    columns=tfid_vectorizer.get_feature_names_out(),
)
lsa_df

Unnamed: 0,ai,all,amount,and,are,by,campaigns,concert,cook,day,...,stop,technology,the,to,today,us,warming,was,we,were
0,0.124192,0.1782403,0.1144608,-5.896217e-17,0.124192,0.1144608,-4.3206400000000005e-17,0.3449887,4.298959e-17,0.2289216,...,-4.2986360000000005e-17,0.1838383,0.3760983,-6.98946e-18,0.124192,0.1782403,-4.381904e-17,0.2373658,-4.381904e-17,0.1782403
1,1.270754e-15,-4.345329e-16,-2.7980110000000005e-17,0.2173064,1.493809e-15,6.769058e-17,0.2173064,-1.607868e-15,0.2835917,-2.7210990000000003e-17,...,0.2173064,2.212644e-15,-1.466085e-15,0.4157884,1.536436e-15,-5.723171e-16,0.2173064,-1.454029e-15,0.2173064,-5.723171e-16
2,0.1138789,-0.1444784,0.07056216,-1.003492e-15,0.1138789,0.07056216,-9.511218e-16,-0.2665524,-1.72962e-15,0.1411243,...,-9.609429e-16,0.3779181,-0.1777742,-2.230755e-15,0.1138789,-0.1444784,-9.633982e-16,-0.1766358,-9.633982e-16,-0.1444784
3,-0.2395087,0.05644665,0.2327134,7.979453e-16,-0.2395087,0.2327134,6.477668e-16,-0.04431029,3.388269e-16,0.4654268,...,6.663057e-16,-0.1793403,0.1272421,9.049627e-16,-0.2395087,0.05644665,6.874203e-16,-0.109827,6.874203e-16,0.05644665


In [11]:
# Vocabulary terms in column order: features[j] is the term behind column j of
# both X and lsa.components_.
features = tfid_vectorizer.get_feature_names_out()
len(features), features

(42,
 array(['ai', 'all', 'amount', 'and', 'are', 'by', 'campaigns', 'concert',
        'cook', 'day', 'examples', 'global', 'google', 'gordon', 'great',
        'have', 'in', 'increasing', 'introducing', 'is', 'just', 'launch',
        'love', 'new', 'of', 'pollution', 'polution', 'present', 'ramsay',
        'robots', 'see', 'singing', 'stop', 'technology', 'the', 'to',
        'today', 'us', 'warming', 'was', 'we', 'were'], dtype=object))

In [12]:
# Maps "concept-<i>" -> list of (term, weight) tuples; populated by the loop
# over lsa.components_ in the following cell.
concepts_terms_map = {}

In [13]:
# For each latent concept (row of lsa.components_), pair every vocabulary term
# with its weight, rank the pairs by weight in descending order and keep the
# ten highest as that concept's representative terms.
# NOTE(review): ranking uses the signed weight, so terms with a large negative
# loading never make the list - confirm whether abs(weight) is wanted instead.
concepts_terms_map.update(
    {
        f"concept-{concept_idx}": sorted(
            zip(features, weights.tolist()),
            key=lambda term_weight: term_weight[1],
            reverse=True,
        )[:10]
        for concept_idx, weights in enumerate(lsa.components_)
    }
)


In [14]:
concepts_terms_map

{'concept-0': [('the', 0.3760982952926373),
  ('concert', 0.3449887392330659),
  ('great', 0.30012402589487386),
  ('of', 0.29579806095266653),
  ('just', 0.23736582929791236),
  ('was', 0.23736582929791236),
  ('day', 0.22892159541504484),
  ('technology', 0.18383834567413426),
  ('all', 0.17824025175628966),
  ('in', 0.17824025175628963)],
 'concept-1': [('to', 0.41578844396700665),
  ('cook', 0.283591657935107),
  ('gordon', 0.283591657935107),
  ('love', 0.283591657935107),
  ('ramsay', 0.283591657935107),
  ('see', 0.283591657935107),
  ('and', 0.2173064471129247),
  ('global', 0.21730644711292466),
  ('have', 0.21730644711292466),
  ('launch', 0.21730644711292466)],
 'concept-2': [('technology', 0.37791806767144065),
  ('is', 0.3419614380631979),
  ('google', 0.3413969441909745),
  ('introducing', 0.3413969441909745),
  ('new', 0.3413969441909745),
  ('day', 0.14112432680994577),
  ('are', 0.11387892195373112),
  ('examples', 0.11387892195373109),
  ('present', 0.1138789219537310

In [15]:
# Score every document against every concept by keyword overlap: a document's
# score for a concept is the sum of the weights of that concept's top terms,
# counted once per occurrence of the term in the document (0 if none occur).
# NOTE(review): documents are tokenized with a plain split on " ", which may
# not match how the vectorizer tokenized them (e.g. punctuation attached to a
# word would prevent a match) - fine for this corpus, verify for real text.
# NOTE(review): this is a heuristic on the top-term lists only; it is not the
# SVD projection of the documents (lsa.transform(X)) - confirm which is meant.
for concept, top_terms in concepts_terms_map.items():
    documents_with_scores = [
        sum(
            weight
            for word in document.split(sep=" ")
            for term, weight in top_terms
            if word == term
        )
        for document in dataset
    ]
    print(f"{concept}: {documents_with_scores}")


concept-0: [1.1297395470753935, 1.4959427190164019, 0, 0.18383834567413426, 0.7797604325216746, 1.373365598990949, 0]
concept-1: [0, 0, 1.8337467336425415, 0, 0, 0, 1.2850142324187053]
concept-2: [0.6242100916830895, 0, 0, 1.7440703383075622, 0.833433755486365, 0, 0]
concept-3: [2.2015937554478864, 0.12724213180694283, 0, 0.21264455202450191, 0, 0.2965820743887401, 0]
