In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

  from collections import Sequence


### Sample Dataset

In [2]:
# Toy corpus: seven short sentences that the rest of the notebook analyses.
dataset = [
    "The amount of pollution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

Lower-casing the dataset

In [3]:
# Normalise case so that e.g. "The" and "the" count as the same term.
dataset = list(map(str.lower, dataset))

### Creating TF-IDF Model

In [4]:
# Fit the TF-IDF vectorizer on the corpus and, in the same call, transform the
# sentences into a matrix with one row per sentence and one column per term.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset)

In [5]:
# (number of sentences, number of distinct terms in the vocabulary)
tfidf_matrix.shape

(7, 41)

In [6]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one only on installations that predate it.
feature_names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Display the vocabulary as a plain list of str whichever method supplied it
# (the new method returns an array, the old one a list).
[str(name) for name in feature_names_fn()]

['ai',
 'all',
 'amount',
 'and',
 'are',
 'by',
 'campaigns',
 'concert',
 'cook',
 'day',
 'examples',
 'global',
 'google',
 'gordon',
 'great',
 'have',
 'in',
 'increasing',
 'introducing',
 'is',
 'just',
 'launch',
 'love',
 'new',
 'of',
 'pollution',
 'present',
 'ramsay',
 'robots',
 'see',
 'singing',
 'stop',
 'technology',
 'the',
 'to',
 'today',
 'us',
 'warming',
 'was',
 'we',
 'were']

In [7]:
# Dense view of the first sentence's TF-IDF row: one value per vocabulary term.
tfidf_matrix[0].toarray()

array([[0.        , 0.        , 0.32642545, 0.        , 0.        ,
        0.32642545, 0.        , 0.        , 0.        , 0.65285089,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.32642545, 0.        , 0.27096115,
        0.        , 0.        , 0.        , 0.        , 0.23160861,
        0.27096115, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.23160861, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [8]:
# Printing the sparse row lists only its non-zero entries as "(row, column)  value".
print(tfidf_matrix[0])

  (0, 33)	0.231608612212116
  (0, 2)	0.326425447033964
  (0, 24)	0.231608612212116
  (0, 25)	0.270961154226111
  (0, 19)	0.270961154226111
  (0, 17)	0.326425447033964
  (0, 9)	0.652850894067928
  (0, 5)	0.326425447033964


### Creating the SVD

In [9]:
# n_components is the number of latent concepts to extract from the data.
# random_state pins the randomized solver so that Restart & Run All reproduces
# the same components instead of depending on a fresh random draw each run.
lsa = TruncatedSVD(n_components=4, n_iter=100, random_state=42)
lsa.fit(tfidf_matrix)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
       random_state=None, tol=0.0)

In [10]:
# (number of concepts, number of vocabulary terms)
lsa.components_.shape

(4, 41)

In [11]:
# Weight of every vocabulary term in the first concept.
lsa.components_[0]

array([0.12230514, 0.17593921, 0.11986219, 0.02391177, 0.12230514,
       0.11986219, 0.02391177, 0.33986475, 0.00740591, 0.23972439,
       0.12230514, 0.02391177, 0.09746251, 0.00740591, 0.29534387,
       0.02391177, 0.17593921, 0.11986219, 0.09746251, 0.18039817,
       0.23349403, 0.02391177, 0.00740591, 0.09746251, 0.29665909,
       0.11934473, 0.12230514, 0.00740591, 0.12230514, 0.00740591,
       0.17593921, 0.02391177, 0.18242602, 0.37555093, 0.02599636,
       0.12230514, 0.17593921, 0.02391177, 0.23349403, 0.02391177,
       0.17593921])

### Visualizing the concepts

In [12]:
import pandas as pd

In [13]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old name only on older installations.
feature_names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names

# Derive the row labels from the fitted model rather than repeating the literal
# 4, so the table stays correct if n_components is changed.
concept_labels = ['Concept ' + str(i + 1) for i in range(lsa.components_.shape[0])]

# One row per concept, one column per vocabulary term.
df = pd.DataFrame(lsa.components_, index=concept_labels, columns=list(feature_names_fn()))

In [14]:
# Transpose for readability: each term becomes a row, each concept a column.
df.transpose()

Unnamed: 0,Concept 1,Concept 2,Concept 3,Concept 4
ai,0.122305,0.002874,0.125254,-0.195291
all,0.175939,-0.079465,-0.124206,0.040094
amount,0.119862,0.082287,0.036614,0.196403
and,0.023912,0.203802,-0.086381,0.064054
are,0.122305,0.002874,0.125254,-0.195291
by,0.119862,0.082287,0.036614,0.196403
campaigns,0.023912,0.203802,-0.086381,0.064054
concert,0.339865,-0.165508,-0.218994,-0.042342
cook,0.007406,0.223737,-0.130714,-0.226291
day,0.239724,0.164575,0.073227,0.392806


In [15]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old name only on older installations.
feature_names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Normalise to plain str so the printed tuples and later comparisons look the
# same whichever method supplied the names (array vs. list).
terms = [str(term) for term in feature_names_fn()]

# Word Concept Dictionary Creation
# Maps "Concept N" -> list of that concept's 10 highest-weighted (term, weight) pairs.
concept_words = {}

for i, comp in enumerate(lsa.components_):
    # Pair every vocabulary term with its weight in this concept.
    componentTerms = zip(terms, comp)
    # Largest weight first; negative weights therefore sort to the end.
    sortedTerms = sorted(componentTerms, key=lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10] # grabbing the first 10 elements

    print('\nConcept ' + str(i+1) + ':')
    concept_words["Concept " + str(i+1)] = sortedTerms

    for term in sortedTerms:
        print(term)


Concept 1:
('the', 0.375550932899053)
('concert', 0.3398647517392532)
('of', 0.29665909350149083)
('great', 0.29534386566053683)
('day', 0.23972438835590149)
('just', 0.23349402804713223)
('was', 0.23349402804713223)
('technology', 0.18242602421743503)
('is', 0.1803981689876796)
('all', 0.1759392127966662)

Concept 2:
('to', 0.3548941474782012)
('pollution', 0.23747906096178822)
('cook', 0.2237367705446092)
('gordon', 0.2237367705446092)
('love', 0.2237367705446092)
('ramsay', 0.2237367705446092)
('see', 0.2237367705446092)
('and', 0.20380230261581192)
('campaigns', 0.20380230261581192)
('global', 0.20380230261581192)

Concept 3:
('technology', 0.3681751594120383)
('google', 0.31828506931586975)
('introducing', 0.31828506931586975)
('new', 0.31828506931586975)
('is', 0.29459629228393713)
('are', 0.12525356758256406)
('examples', 0.12525356758256406)
('present', 0.12525356758256406)
('robots', 0.12525356758256406)
('today', 0.12525356758256406)

Concept 4:
('day', 0.3928059228972394)
(

### Sentence Concepts

In [16]:
import nltk

In [17]:
# Score every sentence against every concept: a sentence's score is the sum of
# the weights of its tokens that appear among the concept's stored top terms.
for concept_key, top_terms in concept_words.items():

    print('\n' + concept_key + ':')

    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        # Each (token, top term) match contributes that term's weight, visited
        # token by token in sentence order.
        matched_weights = (
            weight
            for word in words
            for term, weight in top_terms
            if word == term
        )
        score = 0
        for weight in matched_weights:
            score += weight

        print('\t- ' + sentence + ' => ' + str(score))


Concept 1:
	- the amount of pollution is increasing day by day => 1.3320569721000264
	- the concert was just great => 1.4777476063931076
	- i love to see gordon ramsay cook => 0
	- google is introducing a new technology => 0.3628241932051146
	- ai robots are examples of great technology present today => 0.7744289833794628
	- all of us were singing in the concert => 1.1880139909364633
	- we have launch campaigns to stop pollution and global warming => 0

Concept 2:
	- the amount of pollution is increasing day by day => 0.23747906096178822
	- the concert was just great => 0
	- i love to see gordon ramsay cook => 1.473578000201247
	- google is introducing a new technology => 0
	- ai robots are examples of great technology present today => 0
	- all of us were singing in the concert => 0
	- we have launch campaigns to stop pollution and global warming => 1.203780116287425

Concept 3:
	- the amount of pollution is increasing day by day => 0.29459629228393713
	- the concert was just great =>