In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

  from collections import Sequence


### Sample Dataset

In [2]:
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]

Lower-casing the dataset

In [3]:
dataset = [line.lower() for line in dataset]

### Creating TF-IDF Model

In [20]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset)

In [21]:
tfidf_matrix.shape

(7, 42)

In [43]:
vectorizer.get_feature_names()

['ai',
 'all',
 'amount',
 'and',
 'are',
 'by',
 'campaigns',
 'concert',
 'cook',
 'day',
 'examples',
 'global',
 'google',
 'gordon',
 'great',
 'have',
 'in',
 'increasing',
 'introducing',
 'is',
 'just',
 'launch',
 'love',
 'new',
 'of',
 'pollution',
 'polution',
 'present',
 'ramsay',
 'robots',
 'see',
 'singing',
 'stop',
 'technology',
 'the',
 'to',
 'today',
 'us',
 'warming',
 'was',
 'we',
 'were']

In [34]:
tfidf_matrix[0].toarray()

array([[0.        , 0.        , 0.3211484 , 0.        , 0.        ,
        0.3211484 , 0.        , 0.        , 0.        , 0.64229679,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.3211484 , 0.        , 0.26658075,
        0.        , 0.        , 0.        , 0.        , 0.22786439,
        0.        , 0.3211484 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.22786439,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

In [35]:
print(tfidf_matrix[0])

  (0, 34)	0.22786438777524437
  (0, 2)	0.3211483974289088
  (0, 24)	0.22786438777524437
  (0, 26)	0.3211483974289088
  (0, 19)	0.2665807498646048
  (0, 17)	0.3211483974289088
  (0, 9)	0.6422967948578177
  (0, 5)	0.3211483974289088


### Creating the SVD

In [44]:
lsa = TruncatedSVD(n_components = 4, n_iter = 100) # n_components => number of concepts you wanna find in the data
lsa.fit(tfidf_matrix)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
       random_state=None, tol=0.0)

In [46]:
lsa.components_.shape

(4, 42)

In [47]:
lsa.components_[0]

array([ 1.24191973e-01,  1.78240252e-01,  1.14460798e-01, -1.35434810e-16,
        1.24191973e-01,  1.14460798e-01, -1.35434810e-16,  3.44988739e-01,
       -2.19714842e-16,  2.28921595e-01,  1.24191973e-01, -1.35434810e-16,
        9.72770950e-02, -2.19714842e-16,  3.00124026e-01, -1.35434810e-16,
        1.78240252e-01,  1.14460798e-01,  9.72770950e-02,  1.75760635e-01,
        2.37365829e-01, -1.35434810e-16, -2.19714842e-16,  9.72770950e-02,
        2.95798061e-01, -1.35434810e-16,  1.14460798e-01,  1.24191973e-01,
       -2.19714842e-16,  1.24191973e-01, -2.19714842e-16,  1.78240252e-01,
       -1.35434810e-16,  1.83838346e-01,  3.76098295e-01, -2.94804711e-16,
        1.24191973e-01,  1.78240252e-01, -1.35434810e-16,  2.37365829e-01,
       -1.35434810e-16,  1.78240252e-01])

### Visualizing the concepts

In [48]:
import pandas as pd

In [56]:
df = pd.DataFrame(lsa.components_, index=['Concept ' + str(i+1) for i in range(4)] , columns=vectorizer.get_feature_names())

In [57]:
df.transpose()

Unnamed: 0,Concept 1,Concept 2,Concept 3,Concept 4
ai,0.124192,-2.57098e-15,0.1138789,-0.2395087
all,0.1782403,1.355582e-15,-0.1444784,0.05644665
amount,0.1144608,5.4131120000000006e-17,0.07056216,0.2327134
and,-1.354348e-16,0.2173064,1.747304e-15,-5.990493e-16
are,0.124192,-2.512696e-15,0.1138789,-0.2395087
by,0.1144608,1.8515020000000003e-17,0.07056216,0.2327134
campaigns,-1.354348e-16,0.2173064,1.747304e-15,-7.655258e-16
concert,0.3449887,3.029444e-15,-0.2665524,-0.04431029
cook,-2.197148e-16,0.2835917,2.812912e-15,-1.132927e-15
day,0.2289216,3.7030040000000006e-17,0.1411243,0.4654268


In [64]:
terms = vectorizer.get_feature_names()

# Word Concept Dictionary Creation
concept_words = {}

for i, comp in enumerate(lsa.components_):
    componentTerms = zip(terms, comp)
    sortedTerms = sorted(componentTerms, key=lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10] # grabbing the first 10 elements
    
    print('\nConcept ' + str(i+1) + ':')
    concept_words["Concept " + str(i+1)] = sortedTerms
    
    for term in sortedTerms:
        print(term)


Concept 1:
('the', 0.37609829529263716)
('concert', 0.34498873923306556)
('great', 0.3001240258948739)
('of', 0.29579806095266703)
('just', 0.2373658292979119)
('was', 0.2373658292979119)
('day', 0.22892159541504514)
('technology', 0.18383834567413462)
('all', 0.17824025175628955)
('in', 0.17824025175628955)

Concept 2:
('to', 0.4157884439670072)
('cook', 0.2835916579351083)
('gordon', 0.2835916579351083)
('love', 0.2835916579351083)
('ramsay', 0.2835916579351083)
('see', 0.2835916579351083)
('and', 0.2173064471129239)
('campaigns', 0.2173064471129239)
('global', 0.2173064471129239)
('have', 0.2173064471129239)

Concept 3:
('technology', 0.3779180676714398)
('is', 0.3419614380631986)
('google', 0.3413969441909746)
('introducing', 0.3413969441909746)
('new', 0.3413969441909746)
('day', 0.14112432680994738)
('are', 0.11387892195373016)
('examples', 0.11387892195373016)
('present', 0.11387892195373016)
('robots', 0.11387892195373016)

Concept 4:
('day', 0.46542676790411147)
('amount', 0.

### Sentence Concepts

In [69]:
import nltk

In [85]:
for concept_key in concept_words.keys():
    
    print('\n' + concept_key + ':')
    
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[concept_key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        
        print('\t- ' + sentence + ' => ' + str(score))


Concept 1:
	- the amount of polution is increasing day by day => 1.1297395470753946
	- the concert was just great => 1.4959427190164005
	- i love to see gordon ramsay cook => 0
	- google is introducing a new technology => 0.18383834567413462
	- ai robots are examples of great technology present today => 0.7797604325216756
	- all of us were singing in the concert => 1.3733655989909488
	- we have launch campaigns to stop pollution and global warming => 0

Concept 2:
	- the amount of polution is increasing day by day => 0
	- the concert was just great => 0
	- i love to see gordon ramsay cook => 1.8337467336425488
	- google is introducing a new technology => 0
	- ai robots are examples of great technology present today => 0
	- all of us were singing in the concert => 0
	- we have launch campaigns to stop pollution and global warming => 1.2850142324187028

Concept 3:
	- the amount of polution is increasing day by day => 0.6242100916830934
	- the concert was just great => 0
	- i love to see