In [1]:
#importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
import nltk

In [3]:
# Toy corpus: seven short sentences touching a few loose topics
# (environment, music, cooking, technology).
# The text needs no cleaning apart from lower-casing, which is done in the next cell.
# NOTE: the first sentence spells "polution" with a single "l", so it ends up as a
# different term from "pollution" in the last sentence.
dataset = [
    'The amount of polution is increasing day by day',
    'The concert was just great',
    'I love to see Gordon Ramsay cook',
    'Google is introducing a new technology',
    'AI Robots are examples of great technology present today',
    'All of us were singing in the concert',
    'We have launch campaigns to stop pollution and global warming',
]
dataset

['The amount of polution is increasing day by day',
 'The concert was just great',
 'I love to see Gordon Ramsay cook',
 'Google is introducing a new technology',
 'AI Robots are examples of great technology present today',
 'All of us were singing in the concert',
 'We have launch campaigns to stop pollution and global warming']

In [4]:
# Preprocessing: lower-case every sentence so that, for example, "The" and "the"
# are treated as the same word.
dataset = list(map(str.lower, dataset))
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [5]:
# Build a bag-of-words model of the corpus. It could be a binary BOW model; here we use TF-IDF.
#
# Why TfidfVectorizer instead of a hand-written TF-IDF?
# Besides converting the list of strings into a TF-IDF matrix, the vectorizer can also
# report its feature names, so we know which column corresponds to which word.
# A custom TF-IDF implementation would work as well.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)  # TF-IDF matrix of the dataset

# Row 0 is the first document; print its TF-IDF entries.
first_document = X[0]
print(first_document)

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [6]:
# Decompose the TF-IDF matrix (the SVD into three matrices) using TruncatedSVD.
#
# n_components : number of concepts to extract from the data.
# n_iter       : number of iterations of the randomized SVD solver. More iterations
#                generally give a more accurate decomposition, at the cost of run time.
# random_state : fixed seed. The default solver is randomized, so without a seed the
#                components (and everything derived from them below) can change
#                from one run to the next.
lsa = TruncatedSVD(n_components=4, n_iter=100, random_state=42)

lsa.fit(X)  # fit the decomposition on the TF-IDF matrix

TruncatedSVD(n_components=4, n_iter=100)

In [7]:
# Look at the first row of the fitted model.
# SVD factorises the matrix into three matrices, but TruncatedSVD only exposes the last
# one, V transpose (r x n), as components_: the r rows are the concepts and the n columns
# are the n different words.
row1 = lsa.components_[0, :]
row1

array([ 1.24191973e-01,  1.78240252e-01,  1.14460798e-01, -7.98450857e-18,
        1.24191973e-01,  1.14460798e-01, -7.98450857e-18,  3.44988739e-01,
       -4.62382034e-17,  2.28921595e-01,  1.24191973e-01, -7.98450857e-18,
        9.72770950e-02, -4.62382034e-17,  3.00124026e-01, -7.98450857e-18,
        1.78240252e-01,  1.14460798e-01,  9.72770950e-02,  1.75760635e-01,
        2.37365829e-01, -7.98450857e-18, -4.62382034e-17,  9.72770950e-02,
        2.95798061e-01, -7.98450857e-18,  1.14460798e-01,  1.24191973e-01,
       -4.62382034e-17,  1.24191973e-01, -4.62382034e-17,  1.78240252e-01,
       -7.98450857e-18,  1.83838346e-01,  3.76098295e-01, -4.50095076e-17,
        1.24191973e-01,  1.78240252e-01, -7.98450857e-18,  2.37365829e-01,
       -7.98450857e-18,  1.78240252e-01])

In [8]:
# lsa.components_[0] is the first row of the V transpose matrix: one value per word.
# Words that belong to the concept have a high value in that row; words that do not
# have a value close to zero. To display, for each concept, its most important words,
# we first need the word that corresponds to each column.

# get_feature_names() was removed from scikit-learn in version 1.2; its replacement,
# get_feature_names_out(), returns a NumPy array, so convert it to a plain list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    terms = vectorizer.get_feature_names()
print(terms)

['ai', 'all', 'amount', 'and', 'are', 'by', 'campaigns', 'concert', 'cook', 'day', 'examples', 'global', 'google', 'gordon', 'great', 'have', 'in', 'increasing', 'introducing', 'is', 'just', 'launch', 'love', 'new', 'of', 'pollution', 'polution', 'present', 'ramsay', 'robots', 'see', 'singing', 'stop', 'technology', 'the', 'to', 'today', 'us', 'warming', 'was', 'we', 'were']


In [9]:
# Number of distinct words (columns) in the TF-IDF model.
vocabulary_size = len(terms)
vocabulary_size

42

In [10]:
# `terms` holds all 42 words, one per column of the V transpose matrix, which is why
# row1 had 42 values. Now the main job: list the most important words of every concept.

for concept_index, weights in enumerate(lsa.components_):
    # enumerate yields (row index, row) for every row of the V transpose matrix.
    # zip pairs each word with its value in this concept, giving (word, value) tuples.
    # Sorting on pair[1] (the value) with reverse=True puts the highest-valued words
    # first; sorting on pair[0] would order the tuples by word instead.
    ranked_terms = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)

    print("\nConcept ", concept_index, ":")
    # Only the 10 most important terms of the concept are shown.
    for word_and_weight in ranked_terms[:10]:
        print(word_and_weight)


Concept  0 :
('the', 0.3760982952926376)
('concert', 0.344988739233066)
('great', 0.3001240258948742)
('of', 0.2957980609526667)
('just', 0.23736582929791256)
('was', 0.23736582929791256)
('day', 0.22892159541504534)
('technology', 0.1838383456741343)
('all', 0.17824025175628944)
('in', 0.17824025175628944)

Concept  1 :
('to', 0.41578844396700687)
('cook', 0.28359165793510716)
('gordon', 0.28359165793510716)
('love', 0.28359165793510716)
('ramsay', 0.28359165793510716)
('see', 0.28359165793510716)
('and', 0.21730644711292482)
('campaigns', 0.21730644711292482)
('global', 0.21730644711292482)
('have', 0.21730644711292482)

Concept  2 :
('technology', 0.3779180676714393)
('is', 0.3419614380631994)
('google', 0.34139694419097494)
('introducing', 0.34139694419097494)
('new', 0.34139694419097494)
('day', 0.14112432680994852)
('are', 0.11387892195372905)
('examples', 0.11387892195372905)
('present', 0.11387892195372905)
('robots', 0.11387892195372905)

Concept  3 :
('day', 0.46542676790411

In [12]:
#The grouping might not be very accurate, but it is pretty good.
#Under each concept we now have its top keywords along with their weights (component values, not probabilities).
#We can use these to find out which document belongs to which concept.

In [13]:
# Next step: find out which concept each document belongs to.
# First build a dictionary mapping each concept name ("Concept 0", "Concept 1", ...)
# to the list of its 10 highest-valued (word, value) tuples.

# get_feature_names() was removed from scikit-learn in version 1.2; its replacement,
# get_feature_names_out(), returns a NumPy array, so convert it to a plain list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    terms = vectorizer.get_feature_names()

concept_words = {
    # Sort the (word, value) pairs by value, highest first, and keep the top 10.
    "Concept " + str(i): sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:10]
    for i, comp in enumerate(lsa.components_)
}

print(concept_words)

{'Concept 0': [('the', 0.3760982952926376), ('concert', 0.344988739233066), ('great', 0.3001240258948742), ('of', 0.2957980609526667), ('just', 0.23736582929791256), ('was', 0.23736582929791256), ('day', 0.22892159541504534), ('technology', 0.1838383456741343), ('all', 0.17824025175628944), ('in', 0.17824025175628944)], 'Concept 1': [('to', 0.41578844396700687), ('cook', 0.28359165793510716), ('gordon', 0.28359165793510716), ('love', 0.28359165793510716), ('ramsay', 0.28359165793510716), ('see', 0.28359165793510716), ('and', 0.21730644711292482), ('campaigns', 0.21730644711292482), ('global', 0.21730644711292482), ('have', 0.21730644711292482)], 'Concept 2': [('technology', 0.3779180676714393), ('is', 0.3419614380631994), ('google', 0.34139694419097494), ('introducing', 0.34139694419097494), ('new', 0.34139694419097494), ('day', 0.14112432680994852), ('are', 0.11387892195372905), ('examples', 0.11387892195372905), ('present', 0.11387892195372905), ('robots', 0.11387892195372905)], 'Con

In [14]:
# Use concept_words to find out which document belongs to which concept.
#
# For every concept, each sentence gets a score: the sum of the values of the concept
# keywords that occur in the sentence (a keyword occurring twice is counted twice).
# A sentence containing none of the keywords keeps the score 0.
#
# Sentences are split into words with the vectorizer's own analyzer, i.e. the same
# preprocessing and tokenization that produced the vocabulary, so tokens always line up
# with the concept keywords. This also avoids depending on NLTK tokenizer data, which
# has to be downloaded separately and is never fetched in this notebook.
tokenize = vectorizer.build_analyzer()
tokenized_dataset = [tokenize(sentence) for sentence in dataset]  # tokenize once, reuse per concept

for key, keywords in concept_words.items():  # loop over all the concepts
    keyword_values = dict(keywords)  # word -> value lookup for this concept
    sentence_scores = []  # scores of all sentences for this specific concept, in dataset order
    for words in tokenized_dataset:
        score = 0
        for word in words:
            score += keyword_values.get(word, 0)  # non-keywords contribute nothing
        sentence_scores.append(score)
    print("\n" + key + ":")
    for sentence_score in sentence_scores:
        print(sentence_score)

# Reading the output shown below:
# - For concept 0 the highest score belongs to the second sentence,
#   "the concert was just great".
# - For the second concept the last sentence is misclassified, which shows that the
#   model is not very accurate, although it still gets the job done. The reason for
#   the misclassification is that we have very little data.


Concept 0:
1.129739547075395
1.4959427190164032
0
0.1838383456741343
0.7797604325216752
1.3733655989909492
0

Concept 1:
0
0
1.8337467336425428
0
0
0
1.2850142324187064

Concept 2:
0.6242100916830964
0
0
1.7440703383075635
0.8334337554863556
0
0

Concept 3:
2.2015937554478855
0.127242131806944
0
0.21264455202449883
0
0.29658207438874207
0
