# This notebook requires the quora_questions.csv file (expected at ../DATA/quora_questions.csv)

In [1]:
import pandas as pd

In [2]:
# Load the Quora questions dataset. The path is relative to the notebook's
# working directory, so the CSV must live in a sibling DATA/ folder.
df = pd.read_csv('../DATA/quora_questions.csv')

In [4]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# 1st question

In [5]:
df['Question'][0]

'What is the step by step guide to invest in share market in india?'

In [7]:
df.shape

(404289, 1)

# Preprocessing

Because LDA depends on word-count probabilities, it can only be used with CountVectorizer,
but NMF works with coefficient values, so here we can preprocess the text with TF-IDF vectorization.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Build the TF-IDF vectorizer:
#   max_df=0.95          -> ignore terms that appear in more than 95% of the documents
#   min_df=2             -> ignore terms that appear in fewer than 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(max_df = 0.95, min_df=2, stop_words='english')

In [10]:
# Learn the vocabulary and IDF weights from the questions and return the
# sparse document-term matrix (one row per question, one column per term).
dtm = tfidf.fit_transform(df['Question'])

In [11]:
dtm   

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

so we have 404289 documents and 38669 features 

In [11]:
# dtm.toarray()  # Do NOT run this: densifying a 404289 x 38669 float64 matrix
#                # would need roughly 125 GB of memory. Keep `dtm` sparse.


In [12]:
from sklearn.decomposition import NMF

In [13]:
# Factorize into 20 components (topics); random_state is fixed so the
# factorization is reproducible. Author's note: NMF is faster than LDA.
nmf_model = NMF(n_components=20, random_state=42)

In [14]:
# Slow on the full dataset: the author notes a wait of about 10 minutes.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=20, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

# Grab the vocabulary of words 

In [15]:
len(tfidf.get_feature_names())

38669

In [16]:
type(tfidf.get_feature_names())

list

In [17]:
tfidf.get_feature_names()[4100]

'baleno'

In [18]:
import random

# Look up a random word from the fitted vocabulary.
# random.randint() is inclusive at both ends, so the upper bound must be
# len(vocabulary) - 1. The previous hard-coded bound of 54777 was larger than
# the 38669 terms learned here and would raise IndexError for most draws.
feature_names = tfidf.get_feature_names()
random_word_id = random.randint(0, len(feature_names) - 1)
feature_names[random_word_id]

'casteism'

# Grab the topics 

In [19]:
len(nmf_model.components_)

20

In [20]:
type(nmf_model.components_)

numpy.ndarray

In [21]:
nmf_model.components_.shape

(20, 38669)

In [22]:
nmf_model.components_

array([[0.00000000e+00, 5.63036920e-02, 5.40156715e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.23892290e-03, 0.00000000e+00, 3.45251649e-05, ...,
        0.00000000e+00, 3.65013292e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.07891142e-04, 4.92671304e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.84654148e-05, 4.54449173e-04, 6.05797981e-05, ...,
        1.70479939e-03, 0.00000000e+00, 1.70479939e-03],
       [3.45021413e-04, 0.00000000e+00, 4.81932600e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [23]:
single_topic =  nmf_model.components_[0]

In [24]:
single_topic

array([0.00000000e+00, 5.63036920e-02, 5.40156715e-05, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [25]:
len(single_topic)

38669

In [26]:
single_topic.argsort()

array([    0, 22613, 22611, ...,  5268, 22925,  4632], dtype=int64)

In [27]:
# argsort() returns index positions sorted from least to greatest value,
# so the 10 greatest values are found at the last 10 positions of argsort().

single_topic.argsort()[-10:]   # grab the last 10 values of argsort()

array([26057,  5976, 19847, 22924, 37520,   482,  5283,  5268, 22925,
        4632], dtype=int64)

In [28]:
# Vocabulary indices of the 20 highest-weighted words for this topic,
# ordered from lowest to highest weight (argsort is ascending).
top_twenty_words  = single_topic.argsort()[-20:]

In [29]:
# Fetch the vocabulary once: get_feature_names() rebuilds the full word list
# on every call, so calling it inside the loop repeats that work per word.
feature_names = tfidf.get_feature_names()
for index in top_twenty_words:
    print(feature_names[index])

app
engineering
friend
website
site
thing
read
place
visit
places
phone
buy
laptop
movie
ways
2016
books
book
movies
best


# Grab the highest-coefficient words per topic

Previously (in the LDA approach) we were dealing with the words that have the highest probabilities, but now we are dealing with the words that have the highest coefficients.

In [30]:
# Print the 15 highest-coefficient words of every topic.
# The vocabulary list is fetched once up front; the original called
# get_feature_names() for every single word (15 words x 20 topics), and each
# call rebuilds the whole vocabulary list.
feature_names = tfidf.get_feature_names()
for i, topic in enumerate(nmf_model.components_):
    print("THE TOP 15 WORDS FOR TOPIC # {}".format(i))
    # argsort() is ascending, so the last 15 indices are the largest weights.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')

THE TOP 15 WORDS FOR TOPIC # 0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']




THE TOP 15 WORDS FOR TOPIC # 1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']




THE TOP 15 WORDS FOR TOPIC # 2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']




THE TOP 15 WORDS FOR TOPIC # 3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']




THE TOP 15 WORDS FOR TOPIC # 4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']




THE TOP 15 WORDS FOR TOPIC # 5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'busine

# Attach the topics to the original documents

In [31]:
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [32]:
# Show the questions DataFrame that the topics will be attached to.
# (The original cell referenced `npr`, a name not defined in the visible
# cells, which raises NameError on a fresh kernel; the recorded output is
# the questions table held in `df`.)
df

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
404284,How many keywords are there in the Racket prog...
404285,Do you believe there is life after death?
404286,What is one coin?
404287,What is the approx annual cost of living while...


In [33]:
# Project every document onto the fitted topics: the result has one row per
# document and one column per topic (20 columns here).
topic_results = nmf_model.transform(dtm)

In [34]:
topic_results

array([[2.75937605e-04, 5.91249293e-05, 6.17687040e-06, ...,
        6.97269969e-04, 2.13527728e-04, 0.00000000e+00],
       [1.96418670e-04, 8.85438224e-05, 0.00000000e+00, ...,
        0.00000000e+00, 5.51088847e-05, 1.05527238e-05],
       [1.78019854e-04, 6.47373072e-04, 1.60510763e-03, ...,
        3.02354836e-03, 1.05908512e-03, 1.23878889e-03],
       ...,
       [0.00000000e+00, 1.62431955e-05, 5.23720795e-06, ...,
        0.00000000e+00, 2.76279348e-06, 0.00000000e+00],
       [5.36236094e-04, 1.01567857e-03, 0.00000000e+00, ...,
        1.28720137e-04, 7.76975481e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.25187210e-04]])

In [35]:
topic_results[0]

array([2.75937605e-04, 5.91249293e-05, 6.17687040e-06, 4.95880678e-04,
       3.94126495e-05, 2.62022533e-02, 3.92318931e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.34257472e-04, 1.15869110e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.97456870e-04,
       0.00000000e+00, 6.97269969e-04, 2.13527728e-04, 0.00000000e+00])

In [36]:
topic_results[0].round(2)   

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [37]:
# Label each question with the index of its highest-weighted topic
# (argmax along axis=1 picks the largest value in each document's row).
# Note: this adds a 'Topic' column to `df` in place.
df['Topic'] = topic_results.argmax(axis=1)

In [38]:
df.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,19
9,Motorola (company): Can I hack my Charter Moto...,17
