In [None]:
import numpy as np
import pandas as pd

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client backed by the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the next cell uses to fetch the dataset from Google Drive.
drive = GoogleDrive(gauth)

In [None]:
# Google Drive file id of the dataset and the local filename it is saved under.
QUORA_FILE_ID = '1fjM5LTtbHpkeI0CxnuMWWc0vC3_ldhw-'
QUORA_CSV = 'quora_questions.csv'

# Fetch the CSV from Drive and write it to a local file.
downloaded = drive.CreateFile({'id': QUORA_FILE_ID})
downloaded.GetContentFile(QUORA_CSV)

# Load the questions into a DataFrame and preview the first rows.
quora = pd.read_csv(QUORA_CSV)

quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing

## Task: Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary pruning applied by the vectorizer:
#   max_df=0.95          -> drop terms that occur in MORE than 95% of the questions
#                           (a float is read as a proportion of documents; the LOWER
#                           the value, the more terms are discarded)
#   min_df=2             -> drop terms that occur in FEWER than 2 questions
#                           (an int is read as an absolute number of documents)
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and IDF weights, then build the document-term matrix
# (one row per question, one column per retained term).
questions = quora['Question']
document_term_matrix = tfidf.fit_transform(questions)

document_term_matrix

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

# Non-negative Matrix Factorization

## TASK: Using Scikit-Learn, create an instance of NMF with 15 expected components. (Use random_state=42.)

In [None]:
from sklearn.decomposition import NMF

# Number of topics to extract, and the seed that makes the factorization reproducible.
N_TOPICS = 15
RANDOM_STATE = 42

nmf_model = NMF(n_components=N_TOPICS, random_state=RANDOM_STATE)

# Factorize the TF-IDF matrix. `fit` returns the estimator itself, so this
# cell displays the fitted model's repr.
nmf_model.fit(document_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=15, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## TASK: Print out the top 15 most common words for each of the 15 topics.

In [None]:
# Look the vocabulary up ONCE. The original called `tfidf.get_feature_names()`
# inside the comprehension, i.e. once for every printed word.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`, so use the new name when the installed version has
# it and fall back to the old one otherwise.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# How many of the highest-weighted words to show per topic.
TOP_N_WORDS = 15

for topic_index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last N positions hold the largest weights;
    # the words are therefore printed from least to most important.
    top_word_ids = topic.argsort()[-TOP_N_WORDS:]
    print(f'THE TOP {TOP_N_WORDS} WORDS FOR TOPIC #{topic_index}')
    # A distinct name for the comprehension variable avoids shadowing the
    # topic counter, which the original did by reusing `index`.
    print([feature_names[word_id] for word_id in top_word_ids])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['place', 'visit', 'places', 'phone', 'time', 'ways', 'buy', 'laptop', 'movie', '2016', 'books', 'book', 'movies', 'way', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['recruit', 'differ', 'looking', 'use', 'sex', 'exist', 'time', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['facebook', 'friends', 'black', 'internet', 'free', 'easiest', 'home', 'easy', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['earth', 'did', 'death', 'changed', 'day', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['minister', 'company', 'engineering', 'china', 'olympics', 'available', 'business', 'job', 'country', 'spotify

## TASK: Add a new column to the original quora dataframe that labels each question into one of the 15 topic categories.

In [None]:
# Preview the frame as loaded, before any topic column is added.
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [None]:
# Project every question onto the fitted NMF topics. The result is indexed
# along axis 1 by topic in the next cell (one row per question, one column
# per topic).
topic_results = nmf_model.transform(document_term_matrix)

In [None]:
# Label each question with the index of its highest-weighted topic.
# (The original cell also evaluated this same argmax as a bare expression
# first; that value was discarded, so the duplicate computation is dropped.)
quora['Topic'] = topic_results.argmax(axis=1)

quora.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,12
9,Motorola (company): Can I hack my Charter Moto...,5


# Assigning a human-readable label to each topic (neither LDA nor NMF names its topics, so we have to choose a label ourselves from each topic's top words)

In [None]:
# dictionary:
# Hand-picked label for each NMF topic index, chosen by reading the topic's top words.
# NOTE(review): the label spellings ("miscalenious", "langauages") and the
# placeholder "bla" are kept exactly as authored because they are the values
# stored in the frame; rename them deliberately if cleaner labels are wanted.
my_topic_dictionary = {
    0: "Movies",
    1: "Relationships",
    2: "Questions",
    3: "Internet",
    4: "Life",
    5: "World",
    6: "programming",
    7: "election",
    8: "business",
    9: "culture",
    10: "economy",
    11: "miscalenious",
    12: "bla",
    13: "langauages",
    14: "money",
}

# Translate each numeric topic id into its label and show the resulting column.
quora["Topic_Label"] = quora["Topic"].map(my_topic_dictionary)
quora["Topic_Label"]

0                World
1             business
2             Internet
3         miscalenious
4                money
              ...     
404284     programming
404285            Life
404286    miscalenious
404287    miscalenious
404288         culture
Name: Topic_Label, Length: 404289, dtype: object

In [None]:
# Distinct labels that were actually assigned to at least one question.
quora["Topic_Label"].unique()

array(['World', 'business', 'Internet', 'miscalenious', 'money',
       'Relationships', 'Movies', 'economy', 'bla', 'Questions',
       'culture', 'election', 'programming', 'langauages', 'Life'],
      dtype=object)

In [None]:
# Preview the frame again, now including the Topic and Topic_Label columns.
quora.head()

Unnamed: 0,Question,Topic,Topic_Label
0,What is the step by step guide to invest in sh...,5,World
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8,business
2,How can I increase the speed of my internet co...,3,Internet
3,Why am I mentally very lonely? How can I solve...,11,miscalenious
4,"Which one dissolve in water quikly sugar, salt...",14,money


In [None]:
# Number of distinct topic ids that were assigned to at least one question.
quora["Topic"].nunique()

15

In [None]:
# Number of distinct labels in use; this should equal the number of distinct
# topic ids when every topic id has its own label in the mapping.
quora["Topic_Label"].nunique()

15