# Resolução da lista 2 de NLP
## Alunos:
    - Eduardo Brasil Araujo
    - Gideão Pinheiro

In [1]:
import pandas as pd

# Questão 1

# Questão 2

In [36]:
data = pd.read_csv('../datasets/website_classification.csv')

In [9]:
X_train = data.cleaned_website_text

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,2),
                                   stop_words='english',
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_features=1000)

X = count_vectorizer.fit_transform(X_train)

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25, 30],
  'learning_decay': [.5, .7]
}

lda = LatentDirichletAllocation(learning_method='online')

grid_search = GridSearchCV(lda, param_grid=search_params, n_jobs=1, verbose=1)
grid_search.fit(X)

print(f'Best model\'s Params: {grid_search.best_params_}')
print(f'Best log likelihood score: {grid_search.best_score_}')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best model's Params: {'learning_decay': 0.7, 'n_components': 5}
Best log likelihood score: -706514.8406924496


### Respondendo item (a) da 2ª questão:

- Referência: https://investigate.ai/text-analysis/choosing-the-right-number-of-topics-for-a-scikit-learn-topic-model/

É notório que o modelo LDA é melhor para realizar a avaliação de quantos
tópicos utilizar, pois ele proporciona uma métrica de avaliação bem definida,
diferentemente de outros modelos como NMF (pelo menos no scikit-learn).

Assim, foi utilizado este modelo para avaliar a quantidade de tópicos, e
então, foi feito uma busca em grade pela combinação de parâmetros mais bem
adaptados ao problema; com o GridSearchCV. 

O melhor resultado foi com número de tópicos igual a 5, com o decaimento de
aprendizado igual a 0.7.

In [46]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

In [48]:
# LDA
lda = LatentDirichletAllocation(n_components=5, learning_decay=0.7, learning_method='online')
lda.fit(X)

# Creating vectorizer to use subsequently
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
input_matrix = vectorizer.fit_transform(X_train).todense()

# SVD
svd = TruncatedSVD(n_components=5, algorithm='randomized', n_iter=100)
svd.fit(np.array(input_matrix))

# NMF
nmf = NMF(n_components=5)
nmf.fit_transform(np.array(input_matrix))

array([[0.093414  , 0.00646511, 0.        , 0.00404843, 0.00255859],
       [0.07848126, 0.00372779, 0.00130067, 0.        , 0.        ],
       [0.07879804, 0.        , 0.02438396, 0.        , 0.        ],
       ...,
       [0.03202933, 0.00337896, 0.        , 0.        , 0.        ],
       [0.0120509 , 0.00178291, 0.00112622, 0.00338767, 0.00955339],
       [0.02791929, 0.00742528, 0.01540482, 0.        , 0.05048693]])

In [63]:
def get_topic_list(model, feature_names, n_words):
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        top_n = [feature_names[i]
                for i in topic.argsort()
                [-n_words:]][::-1]
        top_features = ' '.join(top_n)
        topic_list.append(f"topic_{'_'.join(top_n[:3])}") 

        print(f"Topic {topic_idx}: {top_features}")
    return topic_list

### Respostas ao item (b) da 2ª questão:

- Referência: https://www.freecodecamp.org/news/advanced-topic-modeling-how-to-use-svd-nmf-in-python/

In [65]:
# LDA
feature_names = count_vectorizer.get_feature_names_out()

topic_list = get_topic_list(lda, feature_names, 5)

amounts = lda.transform(X) * 100
topics = pd.DataFrame(amounts, columns=topic_list)
topics.head()

Topic 0: recipe news read health new
Topic 1: game live play sport video
Topic 2: news december ago league late
Topic 3: information chat use new service
Topic 4: add sex gift view new


Unnamed: 0,topic_recipe_news_read,topic_game_live_play,topic_news_december_ago,topic_information_chat_use,topic_add_sex_gift
0,6.212656,0.050197,0.049967,21.914825,71.772355
1,1.637664,12.056837,0.046797,32.884258,53.374444
2,4.668011,0.107997,0.107036,44.754333,50.362623
3,0.024502,0.024772,0.685394,0.024827,99.240504
4,0.049466,0.04921,25.147949,41.185251,33.568123


In [54]:
input_matrix = np.array(input_matrix)

In [66]:
feature_names = vectorizer.get_feature_names_out()

topic_list = get_topic_list(svd, feature_names, 5)

amounts = svd.transform(input_matrix) * 100
topics = pd.DataFrame(amounts, columns=topic_list)
topics.head()

Topic 0: chat news new room free
Topic 1: chat room free teen chatroom
Topic 2: photography camera photographer photo portrait
Topic 3: recipe recipes chicken food cake
Topic 4: league cricket december photography news


Unnamed: 0,topic_chat_news_new,topic_chat_room_free,topic_photography_camera_photographer,topic_recipe_recipes_chicken,topic_league_cricket_december
0,20.581389,-4.391929,-1.151202,2.108471,-8.16075
1,16.284555,-3.573794,0.279942,0.933472,-8.429621
2,17.616277,-5.325883,4.078882,0.541913,-6.969882
3,6.994366,-2.534251,-1.629478,0.268829,-3.118451
4,24.604695,-5.784838,-1.01401,2.284761,-16.426014


In [67]:
topic_list = get_topic_list(nmf, feature_names, 5)

amounts = nmf.transform(input_matrix) * 100
topics = pd.DataFrame(amounts, columns=topic_list)
topics.head()

Topic 0: video online tv new service
Topic 1: chat room free online people
Topic 2: photography camera photographer photo portrait
Topic 3: recipe recipes chicken food soup
Topic 4: news league december ago cricket


Unnamed: 0,topic_video_online_tv,topic_chat_room_free,topic_photography_camera_photographer,topic_recipe_recipes_chicken,topic_news_league_december
0,9.341118,0.646504,0.0,0.404838,0.25643
1,7.848242,0.372753,0.130072,0.0,0.0
2,7.879874,0.0,2.438416,0.0,0.0
3,3.378191,0.0,0.0,0.0,0.405322
4,12.732086,0.0,0.0,0.0,0.0


### Resposta do item (c) da 2ª questão:

In [84]:
# Choosing documents
choosen_documents = X_train[:5]

num_topics = 3

count_vectorizer = CountVectorizer(ngram_range=(1,2),
                                   stop_words='english',
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_features=1000)

# LDA
lda = LatentDirichletAllocation(n_components=num_topics, learning_decay=0.7, learning_method='online')
lda.fit(count_vectorizer.fit_transform(choosen_documents))

# Creating vectorizer to use subsequently
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
input_matrix = vectorizer.fit_transform(X_train).todense()
input_matrix = np.array(input_matrix)

# SVD
svd = TruncatedSVD(n_components=num_topics, algorithm='randomized', n_iter=100)
svd.fit(input_matrix)

# NMF
nmf = NMF(n_components=num_topics)
nmf.fit_transform(input_matrix)

array([[0.08304664, 0.00999359, 0.        ],
       [0.06466344, 0.00703975, 0.00413756],
       [0.06555299, 0.00204139, 0.0282965 ],
       ...,
       [0.02650137, 0.00501781, 0.        ],
       [0.01635753, 0.00168611, 0.00091806],
       [0.05101699, 0.00620914, 0.01297947]])

In [86]:
print('LDA Topics')
lda_topics = get_topic_list(lda, count_vectorizer.get_feature_names_out(), 5)
print('SVD Topics')
svd_topics = get_topic_list(svd, vectorizer.get_feature_names_out(), 5)
print('NMF Topics')
nmf_topics = get_topic_list(nmf, vectorizer.get_feature_names_out(), 5)

LDA Topics
Topic 0: travel expedia site hotel sign
Topic 1: flights flight travel new site
Topic 2: flights flight new mumbai economy
SVD Topics
Topic 0: chat news new room free
Topic 1: chat room free teen chatroom
Topic 2: photography camera photographer photo portrait
NMF Topics
Topic 0: news new video december world
Topic 1: chat room free online people
Topic 2: photography camera photographer photo portrait


### Resposta do item (d) da 2ª questão:

In [95]:
for doc in choosen_documents:
    print('')
    print('#' * 120)
    new_doc = doc.split(' ')
    NUM_WORDS = 14
    for index, word in enumerate(new_doc):
        print(word, end=' ')
        if index % NUM_WORDS == 0:
            print('')
    # print(doc)


########################################################################################################################
official 
site good hotel accommodation big saving hotel destination worldwide browse hotel review find guarantee 
good price hotel budget lodging accommodation hotel hotels special offer package special weekend break 
city break deal budget cheap discount saving select language find deal hotel home try 
search connect traveller india travel talk community recommend destination flamborough boreland colvend catfield harberton 
warleggan inspiration trip spot winter wildlife beautiful snowy island bye bye work want spontechnaity 
tech drive travel vital value maximise travel homes guest love browse property type hotels 
apartments resorts villa cabins cottage glamping serviced apartment holiday home guest house hostels motels 
ryokans riads holiday park homestays campsites country house farm stay boats luxury tent self 
catering accommodation tiny house chapel saint l

# Questão 3

# Questão 4