In [1]:
import re

from bs4 import BeautifulSoup 

try:
    import gensim
except:
    !pip install gensim --user
finally:
    import gensim
    
from gensim.models import LdaModel
from gensim.corpora import Dictionary, MmCorpus
#from langdetect import detect
import numpy as np
import pandas as pd
try:
    import spacy
except: 
    !pip install spacy --user
finally:
    import spacy
    
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#import bokeh.plotting as bp
#from bokeh.plotting import save
#from bokeh.models import HoverTool

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#!pip install pyLDAvis --user
try:
    import pyLDAvis.sklearn
except:
    !pip install --user pyLDAvis
finally:
    import pyLDAvis.sklearn
    
# gensim bridge for pyLDAvis (not used by the cells visible in this part of the notebook).
import pyLDAvis.gensim
# Switch pyLDAvis to notebook mode — presumably so prepared visualisations
# render inline in the output cells; confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()

# Ask the inline matplotlib backend for 'retina' figures.
%config InlineBackend.figure_format = 'retina'

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above, which makes version-related breakage harder to spot.
import warnings
warnings.filterwarnings("ignore")

## First LDA attempt. Note: interesting clustering of badwords!

In [3]:
# Load the pickled DataFrame used by every cell below.
# Unpickling executes arbitrary code, so only point this at a trusted file.
preprocessed_path = '../data/df_preprocessed_eng.pckle'
df = pd.read_pickle(preprocessed_path)

### Vectorization

In [4]:
# Build a TF-IDF document-term matrix from the `desc_token` column, dropping
# English stop words and capping the vocabulary at 1000 terms.
# NOTE(review): despite its name, `tf_vectorizer` holds a TfidfVectorizer; LDA
# is commonly fit on raw term counts (CountVectorizer) — confirm TF-IDF
# weighting is intended here.
tf_vectorizer = TfidfVectorizer(analyzer="word", stop_words='english', max_features=1000)

vectorized = tf_vectorizer.fit_transform(df.desc_token)
# A bare `vectorized.shape` in the middle of a cell is never displayed, so
# print it explicitly.
print(vectorized.shape)

# `.toarray()` is the explicit spelling of the `.A` shorthand, which newer
# SciPy sparse types no longer provide. This densifies the whole matrix, so it
# is only suitable for a quick visual check.
vectorized.toarray()

### LDA model with 32 topics

In [6]:
# Shape of the array of distinct `category` values, i.e. how many categories exist.
df['category'].unique().shape

(32,)

In [9]:

# Fit a 32-topic LDA model on the document-term matrix (32 matches the number
# of distinct `df.category` values shown earlier in the notebook) and tabulate
# the highest-weighted words of every topic.
n_top_words = 10  # how many words to list per topic

# NOTE(review): n_jobs=40 hard-codes a worker count; n_jobs=-1 would use all
# available cores on whatever machine runs the notebook — confirm before changing.
lda_model = LatentDirichletAllocation(n_components=32, random_state=0, n_jobs=40)

X_topics = lda_model.fit_transform(vectorized)

# Resolve the vocabulary once rather than once per word per topic.
# `get_feature_names_out` is the current scikit-learn API; fall back to the
# older `get_feature_names` on releases that predate it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

word_dict = {}
for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so reverse it and keep the leading entries to get
    # the indices of the largest weights first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    word_dict[topic_idx] = [feature_names[i] for i in top_indices]

# One row per topic, one column per rank.
pd.DataFrame(word_dict).T

## LDA VIZ

In [12]:
# Interactive topic view; mds='pcoa' picks the principal-coordinate-analysis
# layout for the topic map.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='pcoa',
)

In [13]:
# Same view with mds='mmds' (metric multidimensional scaling) for the topic map.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='mmds',
)

In [None]:
# Same view with mds='tsne' (t-SNE) for the topic map.
# remember: topic 25 is cool!
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='tsne',
)

# Check after "badwords" cleansing

In [5]:
# Rebuild the TF-IDF document-term matrix, this time from the
# `desc_stemm_no_badwords` column, dropping English stop words and capping the
# vocabulary at 1000 terms.
# NOTE(review): despite its name, `tf_vectorizer` holds a TfidfVectorizer; LDA
# is commonly fit on raw term counts (CountVectorizer) — confirm TF-IDF
# weighting is intended here.
tf_vectorizer = TfidfVectorizer(analyzer="word", stop_words='english', max_features=1000)

vectorized = tf_vectorizer.fit_transform(df.desc_stemm_no_badwords)
# A bare `vectorized.shape` in the middle of a cell is never displayed, so
# print it explicitly.
print(vectorized.shape)

# `.toarray()` is the explicit spelling of the `.A` shorthand, which newer
# SciPy sparse types no longer provide. This densifies the whole matrix, so it
# is only suitable for a quick visual check.
vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# Refit the 32-topic LDA model on the current document-term matrix and
# tabulate the highest-weighted words of every topic.
n_top_words = 10  # how many words to list per topic

# NOTE(review): n_jobs=40 hard-codes a worker count; n_jobs=-1 would use all
# available cores on whatever machine runs the notebook — confirm before changing.
lda_model = LatentDirichletAllocation(n_components=32, random_state=0, n_jobs=40)

X_topics = lda_model.fit_transform(vectorized)

# Resolve the vocabulary once rather than once per word per topic.
# `get_feature_names_out` is the current scikit-learn API; fall back to the
# older `get_feature_names` on releases that predate it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

word_dict = {}
for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so reverse it and keep the leading entries to get
    # the indices of the largest weights first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    word_dict[topic_idx] = [feature_names[i] for i in top_indices]

# One row per topic, one column per rank.
pd.DataFrame(word_dict).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,march,vote,nation,state,scienc,commun,action,support,hous,societi
1,hike,climb,particip,respons,organ,risk,agre,injuri,liabil,safeti
2,yoga,class,mat,flow,breath,bodi,practic,medit,level,relax
3,heal,medit,energi,bodi,yoga,mind,stress,sound,class,emot
4,parti,night,danc,drink,dj,bar,dress,singl,music,loung
5,english,languag,spanish,convers,speak,french,japanes,exchang,practic,speaker
6,volleybal,play,beach,sport,song,shirt,level,black,fun,skill
7,discuss,write,book,read,project,topic,work,talk,writer,share
8,ride,bike,rout,mile,rider,road,pace,cycl,park,trail
9,movi,comedi,film,screen,star,watch,stori,night,book,seat


In [8]:
# Interactive topic view of the refit model; mds='pcoa' picks the
# principal-coordinate-analysis layout for the topic map.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='pcoa',
)

In [9]:
# Same view of the refit model with mds='mmds' (metric multidimensional
# scaling) for the topic map.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='mmds',
)