In [1]:

from gensim.models import LdaModel
from gensim.corpora import Dictionary, MmCorpus
import numpy as np
import pandas as pd
    
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#import bokeh.plotting as bp
#from bokeh.plotting import save
#from bokeh.models import HoverTool

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#!pip install pyLDAvis --user
try:
    import pyLDAvis.sklearn
except:
    !pip install --user pyLDAvis
finally:
    import pyLDAvis.sklearn
    
# Load the gensim adapter of pyLDAvis as well and switch pyLDAvis to
# inline notebook rendering.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

# Request the 'retina' figure format from the inline plotting backend.
%config InlineBackend.figure_format = 'retina'

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

## First LDA attempt. Note: interesting clustering of badwords!

In [4]:
# Load the preprocessed event descriptions.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
preprocessed_path = '../data/df_preprocessed_eng_remap.pckle'
df = pd.read_pickle(preprocessed_path)

# Preview the first rows.
df.head()

Unnamed: 0,description,event_id,category,category_label,lang,lang_reliab,desc_stemm,desc_lemm,desc_stemm_no_badwords,desc_lemm_no_badwords,remap_category,remap_category_label
0,"<p><span>Hey explorersssss, what's up?</span><...",239719250,language/ethnic identity,15,English,99,secret spot hong kong night view food adventur...,secret spot hong kong night view food adventur...,secret spot hong kong night view food adventur...,secret spot hong kong night view food adventur...,language/ethnic identity,11
1,"<p>Free, unauditioned, collaborative pop choir...",gpjktmywhbnb,music,18,English,99,sing us free unaudit collabor pop choir identi...,sing u free unauditioned collaborative pop cho...,sing us unaudit collabor pop choir identifi wo...,sing u unauditioned collaborative pop choir id...,music,13
2,"<p>We provide a forum to learn about, promote ...",drrtzmywhbgb,career/business,3,English,98,east valley busi network meetup provid forum l...,east valley business network meetup provide fo...,east valley busi network provid forum learn pr...,east valley business network provide forum lea...,career/business,1
3,<p><b>【WhyNot!?JAPAN + MeetUp Collaboration Ev...,239719229,socializing,27,English,92,friendli friday whynot japan meetup collabor e...,friendly friday whynot japan meetup collaborat...,friendli whynot japan collabor whynot japan si...,friendly whynot japan collaboration whynot jap...,socializing,20
4,<p>This is an introductory meeting to get to k...,mtzxwmywjbjb,tech,30,English,99,authent option introductori meet get know fell...,authentication option introductory meet get kn...,authent option introductori get fellow coder s...,authentication option introductory get fellow ...,tech,22


### Vectorization

In [6]:
# Build the document-term matrix from the lemmatized descriptions, keeping at
# most the 1000 highest-frequency terms.
# NOTE(review): LatentDirichletAllocation is usually fit on raw term counts
# (CountVectorizer); TF-IDF weights are used here - confirm this is intended.
tf_vectorizer = TfidfVectorizer(analyzer="word", max_features=1000)
vectorized = tf_vectorizer.fit_transform(df.desc_lemm)

# Sanity check: one matrix row per description. (A bare `vectorized.shape` in
# the middle of a cell is never displayed, so assert the invariant instead.)
assert vectorized.shape[0] == len(df)

# Dense preview of the sparse matrix. `.toarray()` replaces the `.A`
# shorthand, which newer SciPy releases no longer provide.
vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### LDA model with 24 topics

In [7]:
# Shape of the array of distinct remapped category names.
df['remap_category'].unique().shape

(24,)

In [8]:

# Fit a 24-topic LDA model on the document-term matrix; `X_topics` holds the
# per-document topic distribution returned by fit_transform.
lda_model = LatentDirichletAllocation(n_components=24, random_state=0, n_jobs=40)
X_topics = lda_model.fit_transform(vectorized)

# Resolve the vocabulary once instead of once per topic. Prefer
# `get_feature_names_out` (current scikit-learn) and fall back to the legacy
# `get_feature_names` on older releases; the `or` short-circuits, so the
# legacy attribute is never touched when the new one exists.
_get_names = getattr(tf_vectorizer, "get_feature_names_out", None) or tf_vectorizer.get_feature_names
feature_names = _get_names()

# For each topic keep the `top_n` highest-weighted terms, strongest first.
top_n = 10
word_dict = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_term_ids = topic.argsort()[:-top_n - 1:-1]
    word_dict[topic_idx] = [feature_names[i] for i in top_term_ids]

# One row per topic, one column per term rank.
pd.DataFrame(word_dict).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,movie,night,pm,happy,beer,hour,drink,tuesday,meet,wednesday
1,workout,fitness,class,training,body,get,exercise,water,fun,session
2,life,learn,workshop,help,work,change,relationship,health,self,talk
3,data,project,talk,use,developer,learn,technology,design,work,cloud
4,business,estate,real,marketing,investor,market,learn,property,strategy,invest
5,walk,coffee,park,tea,meet,garden,bring,picnic,let,lunch
6,ticket,music,wine,event,band,dance,pm,night,live,party
7,dance,salsa,english,class,lesson,language,spanish,conversation,pm,beginner
8,business,network,event,group,professional,networking,woman,meeting,meet,member
9,sing,song,study,church,child,music,god,sunday,service,group


## LDA VIZ

In [9]:
# Interactive topic panel using the 'pcoa' layout option.
# Author's note: see cluster 17.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='pcoa',
)

In [17]:
# Prepare the 'mmds' layout and keep the result in `p`.
# Author's note: 30 32 31 19 (presumably topic ids from an earlier run with
# more topics - confirm).
p = pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='mmds',
)


In [None]:
# Write the prepared visualisation `p` to 'lda.html' (path is relative to the
# notebook's working directory).
pyLDAvis.save_html(p, 'lda.html')

In [11]:
# Display the 'mmds' panel already prepared as `p` in an earlier cell, instead
# of recomputing it with the same model, matrix and vectorizer.
# Author's note: 30 32 31 19 (presumably topic ids from an earlier run with
# more topics - confirm).
p

In [12]:
# Interactive topic panel using the 'tsne' layout option.
# Author's note: 32 (presumably a topic id from an earlier run - confirm).
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='tsne',
)

# Check after "badwords" cleansing

In [13]:
# Rebuild the document-term matrix from the descriptions with "badwords"
# removed, additionally dropping English stop words and keeping at most the
# 1000 highest-frequency terms.
# NOTE(review): LatentDirichletAllocation is usually fit on raw term counts
# (CountVectorizer); TF-IDF weights are used here - confirm this is intended.
tf_vectorizer = TfidfVectorizer(analyzer="word", stop_words='english', max_features=1000)
vectorized = tf_vectorizer.fit_transform(df.desc_lemm_no_badwords)

# Sanity check: one matrix row per description. (A bare `vectorized.shape` in
# the middle of a cell is never displayed, so assert the invariant instead.)
assert vectorized.shape[0] == len(df)

# Dense preview of the sparse matrix. `.toarray()` replaces the `.A`
# shorthand, which newer SciPy releases no longer provide.
vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Refit a 24-topic LDA model on the current document-term matrix; `X_topics`
# holds the per-document topic distribution returned by fit_transform.
lda_model = LatentDirichletAllocation(n_components=24, random_state=0, n_jobs=40)
X_topics = lda_model.fit_transform(vectorized)

# Resolve the vocabulary once instead of once per topic. Prefer
# `get_feature_names_out` (current scikit-learn) and fall back to the legacy
# `get_feature_names` on older releases; the `or` short-circuits, so the
# legacy attribute is never touched when the new one exists.
_get_names = getattr(tf_vectorizer, "get_feature_names_out", None) or tf_vectorizer.get_feature_names
feature_names = _get_names()

# For each topic keep the `top_n` highest-weighted terms, strongest first.
top_n = 10
word_dict = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_term_ids = topic.argsort()[:-top_n - 1:-1]
    word_dict[topic_idx] = [feature_names[i] for i in top_term_ids]

# One row per topic, one column per term rank.
pd.DataFrame(word_dict).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,meeting,topic,discussion,talk,monthly,discuss,location,open,idea,share
1,party,drink,single,prepay,dress,night,bar,mingle,purchase,code
2,yoga,class,body,mat,flow,practice,movement,level,strength,exercise
3,meditation,practice,mind,yoga,class,experience,mindfulness,stress,life,body
4,dinner,food,comedy,restaurant,menu,drink,vegan,cook,dish,eat
5,class,course,art,learn,workshop,student,basic,skill,training,session
6,business,network,estate,real,startup,learn,investor,professional,marketing,entrepreneur
7,coffee,english,language,conversation,write,spanish,french,chat,tea,practice
8,ride,hike,trail,mile,bike,park,run,route,pace,road
9,game,play,player,team,night,soccer,board,league,tournament,fun


In [15]:
# Interactive topic panel for the current model using the 'pcoa' layout option.
# Author's note: see cluster 17.
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='pcoa',
)

In [16]:
# Interactive topic panel for the current model using the 'mmds' layout option.
# Author's note: 30 32 31 19 (presumably topic ids from an earlier run with
# more topics - confirm).
pyLDAvis.sklearn.prepare(
    lda_model,
    vectorized,
    tf_vectorizer,
    mds='mmds',
)