In [1]:
import re

from bs4 import BeautifulSoup 
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary, MmCorpus
from langdetect import detect
import numpy as np
import pandas as pd
try:
    import spacy
except: 
    !pip install spacy #--user
finally:
    import spacy
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#import bokeh.plotting as bp
#from bokeh.plotting import save
#from bokeh.models import HoverTool

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#!pip install pyLDAvis --user 
import pyLDAvis.sklearn
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

## First LDA attempt. Note: interesting clustering of bad words!

In [2]:
# Load the pre-processed Meetup descriptions from a CSV in the working directory.
# NOTE(review): the displayed frame contains `Unnamed: 0.1` / `Unnamed: 0` columns,
# which presumably are index columns written out by earlier `to_csv` calls —
# confirm, and consider dropping them or saving with `index=False`.
meetup_topcats_samp = pd.read_csv('processed_description_sine_error.csv')

In [3]:
# Display the loaded frame to inspect its columns and a few sample rows.
meetup_topcats_samp

Unnamed: 0.1,Unnamed: 0,description,category_bin,lang_ok,processed_description
0,0,<p>FOOD AND FELLOWSHIP | 5 PM</p> \n<p>CONCERT...,27,english,food fellowship pm concert pm may th join us m...
1,1,<p><b>【WhyNot!?JAPAN + MeetUp Collaboration Ev...,28,english,whynot japan meetup collabor event peopl come ...
2,2,<p>TENTATIVE<br></p>\n<p><br></p>\n<p>10.30 - ...,31,english,tentat introduct orient session break network ...
3,3,"<p><span>Hey explorersssss, what's up?</span><...",15,english,hey explorersssss say night view hong kong vic...
4,4,"<p>Free, unauditioned, collaborative pop choir...",18,english,free unaudit collabor pop choir identifi women...
5,6,"<p>We provide a forum to learn about, promote ...",3,english,provid forum learn promot grow member busi
6,7,<p><b>【WhyNot!?JAPAN + MeetUp Collaboration Ev...,28,english,whynot japan meetup collabor event peopl come ...
7,8,<p>This is an introductory meeting to get to k...,31,english,introductori meet get know fellow coder share ...
8,9,"<p>The Future of the IoT with Rajeev Suri, CEO...",31,english,futur iot rajeev suri ceo kathrin buvac chief ...
9,10,"<p><img src=""https://secure.meetupstatic.com/p...",13,english,breakthrough weight loss wk workshop lose weig...


In [24]:
# Keep only rows that have a processed description: the text vectorizer used
# below cannot consume NaN documents. `notna()` selects the same rows as the
# previous `isna() != True` mask, stated directly.
meetup_topcats_samp = meetup_topcats_samp[meetup_topcats_samp.processed_description.notna()]

### Vectorization

In [25]:
# Word-level TF-IDF features, English stop words removed, vocabulary capped
# at the 1000 highest-frequency terms.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
# confirm that TF-IDF weighting is intentional here.
tf_vectorizer = TfidfVectorizer(
    analyzer="word",
    stop_words="english",
    max_features=1000,
)

In [26]:
# Learn the vocabulary and build the sparse document-term matrix
# (one row per description, one column per retained term).
vectorized = tf_vectorizer.fit_transform(meetup_topcats_samp.processed_description)
vectorized.shape

# Dense preview of the matrix. `.toarray()` is the explicit form of the `.A`
# shorthand, which recent SciPy releases have deprecated/dropped on sparse
# containers; the result is the same ndarray.
vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### LDA model with 33 topics

In [29]:
# LDA topic model: 33 topics, fixed seed so repeated runs give the same
# topics, fitted using 4 parallel jobs.
lda_model = LatentDirichletAllocation(
    n_components=33,
    random_state=0,
    n_jobs=4,
)

In [30]:
# Fit the LDA model on the vectorized descriptions and keep the transformed
# output (per scikit-learn, the document-topic distribution: one row per
# description, one column per topic).
X_topics = lda_model.fit_transform(vectorized)

In [31]:
# Number of highest-weighted terms to list for each topic.
N_TOP_WORDS = 10

# Resolve the vocabulary once instead of rebuilding it for every word of every
# topic. Newer scikit-learn exposes `get_feature_names_out`; older releases
# only have `get_feature_names`, so fall back to it when needed.
_get_names = getattr(tf_vectorizer, "get_feature_names_out", None) or tf_vectorizer.get_feature_names
feature_names = _get_names()

# Map topic index -> its top terms, ordered from highest to lowest weight.
word_dict = {
    topic_idx: [feature_names[i] for i in topic.argsort()[::-1][:N_TOP_WORDS]]
    for topic_idx, topic in enumerate(lda_model.components_)
}

# One row per topic, one column per rank.
pd.DataFrame(word_dict).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,workout,fit,class,train,bodi,weight,session,exercis,pm,rd
1,game,play,bring,board,night,fun,drink,pm,prize,tabl
2,event,danc,ticket,band,pm,parti,music,night,drink,test
3,heal,energi,reiki,bodi,medit,class,practic,spiritu,sound,yoga
4,menu,food,cook,restaur,dish,eat,dinner,meal,pleas,soon
5,ride,hike,trail,mile,bike,park,rout,road,pace,rider
6,hike,particip,respons,event,agre,organ,risk,injuri,liabil,safeti
7,busi,network,market,event,meet,profession,entrepreneur,group,help,grow
8,paddl,boat,water,kayak,beach,river,climb,fish,bring,park
9,music,perform,comedi,song,sing,festiv,ticket,pm,artist,art


## LDA VIZ

In [39]:
# Interactive pyLDAvis view of the fitted topics, laid out with the 'pcoa'
# scaling method for the inter-topic distance map.
pyLDAvis.sklearn.prepare(lda_model, vectorized, tf_vectorizer, mds='pcoa')

In [40]:
# Same visualization, with the inter-topic map laid out using the 'mmds'
# scaling method for comparison.
pyLDAvis.sklearn.prepare(lda_model, vectorized, tf_vectorizer, mds='mmds')

In [41]:
# Same visualization, with the inter-topic map laid out using the 'tsne'
# scaling method for comparison.
pyLDAvis.sklearn.prepare(lda_model, vectorized, tf_vectorizer, mds='tsne')  # Reminder: topic 25 looks interesting in this view.