# <center> Topic Modeling with Gensim </center>
---

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Load the training split, stripping headers, footers and quoted replies
# so that only the body text of each post remains.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [2]:
from nltk.corpus import stopwords
# English stop words from NLTK, loaded once and reused by tokenize().
cachedStopWords = stopwords.words("english")

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re

In [3]:
def tokenize(text, min_length=3):
    """Convert raw text into a list of stemmed, lower-cased word tokens.

    Pipeline: NLTK word tokenization -> lower-casing -> English stop-word
    removal -> Porter stemming -> keep only stems that begin with an ASCII
    letter and are at least ``min_length`` characters long.

    Args:
        text: Raw document string.
        min_length: Minimum number of characters a stem must have to be kept.

    Returns:
        A list of stems, in the order they appear in ``text``.
    """
    # Build the helpers once per call instead of once per token.
    stemmer = PorterStemmer()
    stop_words = set(cachedStopWords)  # set membership instead of a list scan

    # match() anchors only at the start of the token, so a stem such as
    # 'positions/orient' (letters followed by other characters) is kept.
    starts_with_letters = re.compile('[a-zA-Z]+').match

    lowered = map(str.lower, word_tokenize(text))
    stems = (stemmer.stem(word) for word in lowered if word not in stop_words)

    return [
        stem for stem in stems
        if starts_with_letters(stem) and len(stem) >= min_length
    ]

In [4]:
# Tokenize every post; docs[i] is the token list for newsgroups_train.data[i].
docs = list(map(tokenize, newsgroups_train.data))

In [5]:
# Show the first raw post followed by its token list, one per line.
print(newsgroups_train.data[0], docs[0], sep='\n')

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
['notic', 'save', 'model', 'map', 'plane', 'posit', 'care', 'file', 'reload', 'restart', 'given', 'default', 'posit', 'orient', 'save', 'file', 'positions/orient', 'preserv', 'anyon', 'know', 'inform', 'store', 'file', 'noth', 'explicitli', 'said', 'manual', 'save', 'textur', 'rule', 'file', 'like', 'abl', 'read', 'textur', 'rule', 'inform', 'anyon', 'format', 'file', 'file', 'format', 'a

Gensim으로 topic modeling을 하기 위해서는 Dictionary, Corpus, Model의 3단계를 거쳐야 함

In [6]:
from gensim.corpora import Dictionary

# Step 1 (Dictionary): map every distinct token in the corpus to an integer id.
dictionary = Dictionary(documents=docs)
print('Number of unique words in initital documents:', len(dictionary))

# Prune the vocabulary in place: drop tokens that occur in fewer than 3
# documents or in more than 50% of all documents.
dictionary.filter_extremes(
    no_below=3,
    no_above=0.5,
)
print('Number of unique words after removing rare and common words:', len(dictionary))

Number of unique words in initital documents: 19839
Number of unique words after removing rare and common words: 5630


In [7]:
# Step 2 (Corpus): turn each token list into a bag-of-words vector of
# (token_id, count) pairs using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, docs))

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 5630
Number of documents: 2034


In [8]:
from gensim.models import LdaModel

num_topics = 25
chunksize = 500
passes = 20 
iterations = 40
eval_every = 1  

temp = dictionary[0]  
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

Wall time: 49.9 s


In [9]:
# Requires the pyLDAvis package (install with: %pip install pyldavis).
try:
    import pyLDAvis.gensim
except ImportError:
    # pyLDAvis 3.x renamed the gensim adapter module to `gensim_models`.
    # Expose it under the old name so `pyLDAvis.gensim.prepare` keeps working.
    import pyLDAvis.gensim_models
    pyLDAvis.gensim = pyLDAvis.gensim_models

# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()

# Silence DeprecationWarning noise for the rest of the session.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [10]:
# Build the interactive topic visualization; as the last expression of the
# cell, the result is rendered inline.
pyLDAvis.gensim.prepare(
    topic_model=model,
    corpus=corpus,
    dictionary=dictionary,
)