<a href="https://colab.research.google.com/github/ghdakrk/-NLP-/blob/main/gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# gensim package
!pip install gensim
!pip install pyLDAvis



In [2]:
corpus = [
          'This is the first document.',
          'This is the second second document.',
          'And the third one.',
          'Is this the first document?',
          'The last document?',
]
token_list = [[text for text in doc.split()] for doc in corpus]
token_list

[['This', 'is', 'the', 'first', 'document.'],
 ['This', 'is', 'the', 'second', 'second', 'document.'],
 ['And', 'the', 'third', 'one.'],
 ['Is', 'this', 'the', 'first', 'document?'],
 ['The', 'last', 'document?']]

In [3]:
from gensim.corpora import Dictionary

dictionary = Dictionary(token_list)
dictionary.token2id

{'And': 6,
 'Is': 9,
 'The': 12,
 'This': 0,
 'document.': 1,
 'document?': 10,
 'first': 2,
 'is': 3,
 'last': 13,
 'one.': 7,
 'second': 5,
 'the': 4,
 'third': 8,
 'this': 11}

In [4]:
term_matrix = [dictionary.doc2bow(token) for token in token_list]
term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (1, 1), (3, 1), (4, 1), (5, 2)],
 [(4, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (4, 1), (9, 1), (10, 1), (11, 1)],
 [(10, 1), (12, 1), (13, 1)]]

In [5]:
from gensim.models import TfidfModel

tfidf = TfidfModel(term_matrix)

for doc in tfidf[term_matrix]:
    print("doc:")
    for k, v in doc:
        print(k, v)

doc:
0 0.49633406058198626
1 0.49633406058198626
2 0.49633406058198626
3 0.49633406058198626
4 0.12087183801361165
doc:
0 0.25482305694621393
1 0.25482305694621393
3 0.25482305694621393
4 0.0620568558708622
5 0.8951785160431313
doc:
4 0.07979258234193365
6 0.5755093812740171
7 0.5755093812740171
8 0.5755093812740171
doc:
2 0.3485847413542797
4 0.08489056411237639
9 0.6122789185961829
10 0.3485847413542797
11 0.6122789185961829
doc:
10 0.37344696513776354
12 0.6559486886294514
13 0.6559486886294514


In [6]:
# topic modeling
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(
    categories=['comp.graphics', 'rec.sport.baseball', 'sci.med']
)

In [7]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
# 명사추출
%%time
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = [pos_tag(word_tokenize(doc)) for doc in newsgroups.data]
nouns_list = [[t[0] for t in doc if t[1].startswith("N")] for doc in tagged_list]

CPU times: user 24 s, sys: 186 ms, total: 24.1 s
Wall time: 24.1 s


In [9]:
# 표제어 추출
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

nouns_list = [[lm.lemmatize(w, pos="n") for w in doc] for doc in nouns_list]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# 불용어 제거
import re
token_list = [[text.lower() for text in doc] for doc in nouns_list]
token_list = [[re.sub(r"[^A-Za-z]+" , "", word) for word in doc] for doc in token_list]

In [11]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words += ["", "subject", "article", "line", "year", "month", "address", "keyword", "msg"]

token_list = [[word for word in doc if (word not in stop_words) and (2 < len(word) < 10)]
              for doc in token_list]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# 결과
from gensim import corpora

dictionary = corpora.Dictionary(token_list)
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in token_list]

In [13]:
%%time
from gensim.models.ldamodel import LdaModel

model = LdaModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=3,)

CPU times: user 2.58 s, sys: 164 ms, total: 2.74 s
Wall time: 2.56 s


In [14]:
model.print_topics()

[(0,
  '0.014*"lines" + 0.007*"image" + 0.005*"time" + 0.005*"people" + 0.004*"file" + 0.004*"computer" + 0.004*"banks" + 0.004*"problem" + 0.003*"geb" + 0.003*"program"'),
 (1,
  '0.010*"lines" + 0.007*"image" + 0.006*"time" + 0.006*"game" + 0.006*"team" + 0.005*"file" + 0.004*"program" + 0.004*"anyone" + 0.003*"david" + 0.003*"player"'),
 (2,
  '0.012*"lines" + 0.005*"science" + 0.004*"baseball" + 0.004*"game" + 0.004*"point" + 0.004*"way" + 0.003*"number" + 0.003*"time" + 0.003*"center" + 0.003*"food"')]

In [17]:
# visualize
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model, doc_term_matrix, dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
