In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm_notebook # progress bar
from gensim import corpora # corpus LDA로 돌릴 수 있는 형태로 변환해주는 기능 
from gensim import models

import warnings # 경고 알림 제거
import nltk

warnings.filterwarnings("ignore", category=DeprecationWarning) # 경고 알림이 뜨면 모두 무시합니다.

## 데이터 불러오기

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

In [3]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [4]:
y.unique()

array([3, 2, 1, 4, 0])

In [5]:
count_vect = CountVectorizer(stop_words='english')
feat_vect = count_vect.fit_transform(X)

#### LDA 객체 생성 후 Count 피처 벡터화 객체로 LDA수행

In [6]:
lda = LatentDirichletAllocation(n_components=5, random_state=13)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=13, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [7]:
print(lda.components_.shape)
lda.components_

(5, 34416)


array([[3.66522832, 0.22017879, 0.23746337, ..., 1.18686389, 0.200005  ,
        0.20000261],
       [0.24296878, 0.20964094, 0.20343087, ..., 0.21446174, 0.20857241,
        0.20001067],
       [0.2015402 , 0.20043959, 0.20045399, ..., 0.20186388, 0.2576835 ,
        1.1957053 ],
       [2.6868477 , 7.16653139, 0.2018153 , ..., 0.20000465, 0.20046145,
        0.20058243],
       [0.203415  , 0.20320929, 4.15683647, ..., 2.19680583, 1.13327764,
        0.20369898]])

In [8]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '1000',
 '109',
 '10_s_',
 '11',
 '114th',
 '117',
 '12',
 '120',
 '126b',
 '127',
 '129',
 '12_s_',
 '12th',
 '13',
 '13th',
 '14',
 '140',
 '1429',
 '1456',
 '146m',
 '14th',
 '15',
 '150',
 '15_th',
 '15th',
 '15º',
 '16',
 '1647',
 '1676',
 '16a',
 '16th',
 '17',
 '171',
 '1715',
 '1733',
 '1742',
 '1745',
 '1748',
 '1749',
 '1750',
 '1751',
 '1756',
 '1757',
 '1764',
 '1767',
 '1772',
 '1792',
 '17__',
 '17_th_',
 '18',
 '1803',
 '1810',
 '1812',
 '1814',
 '1820',
 '1826',
 '1830',
 '1840',
 '1855',
 '1856',
 '1859',
 '1860',
 '1861',
 '1862',
 '1865',
 '1869',
 '1870',
 '1874',
 '1875',
 '1876',
 '1878',
 '1882',
 '1883',
 '1884',
 '1887',
 '1888',
 '1890',
 '1891',
 '1894',
 '1895',
 '1898',
 '18th',
 '19',
 '1908',
 '1914',
 '19o',
 '1_s_',
 '1st',
 '20',
 '200',
 '21',
 '21st',
 '22',
 '220',
 '221b',
 '22nd',
 '23',
 '23l',
 '23rd',
 '24',
 '247',
 '2473',
 '249',
 '25',
 '250',
 '25º',
 '26',
 '26th',
 '27',
 '270',
 '2704',
 '28',
 '28th',
 '29',
 '2

## 텍스트 전처리

In [9]:
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [10]:
text = train[['text']]
text.head()

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."


### word_tokenize 진행

In [11]:
text['text'] = text.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text
0,"[He, was, almost, choking, ., There, was, so, ..."
1,"[“, Your, sister, asked, for, it, ,, I, suppos..."
2,"[She, was, engaged, one, day, as, she, walked,..."
3,"[The, captain, was, in, the, porch, ,, keeping..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


### 불용어 처리

In [12]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['text'] = text['text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text
0,"[He, almost, choking, ., There, much, ,, much,..."
1,"[“, Your, sister, asked, ,, I, suppose, ?, ”]"
2,"[She, engaged, one, day, walked, ,, perusing, ..."
3,"[The, captain, porch, ,, keeping, carefully, w..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


### 표제어 추출로 3인칭 단수 표현을 1인칭으로 바꾸고, 과거 현재형 동사를 현재형으로 바꿈

In [13]:
from nltk.stem import WordNetLemmatizer
text['text'] = text['text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text
0,"[He, almost, choke, ., There, much, ,, much, w..."
1,"[“, Your, sister, ask, ,, I, suppose, ?, ”]"
2,"[She, engage, one, day, walk, ,, peruse, Jane,..."
3,"[The, captain, porch, ,, keep, carefully, way,..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, fli..."


### 길이가 3이하인 단어에 대해서 제거

In [14]:
tokenized_doc = text['text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0    [almost, choke, There, much, much, want, stran...
1                              [Your, sister, suppose]
2    [engage, walk, peruse, Jane, last, letter, dwe...
3    [captain, porch, keep, carefully, treacherous,...
4    [Have, mercy, gentlemen, odin, fling, hand, wr...
Name: text, dtype: object

In [15]:
# 문서-단어 행렬 만들기
# 어휘(vocabulary) 학습
dictionary = corpora.Dictionary(tokenized_doc)
# 문서-단어 행렬(document-term matrix) 생성
corpus = [dictionary.doc2bow(text) for text in tokenized_doc] 

In [16]:
print(dictionary)

Dictionary(39330 unique tokens: ['Pole', 'There', 'almost', 'bundle', 'choke']...)


In [17]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[0][:5]

[(0, 0.3113167448397729),
 (1, 0.1397987372029451),
 (2, 0.1738269926683925),
 (3, 0.2717629881973959),
 (4, 0.31230125880573745)]

In [18]:
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=5, id2word=dictionary)

In [19]:
model.show_topic(3, 10)

[('Where', 0.004701606),
 ('Nothing', 0.003899438),
 ('pray', 0.0038925377),
 ('Indeed', 0.0029883625),
 ('lawyer', 0.002967461),
 ('odin', 0.0029194227),
 ('Trot', 0.0027311717),
 ('peace', 0.002453033),
 ('Jack', 0.0024523023),
 ('falter', 0.0023520484)]

In [20]:
# 토픽 개수, 키워드 개수를 정해주는 변수를 추가.
NUM_TOPICS = 5

NUM_TOPIC_WORDS = 200


def build_doc_term_mat(documents):
    # 문서-단어 행렬 만들어주는 함수.
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
        
    return corpus, dictionary


def print_topic_words(model):

    # 토픽 모델링 결과를 출력해 주는 함수.
    print("\nPrinting topic words.\n")
    
    for topic_id in range(model.num_topics):
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID: {}".format(topic_id))
        
        for topic_word, prob in topic_word_probs:
            print("\t{}\t{}".format(topic_word, prob))
            
        print("\n")

# document-term matrix를 만들고,
corpus, dictionary = build_doc_term_mat(tokenized_doc)
# LDA를 실행.
model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha="auto", eta="auto")
# 결과를 출력.
print_topic_words(model)

Building document-term matrix.

Printing topic words.

Topic ID: 0
	odin	0.02469189651310444
	hand	0.015621831640601158
	look	0.011687669903039932
	come	0.011275449767708778
	upon	0.010625818744301796
	take	0.00864425115287304
	back	0.008244970813393593
	room	0.007952241227030754
	head	0.007783978711813688
	stand	0.007176188752055168
	door	0.0070940060541033745
	turn	0.006844899617135525
	face	0.006041820161044598
	open	0.006012520752847195
	time	0.005682837218046188
	leave	0.005323535297065973
	house	0.004894954152405262
	still	0.004816390573978424
	away	0.0047650462947785854
	walk	0.004720771219581366
	round	0.004685269668698311
	night	0.004380781203508377
	table	0.004345282446593046
	little	0.0043334029614925385
	make	0.0038694266695529222
	light	0.003804920706897974
	like	0.0037545529194176197
	hold	0.003728579031303525
	long	0.0036261258646845818
	shake	0.0035958418156951666
	pass	0.0035934385377913713
	There	0.0035828715190291405
	last	0.0035738779697567225
	step	0.00346448295749

## 시각화

In [21]:
# pyLDAvis 불러오기
import pyLDAvis
import pyLDAvis.gensim

# pyLDAvis를 jupyter notebook에서 실행할 수 있게 활성화.
pyLDAvis.enable_notebook()

# pyLDAvis 실행.
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
pyLDAvis.save_html(data, 'lda.html')

  and should_run_async(code)
