In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm_notebook # progress bar
from gensim import corpora # corpus LDA로 돌릴 수 있는 형태로 변환해주는 기능 
from gensim import models

import warnings # 경고 알림 제거
import nltk

warnings.filterwarnings("ignore", category=DeprecationWarning) # 경고 알림이 뜨면 모두 무시합니다.

## 데이터 불러오기

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

In [3]:
X = test_x.loc[:, 'text']
# y = train.loc[:, 'author']

In [4]:
count_vect = CountVectorizer(stop_words='english')
feat_vect = count_vect.fit_transform(X)

#### LDA 객체 생성 후 Count 피처 벡터화 객체로 LDA수행

In [5]:
lda = LatentDirichletAllocation(n_components=5, random_state=13)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=13, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [6]:
print(lda.components_.shape)
lda.components_

(5, 32418)


array([[5.64006211, 0.20220658, 0.2035611 , ..., 1.33513664, 0.20217875,
        0.20400287],
       [0.20215172, 0.20135254, 0.20000198, ..., 0.20085659, 0.20013418,
        0.20000045],
       [0.2000034 , 0.20819362, 0.20000426, ..., 2.05770623, 2.18800942,
        0.20000083],
       [0.20285276, 0.20165163, 1.1938552 , ..., 0.20444586, 0.2054561 ,
        2.19599553],
       [1.75493   , 6.18659563, 0.20257745, ..., 0.20185469, 0.20422155,
        0.20000032]])

In [8]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '1000',
 '103',
 '108',
 '11',
 '117th',
 '11th',
 '12',
 '120',
 '126b',
 '13',
 '13th',
 '14',
 '140',
 '1413',
 '1428',
 '144',
 '14th',
 '15',
 '150',
 '1543',
 '1607',
 '1642',
 '1644',
 '16_th_',
 '17',
 '1730',
 '1750',
 '1757',
 '1760',
 '1767',
 '1784',
 '1785',
 '1787',
 '1789',
 '1791',
 '17th',
 '18',
 '1800',
 '1806',
 '1812',
 '1820',
 '1846',
 '1847',
 '1848',
 '1855',
 '1856',
 '1857',
 '1858',
 '1859',
 '1863',
 '1869',
 '1871',
 '1875',
 '1878',
 '1882',
 '1883',
 '1884',
 '1890',
 '1891',
 '1892',
 '1893',
 '1894',
 '1897',
 '18th',
 '19th',
 '1st',
 '20',
 '200',
 '21st',
 '22',
 '221_b_',
 '221b',
 '226',
 '22nd',
 '23rd',
 '247',
 '24th',
 '25',
 '263',
 '26_s_',
 '27',
 '2704',
 '27th',
 '28th',
 '29',
 '30',
 '300',
 '34',
 '341',
 '35',
 '36',
 '3rd',
 '40',
 '45',
 '46',
 '47',
 '4700',
 '4_d_',
 '4th',
 '4½',
 '50',
 '500',
 '534',
 '55',
 '577',
 '5th',
 '60',
 '750',
 '76',
 '77b',
 '7th',
 '82',
 '83',
 '84',
 '85',
 '87',
 '89',
 '

## 텍스트 전처리

In [9]:
test_x.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [10]:
text = test_x[['text']]
text.head()

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


### word_tokenize 진행

In [11]:
text['text'] = text.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text
0,"[“, Not, at, all, ., I, think, she, is, one, o..."
1,"[``, No, ,, '', replied, he, ,, with, sudden, ..."
2,"[As, the, lady, had, stated, her, intention, o..."
3,"[“, And, then, suddenly, in, the, silence, I, ..."
4,"[His, conviction, remained, unchanged, ., So, ..."


### 불용어 처리

In [12]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['text'] = text['text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text
0,"[“, Not, ., I, think, one, charming, young, la..."
1,"[``, No, ,, '', replied, ,, sudden, consciousn..."
2,"[As, lady, stated, intention, screaming, ,, co..."
3,"[“, And, suddenly, silence, I, heard, sound, s..."
4,"[His, conviction, remained, unchanged, ., So, ..."


### 표제어 추출로 3인칭 단수 표현을 1인칭으로 바꾸고, 과거 현재형 동사를 현재형으로 바꿈

In [13]:
from nltk.stem import WordNetLemmatizer
text['text'] = text['text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text
0,"[“, Not, ., I, think, one, charm, young, ladie..."
1,"[``, No, ,, '', reply, ,, sudden, consciousnes..."
2,"[As, lady, state, intention, scream, ,, course..."
3,"[“, And, suddenly, silence, I, hear, sound, se..."
4,"[His, conviction, remain, unchanged, ., So, fa..."


### 길이가 3이하인 단어에 대해서 제거

In [14]:
tokenized_doc = text['text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0    [think, charm, young, ladies, ever, meet, migh...
1    [reply, sudden, consciousness, find, ignorant,...
2    [lady, state, intention, scream, course, would...
3    [suddenly, silence, hear, sound, send, heart, ...
4    [conviction, remain, unchanged, know, believe,...
Name: text, dtype: object

In [15]:
# 문서-단어 행렬 만들기
# 어휘(vocabulary) 학습
dictionary = corpora.Dictionary(tokenized_doc)
# 문서-단어 행렬(document-term matrix) 생성
corpus = [dictionary.doc2bow(text) for text in tokenized_doc] 

In [16]:
print(dictionary)

Dictionary(34776 unique tokens: ['Agra', 'bias', 'charm', 'cold', 'decide']...)


In [17]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[0][:5]

[(0, 0.26043168824057805),
 (1, 0.25416882561971893),
 (2, 0.167638194900941),
 (3, 0.1395015977136251),
 (4, 0.1563446487190722)]

In [18]:
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=5, id2word=dictionary)

In [19]:
model.show_topic(3, 10)

[('motive', 0.0013802442),
 ('Please', 0.001242784),
 ('Uriah', 0.0011339529),
 ('aboard', 0.0010800624),
 ('indicate', 0.0010660827),
 ('impatiently', 0.0010268603),
 ('Finally', 0.0009947383),
 ('actual', 0.0009931885),
 ('vessel', 0.00097762),
 ('Bill', 0.0009770888)]

In [20]:
# 토픽 개수, 키워드 개수를 정해주는 변수를 추가.
NUM_TOPICS = 5

NUM_TOPIC_WORDS = 200


def build_doc_term_mat(documents):
    # 문서-단어 행렬 만들어주는 함수.
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
        
    return corpus, dictionary


def print_topic_words(model):

    # 토픽 모델링 결과를 출력해 주는 함수.
    print("\nPrinting topic words.\n")
    
    for topic_id in range(model.num_topics):
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID: {}".format(topic_id))
        
        for topic_word, prob in topic_word_probs:
            print("\t{}\t{}".format(topic_word, prob))
            
        print("\n")

# document-term matrix를 만들고,
corpus, dictionary = build_doc_term_mat(tokenized_doc)
# LDA를 실행.
model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha="auto", eta="auto")
# 결과를 출력.
print_topic_words(model)

Building document-term matrix.

Printing topic words.

Topic ID: 0
	odin	0.04417572543025017
	know	0.012275299988687038
	would	0.011287634260952473
	come	0.009938683360815048
	make	0.00877386424690485
	must	0.00683877058327198
	tell	0.006779077462852001
	give	0.006038909312337637
	could	0.006016036029905081
	much	0.00571388378739357
	think	0.005710894241929054
	good	0.005630532745271921
	time	0.005585775710642338
	take	0.005273733288049698
	little	0.004628575406968594
	never	0.004495830275118351
	might	0.004267455544322729
	shall	0.0041183121502399445
	even	0.0038711214438080788
	like	0.003641051473096013
	mean	0.0035805313382297754
	well	0.0034875015262514353
	leave	0.0034695512149482965
	That	0.0032037701457738876
	money	0.003161228494718671
	live	0.003155570710077882
	upon	0.0031448870431631804
	nothing	0.0031346366740763187
	dear	0.0031223732512444258
	love	0.003038523020222783
	matter	0.0030250074341893196
	What	0.0029830678831785917
	write	0.0028805031906813383
	life	0.0028318769

In [21]:
# pyLDAvis 불러오기
import pyLDAvis
import pyLDAvis.gensim

# pyLDAvis를 jupyter notebook에서 실행할 수 있게 활성화.
pyLDAvis.enable_notebook()

# pyLDAvis 실행.
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
pyLDAvis.save_html(data, 'lda_test.html')

  and should_run_async(code)


## TF-IDF 행렬 만들기

In [23]:
# 역토큰화
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
text['text'] = detokenized_doc
# 다시 text['text'] 에 저장
text.head()

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text
0,think charm young ladies ever meet might usefu...
1,reply sudden consciousness find ignorant goodn...
2,lady state intention scream course would screa...
3,suddenly silence hear sound send heart mouth c...
4,conviction remain unchanged know believe hones...


### TfidfVectorizer를 통해 단어 1,000개에 대한 TF-IDF 행렬 만들기

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(text['text'])

  and should_run_async(code)


In [25]:
X.shape

  and should_run_async(code)


(19617, 1000)

## 토픽 모델링(LDA)

In [26]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=13, max_iter=1)
lda_top = lda_model.fit_transform(X)

  and should_run_async(code)


In [27]:
print(lda_model.components_)

[[10.56292863  0.3093624   0.20242165 ... 21.90279393  0.20296501
   0.21164251]
 [ 2.71313822  0.20944058  0.20265985 ... 33.19978801  0.47370812
  15.23370154]
 [50.90547192  0.20833368  0.20994325 ... 56.18200339  0.20328001
   8.412938  ]
 [29.01865329 38.22780529 23.08938056 ... 95.02342192 26.9411248
  20.07547467]
 [ 0.20981106  0.20565253  2.492095   ... 22.10171455  0.2136256
   4.61062002]]


  and should_run_async(code)


In [28]:
lda_model.components_.shape

  and should_run_async(code)


(5, 1000)

### 단어 집합, 1,000개의 단어가 저장되어있음.

In [29]:
terms = vectorizer.get_feature_names()

  and should_run_async(code)


In [30]:
len(terms), terms

  and should_run_async(code)


(1000,
 ['able',
  'abroad',
  'absence',
  'absolutely',
  'accept',
  'accompany',
  'account',
  'acquaintance',
  'action',
  'actually',
  'address',
  'admiration',
  'admire',
  'admit',
  'advance',
  'advantage',
  'advice',
  'affair',
  'affairs',
  'affect',
  'affection',
  'afraid',
  'afternoon',
  'agree',
  'agreeable',
  'alarm',
  'alive',
  'allow',
  'aloud',
  'altogether',
  'amuse',
  'anger',
  'angry',
  'anne',
  'announce',
  'answer',
  'anxiety',
  'anxious',
  'anybody',
  'appear',
  'appearance',
  'approach',
  'arrange',
  'arrival',
  'arrive',
  'article',
  'ashamed',
  'aside',
  'asleep',
  'assure',
  'astonish',
  'attachment',
  'attack',
  'attempt',
  'attend',
  'attention',
  'aunt',
  'avoid',
  'awake',
  'aware',
  'away',
  'ball',
  'bank',
  'bare',
  'bath',
  'bear',
  'beat',
  'beautiful',
  'beauty',
  'begin',
  'behaviour',
  'believe',
  'bell',
  'belong',
  'bend',
  'best',
  'better',
  'bind',
  'bird',
  'bite',
  'blac

In [31]:
def get_authors(components, feature_names, n=5):
    for idx, author in enumerate(components):
        print("Author %d :" % (idx+1), [(feature_names[i], author[i].round(2)) for i in author.argsort()[:-n -1:-1]])

  and should_run_async(code)


In [32]:
get_authors(lda_model.components_, terms)

Author 1 : [('odin', 216.49), ('come', 102.67), ('hand', 98.3), ('look', 87.36), ('face', 81.82)]
Author 2 : [('odin', 145.44), ('touch', 87.47), ('know', 81.39), ('like', 79.88), ('hand', 77.82)]
Author 3 : [('odin', 656.87), ('know', 310.8), ('think', 257.63), ('make', 207.06), ('tell', 182.5)]
Author 4 : [('odin', 451.33), ('know', 145.56), ('time', 140.88), ('think', 138.19), ('come', 128.32)]
Author 5 : [('odin', 253.07), ('door', 212.93), ('room', 183.41), ('look', 168.77), ('come', 155.33)]


  and should_run_async(code)


## Max_features 제한 없이

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text['text'])

  and should_run_async(code)


In [34]:
X.shape

  and should_run_async(code)


(19617, 26020)

In [35]:
terms = vectorizer.get_feature_names()
len(terms), terms

  and should_run_async(code)


(26020,
 ['000',
  '10',
  '1000',
  '108',
  '11',
  '117th',
  '11th',
  '126b',
  '13th',
  '14',
  '1413',
  '1428',
  '14th',
  '15',
  '1543',
  '1607',
  '1642',
  '1644',
  '16_th_',
  '17',
  '1730',
  '1750',
  '1757',
  '1760',
  '1767',
  '1784',
  '1785',
  '1787',
  '1789',
  '1791',
  '17th',
  '1800',
  '1806',
  '1812',
  '1820',
  '1846',
  '1847',
  '1848',
  '1855',
  '1856',
  '1857',
  '1858',
  '1859',
  '1863',
  '1869',
  '1871',
  '1875',
  '1878',
  '1882',
  '1883',
  '1884',
  '1890',
  '1891',
  '1892',
  '1893',
  '1894',
  '1897',
  '18th',
  '19th',
  '20',
  '200',
  '21st',
  '221_b_',
  '221b',
  '22nd',
  '23rd',
  '24th',
  '26_s_',
  '2704',
  '27th',
  '28th',
  '30',
  '300',
  '35',
  '36',
  '40',
  '4700',
  '4_d_',
  '50',
  '500',
  '577',
  '750',
  '89',
  '_absolutely_',
  '_accepted_',
  '_accoucheur_',
  '_accoucheuses_',
  '_activité',
  '_addition',
  '_addition_',
  '_affaire',
  '_afraid_',
  '_after',
  '_after_',
  '_against',
  

In [36]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=13, max_iter=1)
lda_top = lda_model.fit_transform(X)

  and should_run_async(code)


In [37]:
print(lda_model.components_)

[[0.20390786 0.20624183 0.63886675 ... 0.2008523  0.20017253 0.20027221]
 [0.20057637 0.20609255 0.20460835 ... 0.2010781  0.21138658 0.20023362]
 [0.20089627 0.20700593 0.20171327 ... 1.21979976 0.20016407 0.20024277]
 [0.25315428 0.35582565 0.38050543 ... 0.20402712 0.20011362 0.21605243]
 [1.76226759 1.2533156  0.28591674 ... 0.2111821  0.21891934 0.20993191]]


  and should_run_async(code)


In [38]:
lda_model.components_.shape

  and should_run_async(code)


(5, 26020)

In [39]:
terms = vectorizer.get_feature_names()

  and should_run_async(code)


In [40]:
def get_authors(components, feature_names, n=10):
    for idx, author in enumerate(components):
        print("Author %d :" % (idx+1), [(feature_names[i], author[i].round(2)) for i in author.argsort()[:-n -1:-1]])

  and should_run_async(code)


In [41]:
get_authors(lda_model.components_, terms)

Author 1 : [('member', 7.19), ('cheat', 6.14), ('burger', 5.11), ('hast', 4.95), ('horace', 4.85), ('distort', 4.72), ('didst', 4.44), ('jerry', 4.13), ('halloa', 4.02), ('verandah', 3.83)]
Author 2 : [('virtues', 9.21), ('patients', 6.79), ('mourn', 6.62), ('hatch', 6.6), ('pudding', 5.9), ('spoon', 5.3), ('bridegroom', 4.81), ('loch', 4.4), ('heights', 4.01), ('plat', 3.85)]
Author 3 : [('guilty', 15.62), ('blunder', 10.23), ('lads', 9.81), ('martha', 7.15), ('falter', 6.34), ('hilt', 6.32), ('rascals', 5.31), ('troth', 5.27), ('villon', 5.08), ('robe', 4.96)]
Author 4 : [('witness', 29.28), ('affair', 28.65), ('coach', 26.87), ('lucy', 25.01), ('amaze', 24.46), ('professor', 22.33), ('sympathy', 22.19), ('relations', 21.73), ('report', 20.88), ('cart', 20.5)]
Author 5 : [('odin', 1134.72), ('know', 428.55), ('come', 405.29), ('think', 386.36), ('make', 341.36), ('look', 312.82), ('time', 312.81), ('like', 265.77), ('hand', 258.54), ('tell', 256.74)]


  and should_run_async(code)
