In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk

## 데이터 불러오기

In [24]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

In [4]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [5]:
y.unique()

array([3, 2, 1, 4, 0])

In [7]:
count_vect = CountVectorizer(stop_words='english')
feat_vect = count_vect.fit_transform(X)

#### LDA 객체 생성 후 Count 피처 벡터화 객체로 LDA수행

In [8]:
lda = LatentDirichletAllocation(n_components=5, random_state=13)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=13, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
print(lda.components_.shape)
lda.components_

(5, 34416)


array([[3.66522832, 0.22017879, 0.23746337, ..., 1.18686389, 0.200005  ,
        0.20000261],
       [0.24296878, 0.20964094, 0.20343087, ..., 0.21446174, 0.20857241,
        0.20001067],
       [0.2015402 , 0.20043959, 0.20045399, ..., 0.20186388, 0.2576835 ,
        1.1957053 ],
       [2.6868477 , 7.16653139, 0.2018153 , ..., 0.20000465, 0.20046145,
        0.20058243],
       [0.203415  , 0.20320929, 4.15683647, ..., 2.19680583, 1.13327764,
        0.20369898]])

In [11]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '1000',
 '109',
 '10_s_',
 '11',
 '114th',
 '117',
 '12',
 '120',
 '126b',
 '127',
 '129',
 '12_s_',
 '12th',
 '13',
 '13th',
 '14',
 '140',
 '1429',
 '1456',
 '146m',
 '14th',
 '15',
 '150',
 '15_th',
 '15th',
 '15º',
 '16',
 '1647',
 '1676',
 '16a',
 '16th',
 '17',
 '171',
 '1715',
 '1733',
 '1742',
 '1745',
 '1748',
 '1749',
 '1750',
 '1751',
 '1756',
 '1757',
 '1764',
 '1767',
 '1772',
 '1792',
 '17__',
 '17_th_',
 '18',
 '1803',
 '1810',
 '1812',
 '1814',
 '1820',
 '1826',
 '1830',
 '1840',
 '1855',
 '1856',
 '1859',
 '1860',
 '1861',
 '1862',
 '1865',
 '1869',
 '1870',
 '1874',
 '1875',
 '1876',
 '1878',
 '1882',
 '1883',
 '1884',
 '1887',
 '1888',
 '1890',
 '1891',
 '1894',
 '1895',
 '1898',
 '18th',
 '19',
 '1908',
 '1914',
 '19o',
 '1_s_',
 '1st',
 '20',
 '200',
 '21',
 '21st',
 '22',
 '220',
 '221b',
 '22nd',
 '23',
 '23l',
 '23rd',
 '24',
 '247',
 '2473',
 '249',
 '25',
 '250',
 '25º',
 '26',
 '26th',
 '27',
 '270',
 '2704',
 '28',
 '28th',
 '29',
 '2

## 텍스트 전처리

In [25]:
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [26]:
text = train[['text']]
text.head()

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."


### word_tokenize 진행

In [27]:
text['text'] = text.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text
0,"[He, was, almost, choking, ., There, was, so, ..."
1,"[“, Your, sister, asked, for, it, ,, I, suppos..."
2,"[She, was, engaged, one, day, as, she, walked,..."
3,"[The, captain, was, in, the, porch, ,, keeping..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


### 불용어 처리

In [28]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['text'] = text['text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text
0,"[He, almost, choking, ., There, much, ,, much,..."
1,"[“, Your, sister, asked, ,, I, suppose, ?, ”]"
2,"[She, engaged, one, day, walked, ,, perusing, ..."
3,"[The, captain, porch, ,, keeping, carefully, w..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


### 표제어 추출로 3인칭 단수 표현을 1인칭으로 바꾸고, 과거 현재형 동사를 현재형으로 바꿈

In [35]:
from nltk.stem import WordNetLemmatizer
text['text'] = text['text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text
0,"[He, almost, choke, ., There, much, ,, much, w..."
1,"[“, Your, sister, ask, ,, I, suppose, ?, ”]"
2,"[She, engage, one, day, walk, ,, peruse, Jane,..."
3,"[The, captain, porch, ,, keep, carefully, way,..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, fli..."


### 길이가 3이하인 단어에 대해서 제거

In [37]:
tokenized_doc = text['text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0    [almost, choke, There, much, much, want, stran...
1                              [Your, sister, suppose]
2    [engage, walk, peruse, Jane, last, letter, dwe...
3    [captain, porch, keep, carefully, treacherous,...
4    [Have, mercy, gentlemen, odin, fling, hand, wr...
Name: text, dtype: object

## TF-IDF 행렬 만들기

In [38]:
# 역토큰화
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
text['text'] = detokenized_doc
# 다시 text['text'] 에 저장
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text
0,almost choke There much much want strange excl...
1,Your sister suppose
2,engage walk peruse Jane last letter dwell pass...
3,captain porch keep carefully treacherous shoot...
4,Have mercy gentlemen odin fling hand write any...


### TfidfVectorizer를 통해 단어 1,000개에 대한 TF-IDF 행렬 만들기

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(text['text'])

In [63]:
X.shape

(54879, 1000)

## 토픽 모델링(LDA)

In [64]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=13, max_iter=1)
lda_top = lda_model.fit_transform(X)

In [65]:
print(lda_model.components_)

[[  0.20375636   0.20392265   0.20428166 ...   1.14980745 170.83048358
    0.20754686]
 [118.84436233  40.6666545    0.20861991 ... 128.76185146   0.20951396
    0.21306149]
 [  0.2041357    0.20410066  77.53765712 ...   1.7497157   92.49617708
    0.20769866]
 [  7.46251627   0.20222935   0.20548765 ...   0.20296145 118.04304008
   47.27492699]
 [  0.2918131    0.2042045    0.20445965 ...   0.23437028  51.78250206
    0.20397706]]


In [66]:
lda_model.components_.shape

(5, 1000)

### 단어 집합, 1,000개의 단어가 저장되어있음.

In [67]:
terms = vectorizer.get_feature_names()

In [70]:
len(terms), terms

(1000,
 ['able',
  'abroad',
  'absolutely',
  'accept',
  'accompany',
  'account',
  'acquaintance',
  'action',
  'actually',
  'address',
  'admire',
  'admit',
  'advance',
  'advantage',
  'adventure',
  'advice',
  'affair',
  'affairs',
  'affect',
  'affection',
  'afraid',
  'afternoon',
  'agree',
  'agreeable',
  'alarm',
  'alive',
  'allow',
  'aloud',
  'altogether',
  'amazement',
  'amuse',
  'anger',
  'angry',
  'anne',
  'announce',
  'answer',
  'anxiety',
  'anxious',
  'anybody',
  'apparently',
  'appear',
  'appearance',
  'approach',
  'arrange',
  'arrest',
  'arrival',
  'arrive',
  'article',
  'ashamed',
  'aside',
  'asleep',
  'assure',
  'astonishment',
  'attack',
  'attempt',
  'attend',
  'attention',
  'aunt',
  'avoid',
  'aware',
  'away',
  'ball',
  'bank',
  'bath',
  'bear',
  'beat',
  'beautiful',
  'beauty',
  'begin',
  'behaviour',
  'behold',
  'believe',
  'bell',
  'belong',
  'bend',
  'best',
  'better',
  'bind',
  'bird',
  'bite',

In [47]:
def get_authors(components, feature_names, n=5):
    for idx, author in enumerate(components):
        print("Author %d :" % (idx+1), [(feature_names[i], author[i].round(2)) for i in author.argsort()[:-n -1:-1]])

In [48]:
get_authors(lda_model.components_, terms)

Author 1 : [('answer', 714.33), ('odin', 488.61), ('father', 426.84), ('miss', 292.31), ('poor', 263.17)]
Author 2 : [('odin', 1137.78), ('good', 547.16), ('make', 536.15), ('time', 477.73), ('mean', 476.11)]
Author 3 : [('odin', 746.35), ('look', 466.62), ('laugh', 402.7), ('shall', 371.47), ('face', 355.08)]
Author 4 : [('odin', 686.84), ('reply', 443.68), ('hand', 429.69), ('voice', 380.91), ('head', 355.9)]
Author 5 : [('odin', 2241.94), ('know', 689.76), ('think', 682.12), ('come', 562.1), ('right', 462.01)]


## Max_features 제한 없이

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text['text'])

In [50]:
X.shape

(54879, 28005)

In [61]:
terms = vectorizer.get_feature_names()
len(terms), terms

(28005,
 ['000',
  '10',
  '100',
  '1000',
  '10_s_',
  '11',
  '114th',
  '126b',
  '12_s_',
  '12th',
  '13th',
  '14',
  '1429',
  '1456',
  '146m',
  '14th',
  '15',
  '15_th',
  '15th',
  '1647',
  '1676',
  '16th',
  '1715',
  '1733',
  '1742',
  '1745',
  '1748',
  '1749',
  '1750',
  '1751',
  '1756',
  '1757',
  '1764',
  '1767',
  '1772',
  '1792',
  '17__',
  '17_th_',
  '1803',
  '1810',
  '1812',
  '1814',
  '1820',
  '1826',
  '1830',
  '1840',
  '1855',
  '1856',
  '1859',
  '1860',
  '1861',
  '1862',
  '1865',
  '1869',
  '1870',
  '1874',
  '1875',
  '1876',
  '1878',
  '1882',
  '1883',
  '1884',
  '1887',
  '1888',
  '1890',
  '1891',
  '1894',
  '1895',
  '1898',
  '18th',
  '1908',
  '1914',
  '1_s_',
  '20',
  '200',
  '21st',
  '221b',
  '22nd',
  '23rd',
  '2473',
  '249',
  '26th',
  '2704',
  '28th',
  '29th',
  '2_s_',
  '2d',
  '30',
  '303',
  '34th',
  '35',
  '37',
  '40',
  '4000',
  '421',
  '470',
  '50',
  '500',
  '52',
  '6_d_',
  '6th',
  '7000l'

In [51]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=13, max_iter=1)
lda_top = lda_model.fit_transform(X)

In [52]:
print(lda_model.components_)

[[0.22398007 0.23889574 0.84701472 ... 0.20055412 0.20050273 0.20000566]
 [0.20174918 0.20200475 0.26753109 ... 0.20123267 0.20059365 0.20000062]
 [3.23702477 0.65242872 0.34478596 ... 0.34498558 0.20066732 0.2000005 ]
 [0.20160278 0.20292731 0.49657982 ... 0.20218075 0.20071678 0.20002565]
 [0.32059614 0.43018788 0.31937211 ... 0.2003253  0.20000534 0.20000059]]


In [53]:
lda_model.components_.shape

(5, 28005)

In [54]:
terms = vectorizer.get_feature_names()

In [57]:
def get_authors(components, feature_names, n=10):
    for idx, author in enumerate(components):
        print("Author %d :" % (idx+1), [(feature_names[i], author[i].round(2)) for i in author.argsort()[:-n -1:-1]])

In [58]:
get_authors(lda_model.components_, terms)

Author 1 : [('odin', 3219.23), ('know', 1003.03), ('come', 1000.3), ('think', 883.42), ('look', 771.44), ('make', 658.49), ('like', 617.74), ('time', 582.27), ('hand', 571.71), ('good', 561.44)]
Author 2 : [('odin', 175.11), ('mean', 139.87), ('address', 118.3), ('ivan', 111.26), ('cross', 105.86), ('paper', 100.44), ('surely', 96.82), ('certain', 92.19), ('angry', 91.26), ('assure', 89.74)]
Author 3 : [('exclaim', 115.01), ('dora', 100.46), ('tell', 78.63), ('pray', 66.61), ('knight', 58.42), ('pleasant', 57.08), ('squire', 53.57), ('demand', 53.06), ('hush', 49.24), ('amazement', 48.81)]
Author 4 : [('inquire', 159.2), ('horse', 96.9), ('guess', 79.6), ('shoot', 69.23), ('bend', 63.21), ('black', 56.3), ('professor', 54.76), ('pardon', 53.58), ('ride', 49.7), ('bird', 48.85)]
Author 5 : [('pretty', 128.54), ('truth', 115.12), ('kill', 96.74), ('shout', 96.63), ('water', 85.37), ('pull', 76.1), ('clerk', 67.57), ('small', 66.22), ('just', 65.83), ('odin', 62.34)]
