In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

### train data
- 실제 작가와 작가가 작성한 Text가 라벨링된 데이터프레임

In [3]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [4]:
train.author.unique(), train.author.value_counts()

(array([3, 2, 1, 4, 0]),
 3    15063
 0    13235
 2    11554
 4     7805
 1     7222
 Name: author, dtype: int64)

### text_x
- train data를 학습시켜 어떤 작가가 작성했는지 분석해야할 데이터프레임

In [5]:
test_x

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


### submission
- 제출 형태
- 가로 index : 작가 명

In [6]:
submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


## 전처리

### train_test_split

In [22]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [31]:
X

0        He was almost choking. There was so much, so m...
1                   “Your sister asked for it, I suppose?”
2         She was engaged one day as she walked, in per...
3        The captain was in the porch, keeping himself ...
4        “Have mercy, gentlemen!” odin flung up his han...
                               ...                        
54874    “Is that you, Mr. Smith?” odin whispered. “I h...
54875    I told my plan to the captain, and between us ...
54876     "Your sincere well-wisher, friend, and sister...
54877              “Then you wanted me to lend you money?”
54878    It certainly had not occurred to me before, bu...
Name: text, Length: 54879, dtype: object

#### 정규표현식

In [25]:
# 길이가 1~2인 단어들을 정규 표현식을 이용하여 삭제
import re
text = "I was wondering if anyone out there could enlighten me on this car. There was so much, so much he wanted to say"
shortword = re.compile(r'\W*\b\w{1,3}\b')
print(shortword.sub('',text))

 wondering anyone there could enlighten this. There much much wanted


In [23]:
X[0]

'He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.'

In [26]:
print(shortword.sub('', X[0]))

 almost choking. There much much wanted strange exclamations were that came from lips Pole gazed fixedly bundle notes hand; looked odin evident perplexity.


In [33]:
re_X = []
for i in range(len(X)):
    t = shortword.sub('', X[i])
    re_X.append(t)

In [35]:
X = pd.Series(re_X)
X

0         almost choking. There much much wanted strang...
1                             “Your sister asked suppose?”
2         engaged walked perusing Jane last letter dwel...
3         captain porch, keeping himself carefully trea...
4        “Have mercy, gentlemen!” odin flung hands writ...
                               ...                        
54874     that. Smith?” odin whispered hardly dared hop...
54875     told plan captain between settled details acc...
54876     "Your sincere well-wisher, friend sister, "LU...
54877                            “Then wanted lend money?”
54878     certainly occurred before said should like that.
Length: 54879, dtype: object

In [8]:
#####

In [9]:
X = train[['text']]
X.head()

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."


In [10]:
X['text'] = X.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
X.head()

Unnamed: 0,text
0,"[He, was, almost, choking, ., There, was, so, ..."
1,"[“, Your, sister, asked, for, it, ,, I, suppos..."
2,"[She, was, engaged, one, day, as, she, walked,..."
3,"[The, captain, was, in, the, porch, ,, keeping..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


In [11]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
X['text'] = X['text'].apply(lambda x: [word for word in x if word not in (stop)])
X.head()

Unnamed: 0,text
0,"[He, almost, choking, ., There, much, ,, much,..."
1,"[“, Your, sister, asked, ,, I, suppose, ?, ”]"
2,"[She, engaged, one, day, walked, ,, perusing, ..."
3,"[The, captain, porch, ,, keeping, carefully, w..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


In [12]:
from nltk.stem import WordNetLemmatizer
X['text'] = X['text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
X.head()

Unnamed: 0,text
0,"[He, almost, choke, ., There, much, ,, much, w..."
1,"[“, Your, sister, ask, ,, I, suppose, ?, ”]"
2,"[She, engage, one, day, walk, ,, peruse, Jane,..."
3,"[The, captain, porch, ,, keep, carefully, way,..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, fli..."


In [13]:
tokenized_doc = X['text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0    [almost, choke, There, much, much, want, stran...
1                              [Your, sister, suppose]
2    [engage, walk, peruse, Jane, last, letter, dwe...
3    [captain, porch, keep, carefully, treacherous,...
4    [Have, mercy, gentlemen, odin, fling, hand, wr...
Name: text, dtype: object

In [14]:
# 역토큰화
detokenized_doc = []
for i in range(len(X)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
X['text'] = detokenized_doc
# 다시 X['text'] 에 저장
X.head()

Unnamed: 0,text
0,almost choke There much much want strange excl...
1,Your sister suppose
2,engage walk peruse Jane last letter dwell pass...
3,captain porch keep carefully treacherous shoot...
4,Have mercy gentlemen odin fling hand write any...


In [60]:
#####

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [39]:
X_train.shape, X_test.shape

((43903,), (10976,))

### 문장 토큰화

In [46]:
from nltk.tokenize import sent_tokenize

print(sent_tokenize(X_train[0]))

['He was almost choking.', 'There was so much, so much he wanted to say, but strange exclamations were all that came from his lips.', 'The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.']


### 벡터라이즈

#### countvectorizer

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 feature extraction 변환 수행. 
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [41]:
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
print('X_train의 CountVectorizer Shape:', X_train_cnt_vect.shape, X_test_cnt_vect.shape)

X_train의 CountVectorizer Shape: (43903, 31474) (10976, 31474)


In [42]:
print(cnt_vect.vocabulary_)



In [43]:
len(cnt_vect.vocabulary_)

31474

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# LogisticRegression을 이용하여 학습/예측/평가 수행.
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression의 예측 정확도는 0.707


##### stop_words, 'english' 추가

In [45]:
cnt_vect = CountVectorizer(max_features=2000, stop_words='english')
cnt_vect.fit(X_train)

X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression의 예측 정확도는 0.611


In [46]:
# n gram 확인 -- 노트북 사양으로 확인 불가

In [22]:
cnt_vect = CountVectorizer(ngram_range=(2,2))
cnt_vect.fit(X_train)
ftr_vect = cnt_vect.transform(X_train)
print(type(ftr_vect), ftr_vect.shape)
print(cnt_vect.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### tfidfvectorizer

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# LogisticRegression을 이용하여 학습/예측/평가 수행.
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Logistic Regression의 예측 정확도는 0.709


In [48]:
tfidf_X = tfidf_vect.transform(X_train)
tfidf_X

<43903x31474 sparse matrix of type '<class 'numpy.float64'>'
	with 887526 stored elements in Compressed Sparse Row format>

In [49]:
print(type(tfidf_X), tfidf_X.shape)

<class 'scipy.sparse.csr.csr_matrix'> (43903, 31474)


In [50]:
tfidf_vect.vocabulary_

{'odin': 19154,
 'said': 23992,
 'stepping': 26518,
 'into': 15147,
 'room': 23727,
 'handing': 13073,
 'friend': 11794,
 'letter': 16361,
 'soon': 25868,
 'after': 1244,
 'this': 27978,
 'miss': 17943,
 'came': 4524,
 'could': 6785,
 'help': 13442,
 'being': 3090,
 'diverted': 8767,
 'perplexity': 20418,
 'first': 11116,
 'answer': 1745,
 'herself': 13515,
 'resulting': 23318,
 'supposed': 27177,
 'from': 11835,
 'doubt': 8950,
 'what': 30752,
 'might': 17737,
 'impatience': 14244,
 'every': 10180,
 'thing': 27947,
 'well': 30702,
 'then': 27887,
 'only': 19310,
 'meant': 17469,
 'that': 27857,
 'your': 31409,
 'attributing': 2390,
 'brother': 4159,
 'wish': 31055,
 'dancing': 7414,
 'with': 31074,
 'good': 12487,
 'nature': 18560,
 'alone': 1474,
 'convinced': 6599,
 'superior': 27128,
 'yourself': 31413,
 'rest': 23288,
 'world': 31187,
 'gradually': 12577,
 'withdrew': 31081,
 'eyes': 10557,
 'turned': 28884,
 'them': 27883,
 'fire': 11092,
 'watching': 30553,
 'appeared': 1867,
 '

##### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [51]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.695


In [52]:
tfidf_vect.vocabulary_

{'odin': 299588,
 'said': 376552,
 'odin said': 304356,
 'stepping': 421120,
 'room': 371847,
 'handing': 191538,
 'friend': 168344,
 'letter': 246427,
 'said stepping': 378654,
 'stepping room': 421142,
 'room handing': 372203,
 'handing friend': 191546,
 'friend letter': 168665,
 'soon': 409254,
 'miss': 279474,
 'came': 55214,
 'help': 202097,
 'diverted': 117904,
 'perplexity': 322279,
 'answer': 16110,
 'resulting': 366081,
 'supposed': 433422,
 'doubt': 121211,
 'impatience': 215920,
 'thing': 445237,
 'soon miss': 409598,
 'miss odin': 279565,
 'odin came': 300386,
 'came odin': 55815,
 'odin help': 302261,
 'help diverted': 202195,
 'diverted perplexity': 117911,
 'perplexity answer': 322280,
 'answer resulting': 16366,
 'resulting supposed': 366085,
 'supposed doubt': 433461,
 'doubt said': 121548,
 'said impatience': 377653,
 'impatience thing': 215983,
 'meant': 273000,
 'attributing': 26821,
 'brother': 50159,
 'wish': 490953,
 'dancing': 99152,
 'good': 181000,
 'nature': 

##### min-df 조정

In [53]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=10)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.439


In [54]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=20000)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.699


##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [55]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   14.8s remaining:   13.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   15.7s finished


Logistic Regression best C parameter: {'C': 5}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.704


In [18]:
# 전체 말뭉치

all_words = set(word.lower() for sentence in X for word in word_tokenize(sentence))
all_words

{'westwhom',
 'abstracted',
 'numbered',
 'shunned',
 'canted',
 'venetians',
 'perverted',
 'impetuosity',
 'agitatedvery',
 'creatureyou',
 'presses',
 'otherthats',
 'jurymenfour',
 'shoving',
 'bun',
 'thunder',
 'customer',
 'affectionate',
 'yesratheri',
 'thepie',
 'ishes',
 'perceptive',
 'brothers',
 'quarrels',
 'selfsatisfaction',
 'model',
 'crystallisation',
 'holdenand',
 'attorney',
 'broughtup',
 'wasfor',
 'unkind',
 'alpinestock',
 'suharev',
 'marker',
 'greasewhich',
 'simpson',
 'realitya',
 'armchairs',
 'goodhearted',
 'badge',
 'vantageground',
 'girlwhat',
 'bounty',
 'wholehearted',
 'deputed',
 'potkins',
 'hereit',
 'creepers',
 'poora',
 'trailing',
 'vodincourt',
 'brewerylane',
 'bluchers',
 'illwillers',
 'concealmentodin',
 'uncontradicted',
 'objectit',
 'wonderments',
 'aloudmarthas',
 'detail',
 'boymy',
 'redemption',
 'has',
 'wordmiss',
 'flymore',
 'manure',
 'whipcorrected',
 'ee',
 'handkerchiefodins',
 'dromedary',
 'youthyouth',
 'illtaste',


In [None]:
# 나이브베이즈를 활용한 유사 문서 검색

In [39]:
pd.Series(y).value_counts()

3    15063
0    13235
2    11554
4     7805
1     7222
dtype: int64

In [61]:
target = np.array(y)
target[:10]

array([3, 2, 1, 4, 3, 4, 3, 2, 0, 4])

In [34]:
# CountVectorize

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(54879, 34719)

In [35]:
# 몇 개 안되는 1의 갯수

import numpy as np

np.sum(X_train_counts.toarray()[0])

46

In [36]:
# Tfidf 적용

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(54879, 34719)

In [38]:
# Multinomial Naive Bayes 적용

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, y)

In [55]:
test_x.text

0        not at all i think she is one of the most char...
1        no replied he with sudden consciousness not to...
2        as the lady had stated her intention of scream...
3        and then suddenly in the silence i heard a sou...
4        his conviction remained unchanged so far as i ...
                               ...                        
19612    at the end of another day or two odin growing ...
19613    all afternoon we sat together mostly in silenc...
19614     odin having carried his thanks to odin procee...
19615    soon after this upon odins leaving the room ma...
19616    and all the worse for the doomed man that the ...
Name: text, Length: 19617, dtype: object

In [56]:
test_x.text[0]

'not at all i think she is one of the most charming young ladies i ever met and might have been most useful in such work as we have been doing she had a decided genius that way witness the way in which she preserved that agra plan from all the other papers of her father but love is an emotional thing and whatever is emotional is opposed to that true cold reason which i place above all things i should never marry myself lest i bias my judgment'

In [40]:
# 간편 테스트

docs_new = [test_x.text[0], test_x.text[1]]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, author in zip(docs_new, predicted):
    print('%r => %s' % (doc, y))

'“Not at all. I think she is one of the most charming young ladies I ever met, and might have been most useful in such work as we have been doing. She had a decided genius that way: witness the way in which she preserved that Agra plan from all the other papers of her father. But love is an emotional thing, and whatever is emotional is opposed to that true cold reason which I place above all things. I should never marry myself, lest I bias my judgment.”' => ['3', '2', '1', '4', '3', '4', '3', '2', '0', '4', '2', '4', '0', '1', '4', '2', '1', '3', '4', '0', '0', '3', '2', '0', '1', '0', '4', '1', '2', '2', '3', '0', '1', '2', '2', '4', '0', '4', '0', '3', '2', '2', '2', '0', '2', '0', '4', '3', '4', '4', '2', '2', '0', '1', '0', '4', '0', '3', '1', '2', '3', '4', '3', '0', '0', '3', '0', '4', '3', '0', '0', '4', '1', '2', '4', '4', '3', '3', '3', '0', '0', '0', '3', '4', '4', '4', '4', '3', '0', '3', '2', '0', '4', '1', '1', '0', '2', '4', '3', '0', '4', '2', '3', '2', '2', '3', '1', '0

## Question

### 머신러닝 알고리즘을 사용하여서 소설 문장들의 특징을 도출하고 학습시켜 소설의 저자를 맞출 수 있을 지

### 아니면, Test data의 author를 0~5까지 랜덤으로 설정하고 학습한 모델을 Test Data에 적용시켜 얼마나 맞출 수 있는지 성능 파악으로 가도 될지