In [2]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [5]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

### train data
- 실제 작가와 작가가 작성한 Text가 라벨링된 데이터프레임

In [6]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [7]:
train.author.unique(), train.author.value_counts()

(array([3, 2, 1, 4, 0]),
 3    15063
 0    13235
 2    11554
 4     7805
 1     7222
 Name: author, dtype: int64)

### text_x
- train data를 학습시켜 어떤 작가가 작성했는지 분석해야할 데이터프레임

In [8]:
test_x

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


### submission
- 제출 형태
- 가로 index : 작가 명

In [9]:
submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


## 전처리

### train_test_split

In [16]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [59]:
#####

In [61]:
text = train[['text']]
text.head()

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."


In [62]:
text['text'] = text.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
text.head()

Unnamed: 0,text
0,"[He, was, almost, choking, ., There, was, so, ..."
1,"[“, Your, sister, asked, for, it, ,, I, suppos..."
2,"[She, was, engaged, one, day, as, she, walked,..."
3,"[The, captain, was, in, the, porch, ,, keeping..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


In [63]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['text'] = text['text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

Unnamed: 0,text
0,"[He, almost, choking, ., There, much, ,, much,..."
1,"[“, Your, sister, asked, ,, I, suppose, ?, ”]"
2,"[She, engaged, one, day, walked, ,, perusing, ..."
3,"[The, captain, porch, ,, keeping, carefully, w..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, flu..."


In [64]:
from nltk.stem import WordNetLemmatizer
text['text'] = text['text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()

Unnamed: 0,text
0,"[He, almost, choke, ., There, much, ,, much, w..."
1,"[“, Your, sister, ask, ,, I, suppose, ?, ”]"
2,"[She, engage, one, day, walk, ,, peruse, Jane,..."
3,"[The, captain, porch, ,, keep, carefully, way,..."
4,"[“, Have, mercy, ,, gentlemen, !, ”, odin, fli..."


In [65]:
tokenized_doc = text['text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0    [almost, choke, There, much, much, want, stran...
1                              [Your, sister, suppose]
2    [engage, walk, peruse, Jane, last, letter, dwe...
3    [captain, porch, keep, carefully, treacherous,...
4    [Have, mercy, gentlemen, odin, fling, hand, wr...
Name: text, dtype: object

In [60]:
#####

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

### 문장 토큰화

In [46]:
from nltk.tokenize import sent_tokenize

print(sent_tokenize(X_train[0]))

['He was almost choking.', 'There was so much, so much he wanted to say, but strange exclamations were all that came from his lips.', 'The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.']


### 벡터라이즈

#### countvectorizer

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 feature extraction 변환 수행. 
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [59]:
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
print('X_train의 CountVectorizer Shape:', X_train_cnt_vect.shape, X_test_cnt_vect.shape)

X_train의 CountVectorizer Shape: (43903, 32420) (10976, 32420)


In [60]:
print(cnt_vect.vocabulary_)



In [61]:
len(cnt_vect.vocabulary_)

32420

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# LogisticRegression을 이용하여 학습/예측/평가 수행.
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression의 예측 정확도는 0.729


##### stop_words, 'english' 추가

In [65]:
cnt_vect = CountVectorizer(max_features=2000, stop_words='english')
cnt_vect.fit(X_train)

X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression의 예측 정확도는 0.641


In [21]:
# n gram 확인 -- 노트북 사양으로 확인 불가

In [22]:
cnt_vect = CountVectorizer(ngram_range=(2,2))
cnt_vect.fit(X_train)
ftr_vect = cnt_vect.transform(X_train)
print(type(ftr_vect), ftr_vect.shape)
print(cnt_vect.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### tfidfvectorizer

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# LogisticRegression을 이용하여 학습/예측/평가 수행.
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Logistic Regression의 예측 정확도는 0.739


In [31]:
tfidf_X = tfidf_vect.transform(X_train)
tfidf_X

<54879x34416 sparse matrix of type '<class 'numpy.float64'>'
	with 857645 stored elements in Compressed Sparse Row format>

In [32]:
print(type(tfidf_X), tfidf_X.shape)

<class 'scipy.sparse.csr.csr_matrix'> (54879, 34416)


In [69]:
tfidf_vect.vocabulary_

{'it': 15840,
 'was': 31425,
 'odin': 19788,
 'said': 24747,
 'one': 19946,
 'for': 11833,
 'mr': 18905,
 'he': 13722,
 'stepping': 27318,
 'into': 15619,
 'the': 28691,
 'room': 24465,
 'and': 1815,
 'handing': 13488,
 'my': 19046,
 'friend': 12168,
 'letter': 16895,
 'soon': 26659,
 'after': 1393,
 'this': 28806,
 'miss': 18522,
 'came': 4771,
 'in': 14867,
 'could': 7047,
 'not': 19512,
 'help': 13865,
 'being': 3304,
 'diverted': 9067,
 'by': 4650,
 'perplexity': 21098,
 'of': 19849,
 'her': 13896,
 'first': 11479,
 'answer': 1914,
 'to': 29081,
 'herself': 13941,
 'resulting': 24044,
 'she': 25640,
 'supposed': 27984,
 'from': 12210,
 'doubt': 9258,
 'what': 31663,
 'might': 18314,
 'be': 3109,
 'impatience': 14710,
 'say': 24956,
 'every': 10529,
 'thing': 28775,
 'well': 31609,
 'then': 28715,
 'only': 19955,
 'meant': 18041,
 'that': 28684,
 'your': 32349,
 'attributing': 2578,
 'brother': 4390,
 'wish': 31971,
 'dancing': 7695,
 'with': 31991,
 'good': 12890,
 'nature': 19161,

##### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [70]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.723


In [71]:
tfidf_vect.vocabulary_

{'odin': 322620,
 'said': 402919,
 'odin said': 327308,
 'mr': 308155,
 'stepping': 451794,
 'room': 397741,
 'handing': 204775,
 'friend': 180162,
 'letter': 263639,
 'mr odin': 308334,
 'said stepping': 404872,
 'stepping room': 451816,
 'room handing': 398091,
 'handing friend': 204784,
 'friend letter': 180484,
 'soon': 439812,
 'miss': 301121,
 'came': 60729,
 'help': 215477,
 'diverted': 126267,
 'perplexity': 347048,
 'answer': 18333,
 'resulting': 391898,
 'supposed': 464307,
 'doubt': 130252,
 'impatience': 230090,
 'say': 408677,
 'thing': 476270,
 'soon miss': 440152,
 'miss odin': 301211,
 'odin came': 323418,
 'came odin': 61333,
 'odin help': 325247,
 'help diverted': 215568,
 'diverted perplexity': 126274,
 'perplexity answer': 347049,
 'answer resulting': 18591,
 'resulting supposed': 391902,
 'supposed doubt': 464345,
 'doubt said': 130589,
 'said impatience': 403942,
 'impatience say': 230146,
 'say thing': 409726,
 'meant': 293688,
 'attributing': 29872,
 'brother': 

##### min-df 조정

In [76]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=10)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.442


In [82]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=20000)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.722


##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [73]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  4.1min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  4.3min finished


Logistic Regression best C parameter: {'C': 10}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.740


In [18]:
# 전체 말뭉치

all_words = set(word.lower() for sentence in X for word in word_tokenize(sentence))
all_words

{'westwhom',
 'abstracted',
 'numbered',
 'shunned',
 'canted',
 'venetians',
 'perverted',
 'impetuosity',
 'agitatedvery',
 'creatureyou',
 'presses',
 'otherthats',
 'jurymenfour',
 'shoving',
 'bun',
 'thunder',
 'customer',
 'affectionate',
 'yesratheri',
 'thepie',
 'ishes',
 'perceptive',
 'brothers',
 'quarrels',
 'selfsatisfaction',
 'model',
 'crystallisation',
 'holdenand',
 'attorney',
 'broughtup',
 'wasfor',
 'unkind',
 'alpinestock',
 'suharev',
 'marker',
 'greasewhich',
 'simpson',
 'realitya',
 'armchairs',
 'goodhearted',
 'badge',
 'vantageground',
 'girlwhat',
 'bounty',
 'wholehearted',
 'deputed',
 'potkins',
 'hereit',
 'creepers',
 'poora',
 'trailing',
 'vodincourt',
 'brewerylane',
 'bluchers',
 'illwillers',
 'concealmentodin',
 'uncontradicted',
 'objectit',
 'wonderments',
 'aloudmarthas',
 'detail',
 'boymy',
 'redemption',
 'has',
 'wordmiss',
 'flymore',
 'manure',
 'whipcorrected',
 'ee',
 'handkerchiefodins',
 'dromedary',
 'youthyouth',
 'illtaste',


In [None]:
# 나이브베이즈를 활용한 유사 문서 검색

In [39]:
pd.Series(y).value_counts()

3    15063
0    13235
2    11554
4     7805
1     7222
dtype: int64

In [61]:
target = np.array(y)
target[:10]

array([3, 2, 1, 4, 3, 4, 3, 2, 0, 4])

In [34]:
# CountVectorize

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(54879, 34719)

In [35]:
# 몇 개 안되는 1의 갯수

import numpy as np

np.sum(X_train_counts.toarray()[0])

46

In [36]:
# Tfidf 적용

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(54879, 34719)

In [38]:
# Multinomial Naive Bayes 적용

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, y)

In [55]:
test_x.text

0        not at all i think she is one of the most char...
1        no replied he with sudden consciousness not to...
2        as the lady had stated her intention of scream...
3        and then suddenly in the silence i heard a sou...
4        his conviction remained unchanged so far as i ...
                               ...                        
19612    at the end of another day or two odin growing ...
19613    all afternoon we sat together mostly in silenc...
19614     odin having carried his thanks to odin procee...
19615    soon after this upon odins leaving the room ma...
19616    and all the worse for the doomed man that the ...
Name: text, Length: 19617, dtype: object

In [56]:
test_x.text[0]

'not at all i think she is one of the most charming young ladies i ever met and might have been most useful in such work as we have been doing she had a decided genius that way witness the way in which she preserved that agra plan from all the other papers of her father but love is an emotional thing and whatever is emotional is opposed to that true cold reason which i place above all things i should never marry myself lest i bias my judgment'

In [40]:
# 간편 테스트

docs_new = [test_x.text[0], test_x.text[1]]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, author in zip(docs_new, predicted):
    print('%r => %s' % (doc, y))

'“Not at all. I think she is one of the most charming young ladies I ever met, and might have been most useful in such work as we have been doing. She had a decided genius that way: witness the way in which she preserved that Agra plan from all the other papers of her father. But love is an emotional thing, and whatever is emotional is opposed to that true cold reason which I place above all things. I should never marry myself, lest I bias my judgment.”' => ['3', '2', '1', '4', '3', '4', '3', '2', '0', '4', '2', '4', '0', '1', '4', '2', '1', '3', '4', '0', '0', '3', '2', '0', '1', '0', '4', '1', '2', '2', '3', '0', '1', '2', '2', '4', '0', '4', '0', '3', '2', '2', '2', '0', '2', '0', '4', '3', '4', '4', '2', '2', '0', '1', '0', '4', '0', '3', '1', '2', '3', '4', '3', '0', '0', '3', '0', '4', '3', '0', '0', '4', '1', '2', '4', '4', '3', '3', '3', '0', '0', '0', '3', '4', '4', '4', '4', '3', '0', '3', '2', '0', '4', '1', '1', '0', '2', '4', '3', '0', '4', '2', '3', '2', '2', '3', '1', '0

## Question

### 머신러닝 알고리즘을 사용하여서 소설 문장들의 특징을 도출하고 학습시켜 소설의 저자를 맞출 수 있을 지

### 아니면, Test data의 author를 0~5까지 랜덤으로 설정하고 학습한 모델을 Test Data에 적용시켜 얼마나 맞출 수 있는지 성능 파악으로 가도 될지