In [31]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from pathlib import Path
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
data_dir = Path('./open/')
feature_dir = Path('./open/build/feature')
val_dir = Path('./open/build/val')
tst_dir = Path('./open/build/tst')
sub_dir = Path('./open/build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 13

In [33]:
algo_name = 'lr'
feature_name = 'tfidf'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

## 데이터 확인

In [34]:
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of n...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, when, instead of being ...",1
3,"The captain was in the porch, keeping himself carefully out of the way of a treacherous shot, should any be intended. He turned and spoke to us, “Doctor's watch on the lookout....",4
4,"“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are ...",3


In [36]:
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,"“Not at all. I think she is one of the most charming young ladies I ever met, and might have been most useful in such work as we have been doing. She had a decided genius that ..."
1,"""No,"" replied he, with sudden consciousness, ""not to find it in YOU; for I cannot be ignorant that to you, to your goodness, I owe it all.--I feel it--I would express it if I c..."
2,"As the lady had stated her intention of screaming, of course she would have screamed at this additional boldness, but that the exertion was rendered unnecessary by a hasty knoc..."
3,“And then suddenly in the silence I heard a sound which sent my heart into my mouth. It was the clank of the levers and the swish of the leaking cylinder. He had set the engine...
4,"His conviction remained unchanged. So far as I know--and I believe his honest heart was transparent to me--he never wavered again, in his solemn certainty of finding her. His p..."


## 전처리

### train_test_split

In [79]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [80]:
X_test = test_x['text']

In [40]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

### TfidVectorize

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer(stop_words='english')
X = tfidf_vect.fit_transform(X)
X_test= tfidf_vect.transform(X_test)

In [42]:
print(X.shape, X_test.shape)

(54879, 34416) (19617, 34416)


In [43]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [44]:
y.shape

(54879,)

In [62]:
p = np.zeros((X.shape[0], 5))
p_tst = np.zeros((X_test.shape[0], 5))
for i_skf, (i_trn, i_val) in enumerate(skf.split(X, y), 1):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_test) / 5

[[0.01453383 0.00588608 0.03504109 0.93767076 0.00686824]
 [0.17910892 0.05286396 0.0950695  0.0412203  0.63173733]
 [0.07899712 0.02780522 0.66859321 0.02433983 0.20026463]
 ...
 [0.11644562 0.7340927  0.0478671  0.08425566 0.01733892]
 [0.78325583 0.04755676 0.07998399 0.03431026 0.05489316]
 [0.12162436 0.02929668 0.09726988 0.65344853 0.09836056]]
[[0.0445561  0.94625383 0.00277813 0.0041451  0.00226684]
 [0.18208263 0.02745707 0.61048466 0.0473237  0.13265193]
 [0.04336957 0.01471658 0.65210972 0.02658831 0.26321581]
 ...
 [0.95926553 0.01702643 0.01000604 0.00253261 0.01116939]
 [0.53006456 0.03945843 0.10317189 0.00864817 0.31865695]
 [0.13631606 0.13579214 0.49476031 0.18022384 0.05290766]]
[[0.5130261  0.27950854 0.08733437 0.07533561 0.04479538]
 [0.79122453 0.00558567 0.10184073 0.00963565 0.09171342]
 [0.14587545 0.05550277 0.16416063 0.13408739 0.50037376]
 ...
 [0.06524851 0.01983069 0.4154881  0.44470877 0.05472393]
 [0.04006037 0.16992438 0.28429882 0.42025332 0.0854631

In [61]:
for i_skf, (i_trn, i_val) in enumerate(skf.split(X, y), 1):
    print('i_skf',i_skf)
    print('i_trn',i_trn)
    print('i_val',i_val)

i_skf 1
i_trn [    1     2     3 ... 54875 54876 54878]
i_val [    0    18    22 ... 54867 54873 54877]
i_skf 2
i_trn [    0     1     3 ... 54875 54876 54877]
i_val [    2     5     7 ... 54869 54872 54878]
i_skf 3
i_trn [    0     2     3 ... 54876 54877 54878]
i_val [    1     8    11 ... 54848 54858 54864]
i_skf 4
i_trn [    0     1     2 ... 54875 54877 54878]
i_val [    3     4     9 ... 54870 54874 54876]
i_skf 5
i_trn [    0     1     2 ... 54876 54877 54878]
i_val [    6    20    21 ... 54865 54871 54875]


In [54]:
np.zeros((X.shape[0], 5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [56]:
np.zeros((X_test.shape[0], 5)).shape

(19617, 5)

In [46]:
print(f'Accuracy (SKF): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')

Accuracy (SKF):  72.6981%


In [50]:
sub = pd.read_csv('./open/sample_submission.csv', index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [51]:
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.07223,0.405286,0.200331,0.267803,0.05435
1,0.265409,0.160464,0.074029,0.165691,0.334406
2,0.765312,0.050508,0.067845,0.038897,0.077438
3,0.149604,0.010156,0.556698,0.078937,0.204606
4,0.321536,0.146772,0.142566,0.234817,0.154309


In [97]:
sub.round(4)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0722,0.4053,0.2003,0.2678,0.0543
1,0.2654,0.1605,0.0740,0.1657,0.3344
2,0.7653,0.0505,0.0678,0.0389,0.0774
3,0.1496,0.0102,0.5567,0.0789,0.2046
4,0.3215,0.1468,0.1426,0.2348,0.1543
...,...,...,...,...,...
19612,0.0240,0.9687,0.0016,0.0043,0.0015
19613,0.3094,0.0378,0.1624,0.0488,0.4415
19614,0.0441,0.9045,0.0098,0.0347,0.0069
19615,0.0455,0.8722,0.0244,0.0513,0.0066


### MultinomialNB 

In [104]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']
X_test = test_x['text']

In [105]:
tfidf_vect = TfidfVectorizer(stop_words='english')
X = tfidf_vect.fit_transform(X)
X_test= tfidf_vect.transform(X_test)

In [106]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [107]:
p = np.zeros((X.shape[0], 5))
p_tst = np.zeros((X_test.shape[0], 5))
for i_skf, (i_trn, i_val) in enumerate(skf.split(X, y), 1):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_test) / 5

In [108]:
print(f'Accuracy (SKF): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')

Accuracy (SKF):  72.6981%


In [114]:
result = pd.DataFrame(p_tst, columns=['author 0', 'author 1', 'author 2', 'author 3', 'author 4'])

In [112]:
result.to_csv('prediction.csv', index=False)

In [149]:
(result.round(2) * 100).astype('int').astype('str') + '%'

Unnamed: 0,author 0,author 1,author 2,author 3,author 4
0,7%,41%,20%,27%,5%
1,27%,16%,7%,17%,33%
2,77%,5%,7%,4%,8%
3,15%,1%,56%,8%,20%
4,32%,15%,14%,23%,15%
...,...,...,...,...,...
19612,2%,97%,0%,0%,0%
19613,31%,4%,16%,5%,44%
19614,4%,90%,1%,3%,1%
19615,5%,87%,2%,5%,1%
