# NSMC 감성분석 data set
- data load 및 generation
- 간단한 EDA 와 ml 모델 돌리기
- Feature Engineering 시에는 다양한 노트북 참고 예정  
- Updated 1014


In [5]:
cd /content/drive/My Drive/AI_assignment/hackathon_1

/content/drive/My Drive/AI_assignment/hackathon_1


In [6]:
# ready
import pandas as pd
import numpy as np

import sys
np.set_printoptions(threshold=sys.maxsize)
import os
import tqdm
import warnings
warnings.filterwarnings(action='ignore')
import pickle
import joblib

from IPython.display import display
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

import re

# visualization
from matplotlib import pyplot as plt
plt.style.use('seaborn')
import seaborn as sns
%matplotlib inline

In [7]:
# Font
import matplotlib.font_manager as fm
fm._rebuild()
plt.rc('font', family='NanumGothic')
plt.rc('axes', unicode_minus=False)

In [25]:
# kras
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# sklearn
from sklearn.model_selection import train_test_split,KFold
#from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [48]:
from gensim.models import Word2Vec

### 1) Load Data
데이터 형식  
ID \t document \t label (0:neg / 1:pos) 로 공지 

In [9]:
# token
train_token = joblib.load(os.path.join('data','train_moong.pickle'))
train_label = joblib.load(os.path.join('data','train_label_moong.pickle'))

test_token = joblib.load(os.path.join('data','test_moong.pickle'))
test_label = joblib.load(os.path.join('data','test_label_moong.pickle'))

# tfidf Feature
tfidf_vocab = joblib.load(os.path.join('data','vocab_list.pickle'))


In [15]:
train_token[:2], train_label[:2], len(train_token), len(test_token)

([['아', '더빙', '진짜', '짜증나네요', '목소리'],
  ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '가볍지', '않구나']],
 0    0
 1    1
 Name: label, dtype: int64,
 146182,
 49157)

In [17]:
tfidf_vocab[:2], len(tfidf_vocab)

(['만들어', '구성'], 494)

train 데이터 갯수 : 146182  
test 데이터 갯수 : 49157  
tfidf 단어 갯수 : 494


### 2) Feature Engineering


* tfidf feature 494

In [52]:
# tfidf voca 들로 피쳐 생성해주는 함수
def make_tfidf_mat(word_token_list, vocab, num_vocab):
    # Countervector
    count_vector = CountVectorizer()
    count_vector.fit(vocab)     # 단어사전에 fitting
    
    tfidf_mat = np.zeros([len(word_token_list),num_vocab])
    
    for i,t in enumerate(word_token_list):
        t_vector = count_vector.transform([''.join(t)]).toarray()
        tfidf_mat[i] = t_vector

    return tfidf_mat

In [54]:
train_tfidf_mat = make_tfidf_mat(train_token, tfidf_vocab, 494)
train_tfidf_mat.shape

(146182, 494)

In [55]:
test_tfidf_mat = make_tfidf_mat(test_token, tfidf_vocab, 494)
test_tfidf_mat.shape

(49157, 494)

* word2vec 160  
* def 짜야함

In [49]:
# Word2Vec embedding # skip-gram
emb_model = Word2Vec(train_token, size=160, window = 3, min_count=1, iter=100, sg=1)

# save
emb_model.save('word2vec_train_146182.model')

# # load model
# emb_model = joblib.load(os.path.join('word2vec_train_1014.model'))

In [57]:
# word dict
word_table = {
    word : vec for word, vec in zip(
        emb_model.wv.index2word,
        np.array(emb_model.wv.syn0)
    )
}

emb_mat = np.zeros((len(train_token),160))

for i,morphs in enumerate(train_token):
    vector = np.array([word_table[morph] for morph in morphs])
    final_vector = np.mean(vector,axis=0)
    emb_mat[i] = final_vector

In [58]:
emb_mat.shape

(146182, 160)

* Save Features

In [60]:
# tfidf mat
with open('train_tfidf_mat_146182.pickle', 'wb') as f:
    pickle.dump(tfidf_mat, f, pickle.HIGHEST_PROTOCOL)

# embedding mat
with open('train_emb_mat_146182.pickle', 'wb') as f:
    pickle.dump(emb_mat, f, pickle.HIGHEST_PROTOCOL)

* Data Generation

In [62]:
final_mat = np.concatenate((tfidf_mat, emb_mat), axis = 1)
final_mat.shape, np.isnan(final_mat).sum()

((146182, 654), 26240)

In [69]:
# setting
X = np.nan_to_num(final_mat.copy(), copy=False)
X_emb = np.nan_to_num(emb_mat.copy(), copy=False)
X_tfidf = np.nan_to_num(tfidf_mat.copy(), copy=False)

y = train_label

### 3) Modeling

In [64]:
# ml model
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
nb = GaussianNB()
svm = SVC()

In [65]:
def model_eval(model,X,y,save_model_name):
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state=42)
    
    clf = model.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    print(model,' accuracy : ', acc)

    # save model
    with open('train_' + save_model_name + '.pickle', 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
    
    return acc

In [70]:
# NB
print(model_eval(nb,X,y,'nb'))

print(model_eval(nb,X_emb,y,'nb'))

print(model_eval(nb,X_tfidf,y,'nb'))

GaussianNB(priors=None, var_smoothing=1e-09)  accuracy :  0.5024056549994299
0.5024056549994299
GaussianNB(priors=None, var_smoothing=1e-09)  accuracy :  0.8129517728879261
0.8129517728879261
GaussianNB(priors=None, var_smoothing=1e-09)  accuracy :  0.5024056549994299
0.5024056549994299


tfidf 성능 구림

In [72]:
# dt
%time print(model_eval(dt,X,y,'dt'))

print(model_eval(dt,X_emb,y,'dt'))

print(model_eval(dt,X_tfidf,y,'dt'))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')  accuracy :  0.6820430965682363
0.6820430965682363
CPU times: user 34.7 s, sys: 13 ms, total: 34.7 s
Wall time: 34.8 s
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')  accuracy :  0.680492532208

In [None]:
# knn
%time print(model_eval(knn,X,y,'knn'))

print(model_eval(knn,X_emb,y,'knn'))

print(model_eval(knn,X_tfidf,y,'knn'))

In [None]:
# svm
%time print(model_eval(svm,X,y,'svm'))

print(model_eval(svm,X_emb,y,'svm'))

print(model_eval(svm,X_tfidf,y,'svm'))