# task145

## 데이터 불러오기

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

# Relative paths of the tab-separated training and test files used by the cells below.
train_path = '../data/.train/.task145/data/train.tsv'
test_path = '../data/.train/.task145/data/test.tsv'

In [3]:
# Load the training TSV into a DataFrame and preview its first five rows.
train_df = pd.read_csv(train_path, delimiter='\t')
train_df.head(5)

Unnamed: 0,comment,tag
0,안경 써도 멋있네요 ㅎㅎ,1
1,1117공병단도..,0
2,근데 국립묘지? 뭐냐? 이게?,2
3,내가 지금 현역이였으면 웃고있을거같은데 ㅋㅋ 야외훈련 안하자나 그리고 밖에도 거의 ...,0
4,저거 아직도 손으로 하고있네. 자주도하장비좀 도입해서 인력난 줄이고 기계로 좀 대체하자,2


In [4]:
def read_documents(filename):
    """Read a tab-separated text file and return its data rows.

    The first line is treated as the column header and is dropped. Every
    remaining line is split on tab characters.

    Records are delimited by real newlines only. ``str.splitlines()`` also
    breaks on characters such as vertical tab, form feed, U+0085, U+2028 and
    U+2029, which can occur inside free-form comment text and would cut one
    record into several ragged rows, so the file object is iterated instead.

    Args:
        filename: Path of the UTF-8 encoded TSV file.

    Returns:
        A list holding one list of string fields per data line (header
        excluded). An empty file yields an empty list.
    """
    # On Windows the encoding has to be given explicitly.
    with open(filename, encoding="utf-8") as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        documents = [line.rstrip('\n').split('\t') for line in f]

    return documents

# Parse the test TSV with the custom reader and wrap the parsed rows in a DataFrame.
test = read_documents(test_path)
test_df = pd.DataFrame(test)
# Label the two parsed fields so the frame uses the same column names as train_df.
test_df.columns = ["comment", "tag"]
test_df.head()

Unnamed: 0,comment,tag
0,쪼수미 생각나,0
1,K9대대에서 화학병으로 들어갔는데 잡부일만함,0
2,이거 중국이 만들었다. 갑자기 생길일은 없잖아,0
3,최고의 여전사입니다 충성!!,0
4,김민석 7사단인뎅 어디야 저기,0


In [5]:
# Count how many training rows carry each tag value.
train_df['tag'].value_counts()

0    7100
1    2847
2    1373
Name: tag, dtype: int64

## 데이터 전처리

In [6]:
import konlpy 
from konlpy.tag import Mecab, Kkma, Okt, Komoran
import json
import os
import re
from pprint import pprint

def text_cleaning(doc):
    """Return ``doc`` with every character that is not Hangul or a space removed.

    Characters that survive are the consonant jamo (ㄱ-ㅎ), the vowel jamo
    (ㅏ-ㅣ), complete syllables (가-힣) and the plain space. Everything else,
    such as digits, Latin letters and punctuation, is deleted.
    """
    non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
    return non_korean.sub("", doc)

- train data 전처리

In [7]:
# Clean every training comment in a single pass and reassign the whole column.
# The previous per-row write `train_df['comment'][i] = text` is chained
# assignment: pandas may apply it to a temporary copy (SettingWithCopyWarning,
# which the warnings filter at the top of the notebook hides) and it does not
# update the frame at all when Copy-on-Write is enabled. It also relied on the
# index being exactly 0..n-1.
train_df['comment'] = train_df['comment'].map(text_cleaning)

## 모델링

- BoW(Bag of Words) 기반 모델 학습을 위해 학습/검증 데이터 분리

In [8]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Separate the label column from the inputs, then hold out 20% of the rows for
# validation. The fixed random_state keeps the split reproducible across runs.
y_target = train_df['tag']
X_features = train_df.drop(columns='tag')
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=156,
)

- 형태소 분석기 Mecab 사용
- 텍스트에서 품사 정보를 부착하여 반환하는 pos로 텍스트 토큰화

In [9]:
import konlpy 
from konlpy.tag import Mecab, Kkma, Okt, Komoran
import json
import os
import re
from pprint import pprint

# Load the morphological analyzer
mecab = Mecab()

class MyTokenizer:
    """Callable that turns a sentence into ``"word/TAG"`` tokens.

    The wrapped ``tagger`` only needs a ``pos(sentence)`` method returning
    ``(word, tag)`` pairs. Instances can be passed wherever a tokenizer
    callable is expected.
    """

    def __init__(self, tagger):
        # Keep the tagger so every call reuses the same analyzer object.
        self.tagger = tagger

    def __call__(self, sent):
        """Tag ``sent`` and return one ``word/tag`` string per tagged unit."""
        tokens = []
        for morpheme, pos_tag in self.tagger.pos(sent):
            tokens.append('{}/{}'.format(morpheme, pos_tag))
        return tokens

# Reuse the Mecab instance created earlier in this cell instead of constructing
# a second analyzer; `mecab` was otherwise never used.
my_tokenizer = MyTokenizer(mecab)

- Pipeline을 이용하여 TfidfVectorizer와 Support Vector Machine을 결합하여 모델 생성

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# TF-IDF features over Mecab tokens (unigrams and bigrams, max_df=0.95) feeding
# a linear-kernel SVM. The step names are kept unchanged even though the steps
# are a TF-IDF vectorizer and an SVC.
pipeline = Pipeline(steps=[
    ('cnt_vect', TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1,2), max_df=0.95)),
    ('lr_clf', SVC(kernel='linear')),
])

# Train on the training split and predict the held-out split.
pipeline.fit(X_train['comment'], y_train)
pred = pipeline.predict(X_test['comment'])

# Macro-averaged F1, precision and recall, in the order the message expects.
pipeline_scores = [
    metric(y_test, pred, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]
print('예측 F1_score: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(*pipeline_scores))

예측 F1_score: 0.7981, 정밀도: 0.8373, 재현율: 0.7709


## 하이퍼 파라미터 조정

### GridSearchCV 이용하여 최적의 모델과 파라미터 값으로 예측 진행

In [11]:
# Show the hyperparameter names SVC exposes, as a reference for the search grid.
SVC().get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [14]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. Despite the variable name, the kernel is
# chosen by the grid (both 'rbf' and 'linear' candidates are tried).
SVC_linear = SVC(random_state=0)

# Fit the TF-IDF vocabulary on the training comments, then vectorize them.
# NOTE(review): the vectorizer is fitted once on all of X_train before the
# cross-validation below, so every fold shares the same vocabulary/IDF weights.
tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1,2), max_df=0.95).fit(X_train['comment'])
tfidf_matrix_train = tfidf_vect.transform(X_train['comment'])

# Use GridSearchCV to tune the hyperparameters C, gamma and kernel.
param_range = [0.01, 0.1, 1, 10]
rbf_grid = {'C': param_range, 'gamma': param_range, 'kernel': ['rbf']}
linear_grid = {'C': param_range, 'kernel': ['linear']}
params = [rbf_grid, linear_grid]

# 3-fold search scored by macro F1; prints the winning setting and its CV score.
grid_cv = GridSearchCV(SVC_linear, param_grid=params, cv=3, scoring='f1_macro', verbose=1)
grid_cv.fit(tfidf_matrix_train, y_train)
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 11.1min finished


{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'} 0.7492


In [16]:
# Vectorize the held-out comments with the vectorizer fitted on the training split.
tfidf_matrix_test = tfidf_vect.transform(X_test['comment'])

# Use the classifier that GridSearchCV trained with the best parameters as-is.
best_estimator = grid_cv.best_estimator_
preds = best_estimator.predict(tfidf_matrix_test)

# Macro-averaged F1, precision and recall, in the order the message expects.
holdout_scores = [
    metric(y_test, preds, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]
print('예측 F1_score: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(*holdout_scores))

예측 F1_score: 0.8045, 정밀도: 0.8266, 재현율: 0.7868


- 모델 save

In [23]:
# Save the model
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('./model', exist_ok=True)
# NOTE(review): only the tuned classifier is persisted here. The fitted
# `tfidf_vect` is still required to turn raw comments into its input features.
joblib.dump(best_estimator, './model/model_145.pkl')

# Reload the artifact from disk; the test-set prediction cell uses this object.
grid_model = joblib.load('./model/model_145.pkl')

## Test data 전처리 & 예측

In [18]:
# Apply the same cleaning function used on the training comments.
# The whole column is reassigned because the previous per-row write
# `test_df['comment'][i] = text` is chained assignment: pandas may apply it to
# a temporary copy, and it does not update the frame when Copy-on-Write is on.
test_df['comment'] = test_df['comment'].map(text_cleaning)

# Vectorize with the TF-IDF vectorizer that was fitted on the training split.
test_final = tfidf_vect.transform(test_df['comment'])

# Predict with the reloaded model and preview the first 30 predictions.
preds_test = grid_model.predict(test_final)
preds_test[:30]

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 2,
       1, 1, 0, 0, 1, 1, 1, 0])

In [19]:
# Replace the `tag` column parsed from the test file with the model's predictions,
# then keep just that column as the Series to submit.
test_df['tag']= preds_test
submission = test_df['tag']

In [20]:
# Preview the first predicted tags before they are written to disk.
submission.head()

0    0
1    0
2    0
3    1
4    0
Name: tag, dtype: int64

## 결과 제출

In [21]:
import os

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs('./prediction', exist_ok=True)
# Write one predicted tag per line, without the index and without a header row.
submission.to_csv('./prediction/prediction_145.csv', index=False, header=False, sep="\t")

In [24]:
import zipfile

# Bundle the notebook and the pickled model into one deflate-compressed archive.
# The notebook is written first, then the model file is added to the archive.
with zipfile.ZipFile('./model/model_145.zip', mode='w') as archive:
    for member in ('./육군_최종.ipynb', './model/model_145.pkl'):
        archive.write(member, compress_type=zipfile.ZIP_DEFLATED)

FileNotFoundError: [Errno 2] No such file or directory: './model_145.pkl'

In [None]:
from nipa.taskSubmit import nipa_submit

# Submission settings: team and task identifiers plus the two artifact paths.
team_id = "1345"
task_no = "145"
result_path = './prediction/prediction_145.csv'
model_path = './model/model_145.pkl'

# Pass the prediction file and the saved model to nipa_submit.
# NOTE(review): nipa_submit comes from the external `nipa` package; what it does
# with these arguments is not visible in this notebook.
submit_kwargs = {
    "team_id": team_id,
    "task_no": task_no,
    "result": result_path,
    "model": model_path,
}
nipa_submit(**submit_kwargs)