In [1]:
# 필요한 모듈 import
import codecs
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from konlpy.tag import Mecab
mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")  # window에서 Mecab을 사용하기 위해 설정 후 사용합니다.



In [2]:
# 데이터 읽어오기
def Data_Read(path):
    with codecs.open(path, 'r', 'utf-8') as f:
        docs = f.read()
    bs = BeautifulSoup(docs, "lxml")
    docsList = bs.find_all('c')
    
    # 데이터 
    docs = []
    for doc in docsList:
        cited = doc.get('citedarticle')
        citing = doc.get('citingarticle')
        clue = doc.get('clue')
        sentiment = doc.get('sentiment')
        sentiments = [str(l.text) for l in doc.find_all('s')] # l.text로 인해 target tag 역시 사라집니다.
        docs.append([cited, citing, sentiment, clue, ' '.join(sentiments)])
    return docs

In [3]:
docs = Data_Read('Citation_Data.txt')

In [4]:
# Clue, Sentiments, Sentiment로 데이터 재정리
pnn = {'POS':2, 'NEG':1, 'NEU':0, 'NEU,NEU':0}
X = np.array([l[4] for l in docs])
Y = np.array([pnn[l[2]] for l in docs])
clue = np.array([l[3] for l in docs])

In [5]:
# 데이터를 4/5는 train set, 1/5는 test set으로 이용합니다.(비율을 맞춰 나눕니다.)
skf = StratifiedKFold(Y, n_folds=5, shuffle=True)
nb = []
svm = []
lg = []
knn = []
dt = []
pnlist = ['SY', 'SSO', 'SC'] # 제거할 형태소
for train, test in skf:
    trainX, trainY, trainClue = X[train], Y[train], clue[train]
    testX, testY, testClue = X[test], Y[test], clue[test]
    
    '''
    형태소 분석기는 Mecab을 이용합니다.
    Train Clue를 단어 Feature로 사용하기 위하여 형태소 분석을하여 단어 사전을 구축합니다.
    이 부분에서 추출한 feature가 학습시 이용됩니다.
    '''
    word_dict = []
    clue_dict = []
    if True: # 형태소를 feature로 사용합니다.
        for doc in trainX:
            word_dict += [i[0]+'/'+i[1] for i in mecab.pos(doc.replace(' ', '')) if not i[1] in pnlist]
        word_dict = set(word_dict)
        word_dict = {w:i for i, w in enumerate(word_dict)}
    elif False: # clue를 형태소 분석하여 feature로 사용합니다.
        for c in trainClue:
            if c:
                word_dict += [i[0]+'/'+i[1] for i in mecab.pos(c.replace(' ', '')) if not i[1] in pnlist]
        word_dict = set(word_dict)
        word_dict = {w:i for i, w in enumerate(word_dict)}
            
    '''
    clue 형태소 분석 clue_dict 만들기
    '''
    if True:
        for c in trainClue:
            if c:
                clue_dict += [i[0]+'/'+i[1] for i in mecab.pos(c.replace(' ', '')) if not i[1] in pnlist]
        clue_dict = set(clue_dict)
        clue_dict = {w:i for i, w in enumerate(word_dict)}
    
    
    # 데이터를 넣을 공간을 만듭니다.
    train_X = np.zeros((len(trainY), len(word_dict)), dtype=np.int32)
    test_X = np.zeros((len(testY), len(word_dict)), dtype=np.int32)
    
    # 데이터들의 문장을 띄어쓰기를 제거하고 형태소 분석하고, 기타 기호 SY 형태소를 제거하였음
    # 띄어쓰기를 제거한 이유는 다음과 같습니다.
    # 문장을 읽어올때 잘못된 띄어쓰기 문제가 발생하였기 때문에 띄어쓰기를 제거하여 형태소 분석을 합니다. 
    # Train Set
    for i, doc in enumerate(trainX):
        doc = [i[0]+'/'+i[1] for i in mecab.pos(doc.replace(' ', '')) if not i[1] in pnlist]
        for w in doc:
            try:
                train_X[word_dict[w], i] += 1
                if w in clue_dict:
                    train_X[word_dict[w], i] += 1
            except:
                pass

    # Test Set
    for i, doc in enumerate(testX):
        doc = [i[0]+'/'+i[1] for i in mecab.pos(doc.replace(' ', '')) if not i[1] in pnlist]
        for w in doc:
            try:
                test_X[word_dict[w], i] += 1
            except:
                pass
            
    # NaiveBayes
    clf = GaussianNB()
    clf.fit(train_X, trainY)
    nb.append(classification_report(testY, clf.predict(test_X), target_names=['NEU', 'NEG', 'POS']).split())
    
    # SVM Classification
    clf = LinearSVC()
    clf.fit(train_X, trainY)
    svm.append(np.array(classification_report(testY, clf.predict(test_X), target_names=['NEU', 'NEG', 'POS']).split()))
    
    # LogisticRegression
    clf = LogisticRegression()
    clf.fit(train_X, trainY)
    lg.append(np.array(classification_report(testY, clf.predict(test_X), target_names=['NEU', 'NEG', 'POS']).split()))
    
    # KNN = 5
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(train_X, trainY)
    knn.append(np.array(classification_report(testY, clf.predict(test_X), target_names=['NEU', 'NEG', 'POS']).split()))
    
    # dt
    clf = DecisionTreeClassifier()
    clf.fit(train_X, trainY)
    dt.append(np.array(classification_report(testY, clf.predict(test_X), target_names=['NEU', 'NEG', 'POS']).split()))
    

def mean(np_array):
    arr = []
    for i in range(np_array.shape[1]):
        arr.append(np.mean(np_array[:,i]))
    return np.round(np.array(arr), 2)
    
def data_report(array):
    NEU = np.array(array)[:, 5:][:,:3].astype(dtype=np.float32)
    NEG = np.array(array)[:, 5:][:,5:8].astype(dtype=np.float32)
    POS = np.array(array)[:, 5:][:,10:13].astype(dtype=np.float32)
    avg_total = np.array(array)[:,5:][:,17:20].astype(dtype=np.float32)
    print("%24s%12s%12s" %("precision", "recall", "f1-score"))
    a, b, c = mean(NEU)
    print("%12s        %0.2f        %0.2f        %0.2f" %("NEU", a, b, c))
    a, b, c = mean(NEG)
    print("%12s        %0.2f        %0.2f        %0.2f" %("NEG", a, b, c))
    a, b, c = mean(POS)
    print("%12s        %0.2f        %0.2f        %0.2f" %("POS", a, b, c))
    a, b, c = mean(avg_total)
    print("%12s        %0.2f        %0.2f        %0.2f" %("avg / total", a, b, c))

print("Linear SVM")
data_report(svm)

print("\nNaive Bayes")
data_report(nb)

print("\nLogistic Regression")
data_report(lg)

print("\nKNN Classification")
data_report(knn)

print("\nDecisionTree")
data_report(dt)

  'precision', 'predicted', average, warn_for)


Linear SVM
               precision      recall    f1-score
         NEU        0.71        0.92        0.80
         NEG        0.03        0.01        0.02
         POS        0.13        0.04        0.06
 avg / total        0.53        0.66        0.58

Naive Bayes
               precision      recall    f1-score
         NEU        0.79        0.16        0.26
         NEG        0.11        0.73        0.19
         POS        0.17        0.14        0.15
 avg / total        0.61        0.22        0.23

Logistic Regression
               precision      recall    f1-score
         NEU        0.71        0.96        0.82
         NEG        0.06        0.03        0.04
         POS        0.17        0.02        0.04
 avg / total        0.55        0.70        0.60

KNN Classification
               precision      recall    f1-score
         NEU        0.72        0.82        0.74
         NEG        0.05        0.13        0.05
         POS        0.28        0.05        0.08
 avg