# 180408_feature_extractors

## 구현해볼 것들

- SentimentExtractor
- TfidfExtractor
- SentenceVectorizer
- ImportantWordsExtractor
- ScoreExpressionExtractor

In [4]:
from feature_extractors import ScoreExpressionExtractor

ImportError: cannot import name 'ScoreExpressionExtractor'

In [1]:
import os
import pickle
import sys
import warnings
import pandas as pd

class DummyFeatureExtractor:
    """Base dummy feature extractor: a stand-in that ignores its inputs."""

    def extract_feature(self, raw_text, tokenized_text):
        """Return a fixed placeholder value.

        :param raw_text: raw text
        :param tokenized_text: tokenized text; it should be space-separated
            or a list
        :return: the literal string ``"tuple"``; neither argument is inspected
        """
        placeholder = "tuple"
        return placeholder

In [2]:
# Read the review comments (one per line) into a single-column frame.
with open("../data/movie.txt") as f:
    data = pd.DataFrame([str(line.strip()) for line in f.readlines()],
                        columns=['comment'])

# Read the label lines and attach them as an integer 'score' column.
with open("../data/movie_label.txt") as f:
    score = f.readlines()
data['score'] = pd.DataFrame(score)
data['score'] = data['score'].map(lambda raw: int(raw.strip()))

In [13]:
# Use the canonical lowercase option key. The capitalised form only resolved
# through pandas' case-insensitive pattern matching of option names, which
# pandas warns can break as options are added.
pd.set_option("display.max_rows", 100)

In [7]:
# Preview the loaded frame: at most the first 10,000 rows.
data.iloc[:10000]

Unnamed: 0,comment,score
0,아련한 향수를 떠올리게 만드는 추억의 영화,8
1,SF 코미디 영화사에서 가장 주목해야할 영화,9
2,mv00036133의 꿈이 고스란히 담겨진 최고의 영화,8
3,전편을 120% 활용하는 천재적인 속편.,9
4,1편만큼 재미있다. 3편만 조금 떨어짐.,8
5,역시 전편을 능가하는 속편은 지극히 드물다,7
6,무슨 설명이 더 필요한가?,10
7,mv00501003라는 이름만으로도 설레게 만드는 영화의 힘,10
8,이때참 ac00000559가 멋져 보였지,9
9,시원하면서도 심장을 꽉 조여온다.,10


In [4]:
import translation

In [5]:
from translation import google

In [9]:
import re
# One- or two-digit number directly followed by "점", e.g. "9점" or "10점".
re_score = re.compile("[1-9]?[0-9]점")

In [10]:
# Quick check of re_score on a sample sentence containing two score mentions.
re_score.findall("10점 주기엔 에바여서 9점 줌")

['10점', '9점']

In [97]:
from feature_extractors import ScoreExpressionExtractor

ImportError: cannot import name 'ScoreExpressionExtractor'

In [None]:
class FastTextVectorizer:
    """A dictionary that maps a word to its FastText embedding.

    Besides the words learned by FastText, the vocabulary carries two extra
    entries, ``<UNK>`` and ``<PAD>``. They are appended after the learned
    words and are given all-zero vectors in the embedding matrix.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, tokenizer, config):
        """Set up the tagger and the size settings.

        :param tokenizer: accepted but not used; a ``Twitter`` tagger is
            always created instead
        :param config: object exposing ``vocabulary_size``
        """
        # NOTE(review): ``tokenizer`` is ignored. It is left that way because
        # ``_build_vocabulary`` calls ``.pos(review, norm=True)``, which an
        # arbitrary tokenizer may not provide -- confirm before wiring it in.
        self.tokenizer = Twitter()
        self.vocabulary_size = config.vocabulary_size
        self.embedding_size = 256
        self.fasttext = None

    def build_dictionary(self, data):
        """Build the vocabulary from ``data`` and load the embedding matrix.

        :param data: iterable of ``(review, label)`` pairs
        """
        self.vocab_words, self.word2idx, self.idx2word = self._build_vocabulary(data)
        self.embedding = self.load_vectors()
        print(self.embedding.shape)

    def indexer(self, word):
        """Return the index of ``word``, or the ``<UNK>`` index if unknown."""
        try:
            return self.word2idx[word]
        except KeyError:
            return self.word2idx[self.UNK_TOKEN]

    def _build_vocabulary(self, data):
        """Tokenize the reviews, train FastText and build the index maps.

        :param data: iterable of ``(review, label)`` pairs; labels are unused
        :return: ``(vocab_words, word2idx, idx2word)``
        """
        reviews = [review for review, _label in data]
        tokens = [
            [token for token, _pos in self.tokenizer.pos(review, norm=True)]
            for review in reviews
        ]

        # Each sentence must be a list of tokens. A space-joined string would
        # be iterated character by character, yielding a character vocabulary.
        self.fasttext = FastText(sentences=tokens,
                                 size=self.embedding_size,
                                 max_vocab_size=self.vocabulary_size - 2)

        vocab_words = self.fasttext.wv.vocab
        unk_index = len(vocab_words)
        pad_index = unk_index + 1

        word2idx = {word: idx for idx, word in enumerate(vocab_words)}
        word2idx[self.UNK_TOKEN] = unk_index
        word2idx[self.PAD_TOKEN] = pad_index

        idx2word = dict(enumerate(vocab_words))
        idx2word[unk_index] = self.UNK_TOKEN
        idx2word[pad_index] = self.PAD_TOKEN

        return vocab_words, word2idx, idx2word

    def load_vectors(self):
        """Return the ``(vocabulary_size, embedding_size)`` embedding matrix.

        Rows for ``<UNK>``, ``<PAD>`` and for indices that have no word stay
        zero; every other row is the FastText vector of the word at that index.
        """
        special_tokens = (self.UNK_TOKEN, self.PAD_TOKEN)
        # Start from zeros so special and unused rows need no extra handling,
        # and an empty vocabulary size yields an empty matrix.
        embedding = np.zeros((self.vocabulary_size, self.embedding_size))
        for idx in range(self.vocabulary_size):
            word = self.idx2word.get(idx)
            if word is None or word in special_tokens:
                continue
            embedding[idx] = self.fasttext.wv[word]
        return embedding

    def state_dict(self):
        """Return the index maps, vocabulary and embedding (as nested lists).

        The trained FastText model itself is not included.
        """
        state = {'idx2word': self.idx2word,
                 'word2idx': self.word2idx,
                 'vocab_words': self.vocab_words,
                 'embedding': self.embedding.tolist()}
        return state

    def load_state_dict(self, state_dict):
        """Restore the fields produced by :meth:`state_dict`.

        :param state_dict: mapping with the keys ``idx2word``, ``word2idx``,
            ``vocab_words`` and ``embedding``
        """
        self.idx2word = state_dict['idx2word']
        self.word2idx = state_dict['word2idx']
        self.vocab_words = state_dict['vocab_words']
        self.embedding = np.array(state_dict['embedding'])


In [11]:
class AbnormalWordExtractor:
    """Presence flags for a fixed list of words that seem highly informative."""

    # Words whose presence in the raw text is flagged, in output order.
    ABNORMAL_WORDS = ('다세포', '형래', '우뢰매')

    def __init__(self):
        # Feature width is known up front, so it is set here rather than on
        # the first extract_feature call, as the other extractors do.
        self.n = len(self.ABNORMAL_WORDS)

    def fit(self, data):
        """No-op; this extractor has nothing to fit."""
        pass

    def extract_feature(self, raw_text, tokenized_text):
        """Return one 0/1 flag per word in ``ABNORMAL_WORDS``.

        :param raw_text: text searched by substring match
        :param tokenized_text: unused
        :return: tuple of ``self.n`` ints, 1 where the word occurs in
            ``raw_text`` and 0 otherwise
        """
        return tuple(int(word in raw_text) for word in self.ABNORMAL_WORDS)

In [96]:
class ScoreExpressionExtractor:
    """Extracts explicit score expressions such as "9점" or "별 5개"."""

    # Width of the one-hot output; slot i stands for a score of i + 1.
    NUM_SLOTS = 10

    def __init__(self):
        # The lookbehind keeps the tail of a longer number from being read as
        # a score of its own (e.g. "100점" must not be read as "0점").
        self.re_score = re.compile(r"(?<![0-9])[1-9]?[0-9]점")
        self.re_star = re.compile(r"별 ?[0-9]?[0-9반] ?개")
        self.n = self.NUM_SLOTS

    def fit(self, data):
        """No-op; this extractor has nothing to fit."""
        pass

    def extract_feature(self, raw_text, tokenized_text):
        """One-hot encode the score mentioned in ``raw_text``.

        A star expression ("별 N개") takes precedence over a "N점" expression.
        When several expressions of the chosen kind occur, the last one wins.

        :param raw_text: text to search
        :param tokenized_text: unused
        :return: tuple of ``self.n`` ints; all zeros when no usable score
            expression is found
        """
        values = [0] * self.NUM_SLOTS
        mentions = (self.re_star.findall(raw_text)
                    or self.re_score.findall(raw_text))
        if mentions:
            slot = self._slot_for(mentions[-1])
            if slot is not None:
                values[slot] = 1
        return tuple(values)

    def _slot_for(self, expression):
        """Map a matched expression to its output slot, or None if unusable."""
        digits = re.sub(r"[^0-9]", "", expression)
        # No digits means the half-star form ("별 반개"); treat it as 0.
        value = int(digits) if digits else 0
        if value > self.NUM_SLOTS:
            # Not on the 1-10 scale (e.g. "90점"), so no slot is set.
            return None
        # NOTE(review): star counts are read on the same 1-10 scale as "점"
        # scores, and 0 or half a star goes to the lowest slot -- confirm.
        return max(value, 1) - 1

In [40]:
from konlpy.tag import Twitter
# Module-level konlpy Twitter tagger instance.
twitter = Twitter()

In [43]:
class SleepnessExtractor:
    """Flags expressions of sleepiness (졸리다, 졸다, 자다)."""

    # Stemmed forms that count as a sleepiness expression.
    SLEEP_EXPRESSIONS = frozenset(['졸리다', '졸다', '자다'])

    def __init__(self):
        self.twitter = Twitter()
        self.n = 1

    def fit(self, data):
        """No-op; this extractor has nothing to fit."""
        pass

    def extract_feature(self, raw_text, tokenized_text):
        """Report whether ``raw_text`` contains a sleepiness expression.

        The text is tagged with normalization and stemming, and each
        resulting token is compared against ``SLEEP_EXPRESSIONS``.

        :param raw_text: text to tag and search
        :param tokenized_text: unused
        :return: 1-tuple holding 1 if any token matches, otherwise 0
        """
        stem_tokens = self.twitter.pos(raw_text, norm=True, stem=True)
        # any() stops at the first matching token.
        sleepy = any(token in self.SLEEP_EXPRESSIONS
                     for token, _pos in stem_tokens)
        return (int(sleepy),)

In [80]:
# normalizer
def delete_quote(raw_text):
    """Strip quote characters and cut the text at the first "10자".

    Every single and double quote is removed. If the marker "10자" occurs,
    the marker and everything after it are dropped.
    (Presumably the marker starts filler typed to reach a length minimum --
    not confirmed here.)

    :param raw_text: text to normalize
    :return: the normalized text
    """
    quote_remover = str.maketrans('', '', '\'"')
    unquoted = raw_text.translate(quote_remover)
    # partition() returns the whole string as the head when the marker is absent.
    kept, _marker, _rest = unquoted.partition("10자")
    return kept

In [78]:
# Up to three jamo or digits may be wedged between the two ends of a variant.
_SIBAL_FILLER = '[ㄱ-ㅎㅏ-ㅣ0-9]{,3}'

# (first, last) character pairs of the recognised spelling variants.
# Order is preserved because regex alternation tries them left to right.
_SIBAL_ENDPOINTS = (
    ('씨', '발'), ('씨', '팔'), ('씨', '빨'),
    ('씨', '벌'), ('씨', '바'), ('씨', '뻘'), ('씨', '펄'),
    ('시', '바'), ('씨', '파'), ('씨', '빠'), ('시', '빠'),
    ('시', '파'), ('시', '발'), ('신', '발'), ('씨', '방'),
    ('ㅅ', 'ㅂ'), ('ㅆ', '발'), ('ㅅ', '발'), ('ㅆ', '팔'),
)

# Compiled once at definition time instead of on every call.
_SIBAL_RE = re.compile(
    '|'.join(first + _SIBAL_FILLER + last for first, last in _SIBAL_ENDPOINTS)
)


def sibalizer(raw_text):
    """Normalize spelling variants of the swear word to "시발".

    The substitution is a single left-to-right pass, so rewriting one match
    can no longer corrupt another. The earlier find-then-replace loop turned
    "ㅅㅂ ... ㅅㅅㅂ" into "시발 ... ㅅ시발", because replacing the shorter
    match first destroyed the longer one.

    :param raw_text: text to normalize
    :return: ``raw_text`` with every matched variant replaced by "시발"
    """
    return _SIBAL_RE.sub("시발", raw_text)

In [None]:
--model=VDCNN_feat --tokenizer=JamoMaskedTokenizer --dictionary=RandomDictionary --features=BasicFeaturesExtractor_ImportantWordFeaturesExtractor_MovieActorFeaturesExtractor --epochs=100 --sort_dataset --lr_schedule --learning_rate=0.003 --min_length=64 --max_length=100 --embedding_size=16