# 180408_FastTextVectorizer

현재 FastTextVectorizer + WordCNN 조합의 문제는 WordCNN에 맞는 적절한 tokenizing이 이루어지지 않았다는 점입니다. 따라서 TwitterTokenizer가 만든 토큰 단위로 embedding하는 vectorizer를 만들고, 각 토큰의 자모 단위 embedding을 해당 토큰의 embedding에 이어 붙이는 방식으로 진행하겠습니다.

- TwitterTokenizer : 먼저 영화 id와 배우 id를 masking한 뒤 나머지를 POS tagging합니다. 다른 tokenizer와 달리 (token, pos) 튜플의 리스트를 반환합니다.
- FastTextTokenVectorizer : FastText기반으로 token을 학습하여 vector로 만듭니다.

In [13]:
from konlpy.tag import Twitter
import re

In [9]:
# Sanity check: see how the Twitter tagger returns the cat emoji that is used
# as the actor placeholder. In the recorded output it comes back as '\uf431'
# tagged 'Foreign', i.e. not as the original emoji character.
Twitter().pos("🐱꽃이 피었습니다.", stem=True)

[('\uf431', 'Foreign'),
 ('꽃', 'Noun'),
 ('이', 'Josa'),
 ('피다', 'Verb'),
 ('.', 'Punctuation')]

In [15]:
class TwitterTokenizer:
    """Split text into (token, pos) pairs with KoNLPy's Twitter tagger.

    Movie ids (``mv`` followed by 2-10 digits) and actor ids (``ac`` followed
    by 2-10 digits) are replaced by placeholder emoji before tagging, and the
    resulting placeholder tokens are relabelled with the custom tags
    ``'Movie'`` and ``'Actor'``.
    """

    # Placeholder emoji substituted for the ids before tagging.
    MOVIE_MASK = '🐶'
    ACTOR_MASK = '🐱'

    # Forms under which a placeholder may come back from the tagger. The
    # tagger has been observed to return the emoji with its code point cut to
    # the low 16 bits (U+1F436 -> U+F436, U+1F431 -> U+F431); the intact emoji
    # is accepted as well so that the relabelling does not silently stop
    # working when the tagger returns the character unchanged.
    # NOTE(review): which form appears presumably depends on the installed
    # KoNLPy/JPype versions - confirm.
    _MOVIE_FORMS = frozenset(('🐶', '\uf436'))
    _ACTOR_FORMS = frozenset(('🐱', '\uf431'))

    def __init__(self, config):
        """Create the tagger and compile the id patterns.

        Args:
            config: accepted for interface compatibility; not read here.
        """
        self.twitter = Twitter()
        self.mv = re.compile(r'mv[0-9]{2,10}')
        self.ac = re.compile(r'ac[0-9]{2,10}')

    def tokenize(self, raw_text, stem=False):
        """Mask movie/actor ids, then POS-tag the text.

        Args:
            raw_text: input string, e.g. "무궁화 꽃이 피었습니다."
            stem: forwarded to the tagger's ``pos`` call.

        Returns:
            A list of (token, pos) pairs, e.g.
            [("무궁화", "Noun"), ("꽃", "Noun"), ...]. Masked movie ids appear
            as ('🐶', 'Movie') and masked actor ids as ('🐱', 'Actor').
        """
        # Movie ids are masked first, then actor ids, on the already masked text.
        masked = self.ac.sub(self.ACTOR_MASK,
                             self.mv.sub(self.MOVIE_MASK, raw_text))

        relabelled = []
        for pair in self.twitter.pos(masked, stem=stem):
            token = pair[0]
            if token in self._MOVIE_FORMS:
                relabelled.append((self.MOVIE_MASK, 'Movie'))
            elif token in self._ACTOR_FORMS:
                relabelled.append((self.ACTOR_MASK, 'Actor'))
            else:
                relabelled.append(pair)
        return relabelled

In [20]:
from tokenizers import TwitterTokenizer

class FastTextVectorizer:
    """Map tokens to integer indices and to rows of a FastText embedding matrix.

    ``build_dictionary`` tokenizes the reviews, trains a FastText model on the
    tokens, and derives ``vocab_words``, ``word2idx``, ``idx2word`` and
    ``embedding`` from it. ``state_dict`` / ``load_state_dict`` export and
    restore those four attributes.
    """

    # Special tokens appended after the learned vocabulary.
    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, tokenizer, config):
        """Set up the tokenizer and read sizes from ``config``.

        Args:
            tokenizer: accepted for interface compatibility.
            config: object providing ``vocabulary_size`` and
                ``embedding_size``; also passed to ``TwitterTokenizer``.
        """
        # NOTE(review): the `tokenizer` argument is ignored and a
        # TwitterTokenizer is always created; `_build_vocabulary` relies on
        # its (token, pos) output. Confirm this is intended.
        self.tokenizer = TwitterTokenizer(config)
        self.vocabulary_size = config.vocabulary_size
        self.embedding_size = config.embedding_size
        self.fasttext = None

    def build_dictionary(self, data):
        """Build the vocabulary mappings and the embedding matrix from ``data``.

        Args:
            data: iterable of (review, label) pairs.
        """
        self.vocab_words, self.word2idx, self.idx2word = self._build_vocabulary(data)
        self.embedding = self.load_vectors()
        print(self.embedding.shape)

    def indexer(self, word):
        """Return the index of ``word``, or the ``<UNK>`` index if it is unknown."""
        try:
            return self.word2idx[word]
        except (KeyError, TypeError):
            # KeyError: out-of-vocabulary word. TypeError: unhashable input,
            # which is also mapped to <UNK>. Anything else (for example
            # KeyboardInterrupt) is no longer swallowed.
            return self.word2idx[self.UNK_TOKEN]

    def _build_vocabulary(self, data):
        """Train FastText on the tokenized reviews and build index mappings.

        Args:
            data: iterable of (review, label) pairs; labels are not used.

        Returns:
            (vocab_words, word2idx, idx2word), where ``<UNK>`` and ``<PAD>``
            take the two indices directly after the learned vocabulary.
        """
        # Keep only the token of each (token, pos) pair; POS tags are not
        # used for training.
        tokens = [
            [token for token, _pos in self.tokenizer.tokenize(review, stem=False)]
            for review, _label in data
        ]

        # Two slots of the vocabulary budget are reserved for <UNK> and <PAD>.
        # NOTE(review): `size=` and `wv.vocab` look like the pre-4.0 gensim
        # API - confirm against the installed version.
        self.fasttext = FastText(sentences=tokens,
                                 size=self.embedding_size,
                                 max_vocab_size=self.vocabulary_size - 2)

        vocab_words = self.fasttext.wv.vocab
        unk_idx = len(vocab_words)
        pad_idx = unk_idx + 1

        word2idx = {word: idx for idx, word in enumerate(vocab_words)}
        word2idx[self.UNK_TOKEN] = unk_idx
        word2idx[self.PAD_TOKEN] = pad_idx

        idx2word = dict(enumerate(vocab_words))
        idx2word[unk_idx] = self.UNK_TOKEN
        idx2word[pad_idx] = self.PAD_TOKEN

        return vocab_words, word2idx, idx2word

    def load_vectors(self):
        """Assemble the embedding matrix from the trained FastText model.

        Returns:
            Array of shape (vocabulary_size, embedding_size). Rows of learned
            words hold their FastText vectors; all remaining rows (including
            those of ``<UNK>`` and ``<PAD>``) are zero.
        """
        specials = (self.UNK_TOKEN, self.PAD_TOKEN)
        embedding = np.zeros((self.vocabulary_size, self.embedding_size))
        learned_rows = min(len(self.vocab_words), self.vocabulary_size)
        for i in range(learned_rows):
            word = self.idx2word[i]
            # A corpus token that collides with a special token keeps a zero row.
            if word not in specials:
                embedding[i] = self.fasttext.wv[word]
        return embedding

    def state_dict(self):
        """Return the mappings and the embedding (as nested lists) in a dict."""
        return {
            'idx2word': self.idx2word,
            'word2idx': self.word2idx,
            'vocab_words': self.vocab_words,
            'embedding': self.embedding.tolist(),
        }

    def load_state_dict(self, state_dict):
        """Restore the attributes exported by ``state_dict``."""
        self.idx2word = state_dict['idx2word']
        self.word2idx = state_dict['word2idx']
        self.vocab_words = state_dict['vocab_words']
        self.embedding = np.array(state_dict['embedding'])

ImportError: cannot import name 'TwitterTokenizer'