In [1]:
import pandas as pd
import numpy as np
import numpy as np
from collections import Counter
from konlpy.tag import Twitter
from gensim.models import FastText

In [None]:
class TotalVectorizer:
    """
    A dictionary that maps a word into a vector which 3 kinds of embedding concatenated
    [16] jamo based embedding
    [128] word based embedding
    [128] stemminized word based embedding
    [8] POS tags based embeddding
    """

    def __init__(self, tokenizer, config):
        self.tokenizer = Twitter()
        self.vocabulary_size = config.vocabulary_size
        self.embedding_size = 256
        self.PAD_TOKEN = '<PAD>'
        self.UNK_TOKEN = '<UNK>'
        self.fasttext = None

    def build_dictionary(self, data):
        self.vocab_words, self.word2idx, self.idx2word = self._build_vocabulary(data)
        self.embedding = self.load_vectors()
        print(self.embedding.shape)

    def indexer(self, word):
        try:
            return self.word2idx[word]
        except:
            return self.word2idx['<UNK>']

    def _build_vocabulary(self, data):
        reviews = [review for review, label in data]
        tokenized_reviews = [self.tokenizer.pos(review, norm=True) for review in reviews]

        tokens = [[token for token, pos in tokenized_list] for tokenized_list in tokenized_reviews]
        tags = [[pos for token, pos in tokenized_list] for tokenized_list in tokenized_reviews]

        self.fasttext = FastText(sentences=[' '.join(review) for review in tokens],
                                 size=self.embedding_size,
                                 max_vocab_size=self.vocabulary_size - 2)

        vocab_words = self.fasttext.wv.vocab
        word2idx = {word: idx for idx, word in enumerate(vocab_words)}
        word2idx['<UNK>'] = len(vocab_words)
        word2idx['<PAD>'] = len(vocab_words) + 1

        idx2word = {idx: word for idx, word in enumerate(vocab_words)}
        idx2word[len(vocab_words)] = '<UNK>'
        idx2word[len(vocab_words) + 1] = '<PAD>'

        return vocab_words, word2idx, idx2word

    def load_vectors(self):
        word_vectors = []
        for i in self.idx2word:
            word = self.idx2word[i]
            if word in ['<UNK>', '<PAD>']:
                vector = np.zeros(self.embedding_size)
            else:
                vector = self.fasttext.wv[word]
            word_vectors.append(vector)
        embedding = np.stack(word_vectors)
        return embedding

    def state_dict(self):
        state = {'idx2word': self.idx2word,
                 'word2idx': self.word2idx,
                 'vocab_words': self.vocab_words,
                 'embedding': self.embedding.tolist()}
        return state

    def load_state_dict(self, state_dict):
        self.idx2word = state_dict['idx2word']
        self.word2idx = state_dict['word2idx']
        self.vocab_words = state_dict['vocab_words']
        self.embedding = np.array(state_dict['embedding'])

In [2]:
twitter = Twitter()

In [6]:
with open("../code/data/small/train/train_data") as f:
    reviews = f.readlines()
    reviews = [line.strip() for line in reviews]
with open("../code/data/small/train/train_label") as f:
    labels = f.readlines()
    labels = [line.strip() for line in labels]

In [24]:
import re

In [35]:
maskings = re.compile("mv[0-9]*"||"ac[0-9]*")

SyntaxError: invalid syntax (<ipython-input-35-b59cbc5a450a>, line 1)

In [33]:
r.findall("mv10009가 최고야 ac1010이 들어가다니!")

['mv10009']

In [None]:
def text_cleanser(raw_text):
    t = Twitter()
    t.pos()

In [None]:
cleaned_reviews = 

In [15]:
reviews_tagged = [twitter.pos(review) for review in reviews]

In [16]:
reviews_tokens = [[token for token, pos in review] for review in reviews_tagged]

In [18]:
reviews_pos = [[pos for token, pos in review] for review in reviews_tagged]

In [21]:
fasttext_word = FastText()

In [22]:
fasttext_word.build_vocab(reviews_tokens)

In [23]:
fasttext_word.wv.vocab

{'아련한': <gensim.models.keyedvectors.Vocab at 0x1a38f465f8>,
 '향수': <gensim.models.keyedvectors.Vocab at 0x1a38f46780>,
 '를': <gensim.models.keyedvectors.Vocab at 0x1a38f46c18>,
 '떠올리': <gensim.models.keyedvectors.Vocab at 0x1a38f46c50>,
 '게': <gensim.models.keyedvectors.Vocab at 0x1a38f46160>,
 '만드는': <gensim.models.keyedvectors.Vocab at 0x1a38f46978>,
 '추억': <gensim.models.keyedvectors.Vocab at 0x1a38f46ef0>,
 '의': <gensim.models.keyedvectors.Vocab at 0x1a38f46320>,
 '영화': <gensim.models.keyedvectors.Vocab at 0x1a38f46240>,
 'SF': <gensim.models.keyedvectors.Vocab at 0x1a38f465c0>,
 '코미디': <gensim.models.keyedvectors.Vocab at 0x1a38f467b8>,
 '영화사': <gensim.models.keyedvectors.Vocab at 0x1a38f46eb8>,
 '에서': <gensim.models.keyedvectors.Vocab at 0x1a38f46a20>,
 '가장': <gensim.models.keyedvectors.Vocab at 0x1a38f46390>,
 '해': <gensim.models.keyedvectors.Vocab at 0x1a38f46e48>,
 '야할': <gensim.models.keyedvectors.Vocab at 0x1a38f46da0>,
 'mv': <gensim.models.keyedvectors.Vocab at 0x1a38f46d3