In [1]:
from mxnet import gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import loss as gloss, nn

import sys

  from ._conv import register_converters as _register_converters


In [2]:
def corr1d(X, K):
    """Compute the one-dimensional cross-correlation of ``X`` with ``K``.

    The kernel ``K`` is slid along ``X`` one position at a time. At every
    position the overlapping window of ``X`` is multiplied element-wise by
    ``K`` and the products are summed into a single output value.

    Args:
        X: One-dimensional input array (indexed via ``X.shape[0]`` and
            slicing).
        K: One-dimensional kernel array.

    Returns:
        An array created by ``nd.zeros`` with ``X.shape[0] - K.shape[0] + 1``
        entries, one per window position.
    """
    width = K.shape[0]
    n_out = X.shape[0] - width + 1
    out = nd.zeros(n_out)
    for start in range(out.shape[0]):
        # Window of X currently covered by the kernel.
        window = X[start:start + width]
        out[start] = (window * K).sum()
    return out

In [3]:
# Sanity-check corr1d on a length-7 input with a length-2 kernel.
X = nd.array([0, 1, 2, 3, 4, 5, 6])
K = nd.array([1, 2])
corr1d(X, K)


[ 2.  5.  8. 11. 14. 17.]
<NDArray 6 @cpu(0)>

In [4]:
def corr1d_multi_in(X, K):
    """Multi-input-channel one-dimensional cross-correlation.

    ``X`` and ``K`` are iterated together along their 0th (channel)
    dimension. Each channel of ``X`` is cross-correlated with the matching
    channel of ``K`` via ``corr1d`` and the per-channel results are summed.

    Args:
        X: Input whose 0th dimension indexes channels.
        K: Kernel whose 0th dimension indexes channels.

    Returns:
        The element-wise sum (``nd.add_n``) of the per-channel results.
    """
    per_channel = []
    for channel, kernel in zip(X, K):
        per_channel.append(corr1d(channel, kernel))
    # add_n takes the arrays as separate positional arguments, hence the
    # unpacking of the collected list.
    return nd.add_n(*per_channel)

In [5]:
# Three input channels, each of length 7.
X = nd.array([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
# One length-2 kernel per input channel.
K = nd.array([[1, 2], [3, 4], [-1, -3]])
# First output value: (0*1 + 1*2) + (1*3 + 2*4) + (2*-1 + 3*-3) = 2 + 11 - 11 = 2.
corr1d_multi_in(X, K)


[ 2.  8. 14. 20. 26. 32.]
<NDArray 6 @cpu(0)>

In [6]:
class TextCNN(nn.Block):
    """Text classifier built from parallel 1-D convolutions (textCNN).

    Token indices are looked up in two embedding layers whose outputs are
    concatenated along the word-vector dimension. Several ``Conv1D`` layers
    with different kernel widths are applied, each followed by
    max-over-time pooling. The pooled features are concatenated, passed
    through dropout and projected by a dense layer.

    Args:
        vocab_size: Number of rows in each embedding table.
        embed_size: Word-vector dimension of each embedding table.
        kernel_sizes: Iterable of kernel widths, one per convolution.
        num_channels: Iterable of output-channel counts, one per
            convolution. It is paired with ``kernel_sizes`` via ``zip``, so
            surplus entries in the longer iterable are silently ignored.
        num_classes: Number of output units of the final dense layer.
            Defaults to 2, the previously hard-coded value.
        dropout: Dropout rate applied before the dense layer. Defaults to
            0.5, the previously hard-coded value.
        **kwargs: Forwarded unchanged to ``nn.Block.__init__``.
    """

    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels,
                 num_classes=2, dropout=0.5, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        # NOTE: child layers are created in the same order as before so that
        # automatically generated layer/parameter names stay unchanged.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Second embedding table, intended not to participate in training.
        # NOTE(review): nothing in this class freezes it; the caller is
        # presumably expected to disable its gradient — confirm.
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(num_classes)
        # The max-over-time pooling layer has no weight, so a single
        # instance is shared by all convolutions.
        self.pool = nn.GlobalMaxPool1D()
        # Create one one-dimensional convolutional layer per
        # (channels, kernel width) pair.
        self.convs = nn.Sequential()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))

    def forward(self, inputs):
        """Run the network on a batch of token indices.

        Args:
            inputs: Token indices; each embedding lookup yields an array of
                shape (batch size, number of words, word vector dimension).

        Returns:
            The output of the final dense layer applied to the pooled,
            concatenated convolution features.
        """
        # Concatenate the outputs of the two embedding layers, each of shape
        # (batch size, number of words, word vector dimension), along the
        # word-vector dimension.
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        # Conv1D expects the channel dimension before the sequence
        # dimension, so move the word-vector dimension (the channels of the
        # one-dimensional convolution) in front of the word dimension.
        embeddings = embeddings.transpose((0, 2, 1))
        # For each convolution, max-over-time pooling gives an NDArray of
        # shape (batch size, channel size, 1). Flatten removes the trailing
        # dimension; the results are concatenated on the channel dimension.
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected output layer.
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [7]:
import pandas as pd

# Load the tab-separated ratings file and drop every row that contains a
# missing value.
train_data = pd.read_csv('ratings_train.txt', sep='\t').dropna(axis=0)
# The 'document' column is the text corpus consumed by the later cells.
train_corpus = train_data['document']
# Preview the first rows. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated and then discarded,
# so the preview was never displayed before.
train_data.head()

In [8]:
# Build the word co-occurrence matrix and convert it to the nested-dict form
# {row index: {column index: count}} passed to glove.Glove later on.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: X rebinds the name used for the NDArray demo input in an earlier cell.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1,1))
X = vectorizer.fit_transform(train_corpus)
Xc = X.T * X             # co-occurrence matrix (sparse matrix product)
Xc.setdiag(0)            # zero out the diagonal (a word paired with itself)
result = Xc.toarray()    # convert to a dense array
dic = {}
for idx1, row in enumerate(result):
    # Look up the positive entries of the row in one vectorized step instead
    # of testing every cell in a Python loop (the dense matrix has
    # vocabulary-size squared cells). Column indices come out in ascending
    # order, so the dict contents and ordering match a cell-by-cell scan.
    cols = np.flatnonzero(row > 0)
    # Rows without any positive entry still get an (empty) dict.
    dic[idx1] = dict(zip(cols.tolist(), row[cols]))

In [9]:
# Build the word list: vocabulary terms ordered by their feature index, so
# that vocab[i] is the term stored at index i of vectorizer.vocabulary_.
import operator
vocab = [
    term
    for term, _ in sorted(vectorizer.vocabulary_.items(),
                          key=operator.itemgetter(1))
]

In [11]:
# Install the `glove` package directly from its GitHub repository.
# NOTE(review): the install is not pinned to a commit or tag, so a later
# re-run may pick up different code — consider pinning.
!pip install git+https://github.com/JonathanRaiman/glove.git

Collecting git+https://github.com/JonathanRaiman/glove.git
  Cloning https://github.com/JonathanRaiman/glove.git to /tmp/pip-req-build-g1gwl6ry
Building wheels for collected packages: glove
  Running setup.py bdist_wheel for glove ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-o3smc2h6/wheels/e4/ea/7c/0e887c01470d73c6b0f3395891804fc2923caca44dd76cdedc
Successfully built glove
Installing collected packages: glove
Successfully installed glove-1.0.1
[33mYou are using pip version 18.0, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [12]:
# Train GloVe embeddings on the co-occurrence dictionary `dic`.
import glove
# d=100 yields 100-dimensional vectors (W.shape is reported as (11940, 100)
# further down). alpha and x_max are presumably the exponent and cutoff of
# the GloVe weighting function — TODO confirm against the glove package.
model = glove.Glove(dic, d=100, alpha=0.75, x_max=100.0)
for epoch in range(25):
    # Each train() call returns a value that is reported as the epoch error.
    err = model.train(batch_size=200, workers=4)
    # flush=True makes every epoch line appear immediately.
    print("epoch %d, error %.3f" % (epoch, err), flush=True)

epoch 0, error 0.010
epoch 1, error 0.008
epoch 2, error 0.006
epoch 3, error 0.006
epoch 4, error 0.006
epoch 5, error 0.005
epoch 6, error 0.005
epoch 7, error 0.005
epoch 8, error 0.005
epoch 9, error 0.005
epoch 10, error 0.005
epoch 11, error 0.005
epoch 12, error 0.004
epoch 13, error 0.004
epoch 14, error 0.004
epoch 15, error 0.004
epoch 16, error 0.004
epoch 17, error 0.004
epoch 18, error 0.004
epoch 19, error 0.004
epoch 20, error 0.004
epoch 21, error 0.003
epoch 22, error 0.003
epoch 23, error 0.003
epoch 24, error 0.003


In [13]:
# Alias the trained GloVe model under a more descriptive name; the bare
# expression on the last line displays the object's repr.
embedding_model = model
embedding_model

<glove.glove.Glove at 0x7f31615e5400>

In [18]:
# Number of terms in the fitted vocabulary.
len(vectorizer.vocabulary_)

11940

In [19]:
# Shape of the learned embedding matrix; the first dimension should equal the
# vocabulary size and the second the embedding dimension d.
embedding_model.W.shape

(11940, 100)

In [None]:
# Display the model's ContextW attribute.
# NOTE(review): presumably the context-word embedding matrix with the same
# shape as W — confirm; if so, consider showing only .shape or a small slice
# instead of the full array.
embedding_model.ContextW