### Load Dataset

In [1]:
from utils import *
import numpy as np

In [None]:
# Read the training CSV through the project helper, then pull the two title
# columns and the label column out as arrays for the preprocessing cells below.
# NOTE(review): the recorded output shows jieba initialising here, so the
# helper presumably tokenises the titles -- confirm in utils.
train_file = 'data/train.csv'
train_df = read_train_data(train_file)
sentA = train_df['title1_zh'].values  # values of column 'title1_zh'
sentB = train_df['title2_zh'].values  # values of column 'title2_zh'
label = train_df['label'].values      # values of column 'label'

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\chiamin\AppData\Local\Temp\jieba.cache
Loading model cost 0.568 seconds.
Prefix dict has been built succesfully.


In [None]:
# Preview the top of the loaded training frame as a quick sanity check.
# NOTE(review): assumes read_train_data returns a pandas DataFrame -- confirm.
train_df.head()

In [None]:
# Disabled alternative check: per-value counts of the 'label' column.
# print(train_df['label'].value_counts())
# Draw histograms of the frame's columns with 10 bins each.
# NOTE(review): if train_df is a pandas DataFrame, hist() only draws numeric
# columns -- confirm the columns of interest are actually plotted.
train_df.hist(bins=10)

### Load Word Vector

In [None]:
# Path of the pretrained embedding file and the two special tokens that are
# handed to the loader.
wordvec_file = 'wordvector/zhwiki300-word2vec.txt'
UNK = '<UNK>'
PAD = '<PAD>'

# The loader returns the embedding vectors, a word -> index mapping and the
# vocabulary (semantics defined in utils.load_wordvector).
wordvector, word2index, vocab = load_wordvector(wordvec_file, UNK, PAD)

# Report the length of the first vector and the vocabulary size, one per line.
print(
    'emb_dim: {}'.format(len(wordvector[0])),
    'vocab_size: {}'.format(len(vocab)),
    sep='\n',
)

### Process Training Data

In [None]:
# Run both sentence arrays through the unknown-word step (given the vocabulary
# as a set and the UNK token), then through the word -> index conversion.
# NOTE: this cell rebinds sentA/sentB, so re-running it without first
# re-running the load cell feeds already-converted data back into the helpers.
sentA, sentB = process_unknown(sentA, sentB, set(vocab), UNK)
sentA, sentB = word_to_index(sentA, sentB, word2index)

# Length of every converted sentence, kept alongside the sentences.
seq_lenA = np.array(list(map(len, sentA)))
seq_lenB = np.array(list(map(len, sentB)))

In [None]:
# Persist the converted sentences and their lengths; the training section
# reloads these files with np.load.
np.save(file='data/sentA.npy', arr=sentA)
np.save(file='data/sentB.npy', arr=sentB)
np.save(file='data/seq_lenA.npy', arr=seq_lenA)
np.save(file='data/seq_lenB.npy', arr=seq_lenB)

### One-Hot Encode Label

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the class labels into a dense (non-sparse) matrix.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new spelling first and fall back to the old one
# on installations that do not know it yet.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(sparse=False)

# Encode from the raw label column instead of from `label` itself: the
# original `label = f(label)` form re-encoded an already one-hot matrix
# whenever the cell was executed a second time. On a first run the input is
# identical, because `label` was taken from this same column.
label = onehotencoder.fit_transform(train_df['label'].values.reshape((-1, 1)))

In [None]:
# Persist the one-hot label matrix; the training section reloads this file.
np.save(file='data/label.npy', arr=label)

### Build Graph

In [3]:
from tan import TAN


In [10]:
# Settings handed to the TAN model, filled in one entry at a time.
config = {}
config['num_class'] = 3
config['lstm_unit'] = 200
config['emb_dim'] = len(wordvector[0])   # length of the first word vector
config['vocab_size'] = len(vocab)
config['learning_rate'] = 1e-2
config['wordvec'] = wordvec_file
config['emb_trainable'] = False          # presumably keeps the embeddings fixed -- confirm in tan.py

# Instantiate the model with the settings and the loaded vectors, then build it.
tan = TAN(config, wordvector)
tan.build()

### Train Model

In [None]:
# Reload the preprocessed training arrays written by the earlier cells.
# NOTE(review): if sentA/sentB were stored as object arrays (variable-length
# sequences), recent NumPy versions require allow_pickle=True here -- confirm.
sentA = np.load('data/sentA.npy')
sentB = np.load('data/sentB.npy')
seq_lenA = np.load('data/seq_lenA.npy')
seq_lenB = np.load('data/seq_lenB.npy')
label = np.load('data/label.npy')

# Split into training and validation parts (train_ratio=.7).
# Both sides of each sentence pair must be passed: the previous call handed
# sentA/seq_lenA in twice, so sentB and seq_lenB never reached the model.
train_data, val_data = train_val_split(sentA, sentB, seq_lenA, seq_lenB, label, train_ratio=.7)

In [None]:
# Training settings passed to tan.fit; model_name is also used later to build
# the checkpoint and result paths.
epoch_size, batch_size, model_name = 3, 32, 'tan'

In [None]:
# Fit the model on the training split, passing the validation split, the
# epoch/batch settings, the word -> index mapping and the model name.
# NOTE(review): how each argument is used is defined in TAN.fit (tan.py).
tan.fit(
    train_data,
    val_data,
    epoch_size,
    batch_size,
    word2index,
    model_name,
)

### Test Data

In [None]:
# Read the test CSV and pull out the id column and the two title columns.
test_df = read_test_data('data/test.csv')
test_id = test_df['id'].values
test_sentA = test_df['title1_zh'].values
test_sentB = test_df['title2_zh'].values

# Apply the same two preprocessing steps used for the training sentences:
# the unknown-word step, then the word -> index conversion.
test_sentA, test_sentB = process_unknown(test_sentA, test_sentB, set(vocab), UNK)
test_sentA, test_sentB = word_to_index(test_sentA, test_sentB, word2index)

# Length of every converted test sentence.
test_seq_lenA = np.array(list(map(len, test_sentA)))
test_seq_lenB = np.array(list(map(len, test_sentB)))

In [None]:
# Persist the test ids, converted sentences and lengths; the prediction
# section reloads these files with np.load.
np.save(file='data/test_id.npy', arr=test_id)
np.save(file='data/test_sentA.npy', arr=test_sentA)
np.save(file='data/test_sentB.npy', arr=test_sentB)
np.save(file='data/test_seq_lenA.npy', arr=test_seq_lenA)
np.save(file='data/test_seq_lenB.npy', arr=test_seq_lenB)

### Predict

In [None]:
# Model settings for the prediction run, built from (key, value) pairs.
# The values repeat the ones used in the "Build Graph" section.
# NOTE(review): they presumably have to match the settings the restored
# checkpoint was trained with -- confirm.
config = dict([
    ('num_class', 3),
    ('lstm_unit', 200),
    ('emb_dim', len(wordvector[0])),
    ('vocab_size', len(vocab)),
    ('learning_rate', 1e-2),
    ('wordvec', wordvec_file),
    ('emb_trainable', False),
])

In [None]:
# Create and build a fresh model instance, then restore its saved state from
# the checkpoint file named after the model.
# NOTE(review): this builds a second TAN in the same kernel if the "Build
# Graph" cell was already run -- confirm TAN.build tolerates that.
tan = TAN(config, wordvector)
tan.build()
ckpt_path = 'models/{}/{}.ckpt'.format(model_name, model_name)
tan.restore(ckpt_path)

In [None]:
# Reload the preprocessed test arrays written by the "Test Data" section.
test_id = np.load('data/test_id.npy')
test_sentA = np.load('data/test_sentA.npy')
test_sentB = np.load('data/test_sentB.npy')
test_seq_lenA = np.load('data/test_seq_lenA.npy')
test_seq_lenB = np.load('data/test_seq_lenB.npy')

# Bundle the arrays in the order sentA, sentB, seq_lenA, seq_lenB, then an
# empty array in the last slot (the training bundle carries labels there;
# the test set has none).
test_data = [
    test_sentA,
    test_sentB,
    test_seq_lenA,
    test_seq_lenB,
    np.empty(0),
]

In [None]:
# Run the restored model over the bundled test data; the word -> index mapping
# is passed along as well.
# NOTE(review): the shape/format of `prediction` is defined by TAN.predict -- confirm in tan.py.
prediction = tan.predict(test_data, word2index)

In [None]:
# Build the output path under this model's folder and hand the test ids and
# predictions to the project helper that writes the CSV.
# NOTE(review): the 'models/<model_name>/result/' directory presumably has to
# exist beforehand -- confirm in utils.prediction_to_csv.
result_file = 'models/{}/result/result.csv'.format(model_name)
prediction_to_csv(test_id, prediction, result_file)