In [1]:
import numpy as np
import pandas as pd
from lxml import html

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer

Using gpu device 0: GRID K520


In [2]:
def clean(texts):
    """Strip HTML markup from each text and normalise it.

    Each entry is parsed with ``lxml.html.fromstring``; the element's text
    content is then lower-cased and stripped of surrounding whitespace.

    Args:
        texts: iterable of strings that may contain HTML markup.

    Returns:
        A list of cleaned strings, one per input entry and in the same order.
        Entries that are empty or whitespace-only yield ``''``.
    """
    cleaned = []
    for text in texts:
        # lxml refuses to parse a blank document (ParserError: "Document is
        # empty"), so a single empty review would abort the whole batch.
        if not text.strip():
            cleaned.append('')
            continue
        cleaned.append(html.fromstring(text).text_content().lower().strip())
    return cleaned

In [3]:
# Read the labelled, tab-separated training reviews.
tr_data = pd.read_csv('labeledTrainData.tsv', sep='\t')
trY = tr_data['sentiment'].values
trX = clean(tr_data['review'].values)  # markup removed, lower-cased, stripped

print("Training data loaded and cleaned.")

# Fit the tokenizer on the cleaned reviews and replace them with its output.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")

# Network: embedding -> gated recurrent layer -> single sigmoid unit.
layers = [
    # The embedding's n_features is taken from the fitted tokenizer.
    Embedding(size=256, n_features=tokenizer.n_features),
    # NOTE(review): seq_output=False presumably emits only the final state,
    # and p_drop=0.75 looks like a dropout rate - confirm against passage.
    GatedRecurrent(
        size=512,
        activation='tanh',
        gate_activation='steeper_sigmoid',
        init='orthogonal',
        seq_output=False,
        p_drop=0.75
    ),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

# NOTE(review): cost='bce' is presumably binary cross-entropy - confirm.
model = RNN(
    layers=layers,
    cost='bce',
    updater=Adadelta(lr=0.5)
)

Training data loaded and cleaned.
Training data tokenized.


  from scan_perform.scan_perform import *


In [5]:
# Display the fitted tokenizer's n_features attribute - the same value the
# Embedding layer receives as its n_features argument.
tokenizer.n_features

18589

In [7]:
# Train on the tokenized training reviews for 30 epochs.
model.fit(trX, trY, n_epochs=30)

# Put the test reviews through the same cleaning step and the already-fitted
# tokenizer (transform only, no re-fit) before predicting.
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()

# Build the frame column by column so ids and predictions each keep their own
# dtype; stacking both into a single ndarray first would coerce them to one
# shared dtype. `columns` pins the column order written to the CSV header.
submission = pd.DataFrame(
    {'id': ids, 'sentiment': pr_teX},
    columns=['id', 'sentiment'],
)
submission.to_csv('submission.csv', index=False)

Epoch 0 Seen 24621 samples Avg cost 0.5218 Time elapsed 205 seconds
Epoch 1 Seen 49242 samples Avg cost 0.3855 Time elapsed 414 seconds
Epoch 2 Seen 73863 samples Avg cost 0.2971 Time elapsed 656 seconds
Epoch 3 Seen 98484 samples Avg cost 0.2663 Time elapsed 897 seconds
Epoch 4 Seen 123105 samples Avg cost 0.2386 Time elapsed 1138 seconds
Epoch 5 Seen 147726 samples Avg cost 0.2117 Time elapsed 1381 seconds
Epoch 6 Seen 172347 samples Avg cost 0.1950 Time elapsed 1624 seconds
Epoch 7 Seen 196968 samples Avg cost 0.1830 Time elapsed 1854 seconds
Epoch 8 Seen 221589 samples Avg cost 0.1651 Time elapsed 2062 seconds
Epoch 9 Seen 246210 samples Avg cost 0.1551 Time elapsed 2268 seconds
Epoch 10 Seen 270831 samples Avg cost 0.1457 Time elapsed 2507 seconds
Epoch 11 Seen 295452 samples Avg cost 0.1305 Time elapsed 2743 seconds
Epoch 12 Seen 320073 samples Avg cost 0.1199 Time elapsed 2975 seconds
Epoch 13 Seen 344694 samples Avg cost 0.1115 Time elapsed 3208 seconds
Epoch 14 Seen 369315 sam