In [None]:
import numpy as np
import tensorflow as tf
# Quick TensorFlow sanity check: reduce a small 2x2 matrix along axis 0
# (one mean per column) and show the resulting tensor.
sample_matrix = [[1, 1], [2, 3]]
column_means = tf.reduce_mean(sample_matrix, axis=0)
print(column_means)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory.
input_dir = "../input"
print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

In [None]:
'''Trains an LSTM model on a 5-class phrase sentiment classification task (train.tsv).
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.layers import SpatialDropout1D, Dropout, Conv1D, MaxPooling1D
from keras.layers import Bidirectional
# --- Configuration ----------------------------------------------------------
max_features = 20000  # vocabulary cap handed to the Tokenizer
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 1024
random_seed = 42  # fixes the train/hold-out split so reruns are comparable

# --- Load the labelled phrases ----------------------------------------------
df = pd.read_csv(r"../input/train.tsv", sep='\t')
X = df['Phrase']
y = df['Sentiment']

tokenizer = Tokenizer(num_words=max_features)
# One-hot encode the integer sentiment labels for categorical cross-entropy.
y = to_categorical(y)
# Seed the split: without random_state every kernel restart produces a
# different hold-out set, so scores cannot be compared between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
# The vocabulary is fitted on the training split only; the hold-out texts are
# merely converted with it.
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams that occur in a sequence of integers.

    Every n-gram is a tuple of `ngram_value` consecutive items of
    `input_list`. The result is a set, so repeated n-grams appear once.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) == {(4, 9), (4, 1), (1, 4), (9, 4)}
    True
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) == {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    True
    """
    # One shifted view of the sequence per offset 0..ngram_value-1. Zipping the
    # views pairs each item with its successors and stops at the shortest view,
    # which drops the incomplete windows at the tail.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    ngrams = set()
    for window in zip(*shifted_views):
        ngrams.add(window)
    return ngrams

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment each sequence by appending the ids of the n-grams it contains.

    Args:
        sequences: iterable of integer sequences (lists or tuples).
        token_indice: dict mapping an n-gram tuple to its integer id.
        ngram_range: highest n-gram order to look up; orders 2..ngram_range
            are all considered.

    Returns:
        A new list of lists. Each inner list is the original sequence followed
        by the ids of every known n-gram found in it (bi-grams first, then
        tri-grams, ...). The input sequences are not modified.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Snapshot of the original tokens; list() also accepts tuples.
        tokens = list(input_list)
        new_list = tokens[:]
        for ngram_value in range(2, ngram_range + 1):
            # Slide the window over the ORIGINAL tokens only. Sliding over
            # new_list would let higher-order windows include n-gram ids
            # appended by a lower order, producing spurious matches and
            # needless lookups.
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

ngram_range = 2
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))

    # Gather every distinct n-gram (orders 2..ngram_range) that occurs in the
    # training sequences.
    ngram_set = set()
    for input_list in x_train:
        for order in range(2, ngram_range + 1):
            ngram_set |= create_ngram_set(input_list, ngram_value=order)

    # Give each n-gram its own integer id, numbered from max_features + 1
    # upwards so the ids cannot collide with the existing word indices.
    start_index = max_features + 1
    token_indice = {}
    for offset, ngram in enumerate(ngram_set):
        token_indice[ngram] = start_index + offset
    # Reverse lookup: id -> n-gram.
    indice_token = {index: ngram for ngram, index in token_indice.items()}

    # Grow max_features to one past the largest id handed out above.
    max_features = np.max(list(indice_token.keys())) + 1

    # Append the matching n-gram ids to both the training and hold-out sequences.
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    train_lengths = [len(seq) for seq in x_train]
    test_lengths = [len(seq) for seq in x_test]
    print('Average train sequence length: {}'.format(np.mean(train_lengths, dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(test_lengths, dtype=int)))



print('Pad sequences (samples x time)')
# Run both splits through pad_sequences with the shared maxlen, then report
# the resulting array shapes.
x_train, x_test = [
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
]
for label, padded in (('x_train shape:', x_train), ('x_test shape:', x_test)):
    print(label, padded.shape)

print('Build model...')
# Bidirectional-LSTM classifier:
# embedding -> dropout -> BiLSTM (last state) -> dropout -> 5-way softmax.
# NOTE(review): the embedding width here is 123 while every other model in
# this notebook uses 128 -- confirm this is intentional and not a typo.
model_1 = Sequential([
    Embedding(max_features, 123),
    SpatialDropout1D(0.2),
    Dropout(0.25),
    Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2)),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])

model_1.summary()
# try using different optimizers and different optimizer configs
model_1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

print('Train...')
# The hold-out split doubles as the validation set during training.
fit_options = dict(batch_size=batch_size, epochs=10, validation_data=(x_test, y_test))
model_1.fit(x_train, y_train, **fit_options)
score, acc = model_1.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
from keras.layers import GRU
print('Build model...')
# Bidirectional-GRU classifier:
# embedding -> dropout -> BiGRU (last state) -> dropout -> 5-way softmax.
model_2 = Sequential()
for layer in (
    Embedding(max_features, 128),
    SpatialDropout1D(0.2),
    Dropout(0.25),
    Bidirectional(GRU(512, dropout=0.2, recurrent_dropout=0.2)),
    Dropout(0.5),
    Dense(5, activation='softmax'),
):
    model_2.add(layer)

# try using different optimizers and different optimizer configs
model_2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

print('Train...')
# Single epoch; the hold-out split is used for validation and for the final score.
holdout = (x_test, y_test)
model_2.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=holdout)
score, acc = model_2.evaluate(*holdout, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
print('Build model...')
from keras.layers import Average
from keras.layers import TimeDistributed, Lambda, Input, merge
from keras import Model
from keras import backend as K

# Per-timestep BiLSTM classifier: the recurrent layer returns its full output
# sequence, a shared softmax scores every timestep, and the per-step class
# probabilities are averaged over axis 1 to give one prediction per sample.
input_x = Input((maxlen,))
per_step_layers = [
    Embedding(max_features, 128),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Dropout(0.5),
    TimeDistributed(Dense(5, activation='softmax')),
]
x = input_x
for layer in per_step_layers:
    x = layer(x)
print(type(x))
out = Lambda(lambda step_probs: K.mean(step_probs, 1))(x)
model_3 = Model(input_x, out)

# try using different optimizers and different optimizer configs

model_3.summary()
model_3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

print('Train...')
holdout = (x_test, y_test)
model_3.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=holdout)
score, acc = model_3.evaluate(*holdout, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
print('Build model...')
from keras.layers import Average
from keras.layers import TimeDistributed, Lambda, Input, merge
from keras import Model
from keras import backend as K

# Per-timestep BiGRU classifier: same layout as the per-timestep LSTM model,
# with a GRU as the recurrent unit. GRU itself is imported in an earlier cell.
input_x = Input((maxlen,))
per_step_layers = [
    Embedding(max_features, 128),
    SpatialDropout1D(0.2),
    Bidirectional(GRU(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Dropout(0.5),
    TimeDistributed(Dense(5, activation='softmax')),
]
x = input_x
for layer in per_step_layers:
    x = layer(x)
print(type(x))
# Average the per-step class probabilities over axis 1.
out = Lambda(lambda step_probs: K.mean(step_probs, 1))(x)
model_4 = Model(input_x, out)

# try using different optimizers and different optimizer configs

model_4.summary()
model_4.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

print('Train...')
holdout = (x_test, y_test)
model_4.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=holdout)
score, acc = model_4.evaluate(*holdout, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:

from keras.layers import add
from keras.layers import Merge

# Attention-weighted BiLSTM classifier, built with the functional API.
#
# The layers below were used without ever being imported in this notebook,
# and the legacy Sequential + Merge([...], 'mul') pattern is replaced by the
# Multiply layer applied directly to the two branches.
# NOTE(review): the `Merge` import at the top of this cell is no longer needed
# here and is believed to fail on Keras >= 2.2 -- consider removing it.
from keras.layers import Activation, Flatten, Multiply, Permute, RepeatVector

lstm_units = 512
# Bidirectional concatenates the forward and backward outputs by default.
encoder_width = 2 * lstm_units

input_x = Input((maxlen,))
x = Embedding(max_features, 128)(input_x)
t = SpatialDropout1D(0.2)(x)
x = Bidirectional(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))(t)
model_t = Model(input_x, x)

# try using different optimizers and different optimizer configs

# Attention branch: learn one weight per timestep from the flattened embeddings.
# The tensors are named `attention*` so the label variable `y` is not overwritten.
t = Flatten()(t)
in_y = Dense(maxlen)(t)
attention = Activation('softmax')(in_y)  # Learn a probability distribution over each step.
# Reshape to match the LSTM's output shape, so that we can do element-wise multiplication:
# (batch, maxlen) -> (batch, encoder_width, maxlen) -> (batch, maxlen, encoder_width).
attention = RepeatVector(encoder_width)(attention)
attention = Permute((2, 1))(attention)
attmodel = Model(input_x, attention)

# Multiply each element with the corresponding weight a[i][j][k] * b[i][j]
weighted = Multiply()([x, attention])
weighted = Dropout(0.5)(weighted)
step_probs = TimeDistributed(Dense(5, activation='softmax'))(weighted)
# Average the per-step class probabilities over axis 1.
final_out = Lambda(lambda seq: K.mean(seq, 1))(step_probs)
finalmodel = Model(input_x, final_out)


finalmodel.summary()
finalmodel.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

finalmodel.fit(x_train, y_train,batch_size=batch_size,epochs=3,validation_data=(x_test, y_test))



In [None]:
# Ensemble: add up the class-probability outputs of the four trained models
# on the hold-out split and take the highest-scoring class.
predict_options = dict(verbose=1, batch_size=batch_size)
s = model_1.predict(x_test, **predict_options)
for model in (model_2, model_3, model_4):
    s += model.predict(x_test, **predict_options)
x = s.argmax(axis=1)

# Fraction of hold-out samples whose ensemble prediction matches the label;
# left as the last expression so the notebook displays it.
true_labels = np.argmax(y_test, 1)
np.mean(np.equal(x, true_labels))

In [None]:
# Score the competition test set with the ensemble.
testdf = pd.read_csv('../input/test.tsv', sep='\t')
test = testdf['Phrase']
test = tokenizer.texts_to_sequences(test)
# Apply the same n-gram augmentation that x_train / x_test received before
# padding. Without it the models are fed sequences that lack the n-gram ids
# they were trained on.
if ngram_range > 1:
    test = add_ngram(test, token_indice, ngram_range)
test = sequence.pad_sequences(test, maxlen=maxlen)
#ensemble: sum the class probabilities of all four models, then take the argmax
s = model_1.predict(test, verbose=True,  batch_size=batch_size)
for model in [model_2, model_3, model_4]:
    s += model.predict(test, verbose=True,  batch_size=batch_size)
x = np.argmax(s,1)


In [None]:

print(x)

# Assemble the submission table (phrase id + predicted sentiment class) in one
# step and write it out without the index column.
out = pd.DataFrame({
    'PhraseId': testdf['PhraseId'],
    'Sentiment': x,
})
out.to_csv('rs.csv', index=False)

In [None]:
# Shell escape: print the whole generated submission file into the cell output.
!cat rs.csv