In [28]:
import warnings
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import keras

import re

from sklearn import random_projection
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model

from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

# from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import Model
from keras import metrics
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

stop_words = set(stopwords.words('english') + list(string.punctuation))

warnings.filterwarnings('ignore')

In [2]:
Train = pd.read_csv("data/train.csv")
Test = pd.read_csv("data/test.csv")
Valid = pd.read_csv("data/valid.csv")

#################################################################
Train = Train.iloc[0:10000,:]
#################################################################

In [3]:
# Label
Train_y = Train['stars']
Test_y = Valid['stars']

In [4]:
print('Average word length of comments in train is {0:.0f}.'.format(np.mean(Train['text'].apply(lambda x: len(x.split())))))
print('Average word length of comments in test is {0:.0f}.'.format(np.mean(Test['text'].apply(lambda x: len(x.split())))))

Average word length of comments in train is 112.
Average word length of comments in test is 111.


In [5]:
tokenizer = TweetTokenizer()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(Train['text'].values) + list(Test['text'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(Train['text'])
test_vectorized = vectorizer.transform(Test['text'])

In [6]:
tk = Tokenizer(lower = True, filters='')
tk.fit_on_texts(full_text)

In [7]:
train_tokenized = tk.texts_to_sequences(Train['text'])
test_tokenized = tk.texts_to_sequences(Test['text'])

In [55]:
max_len = 1
x_train_padded_seqs = pad_sequences(train_tokenized, maxlen = max_len)
x_test_padded_seqs = pad_sequences(test_tokenized, maxlen = max_len)

In [79]:
x_test_padded_seqs_split=[]
for i in range(x_test_padded_seqs.shape[0]):
    split1=np.split(x_test_padded_seqs[i],5)
    a=[]
    for j in range(5):
        s=np.split(split1[j],5)
        a.append(s)
    x_test_padded_seqs_split.append(a)
    
x_train_padded_seqs_split=[]
for i in range(x_train_padded_seqs.shape[0]):
    split1=np.split(x_train_padded_seqs[i],5)
    a=[]
    for j in range(5):
        s=np.split(split1[j],5)
        a.append(s)

embedding_path = "../crawl-300d-2M.vec"

embed_size = 300
max_features = 20000

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [6]:
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(Train_y.values.reshape(-1, 1))

In [11]:
input_size = 10
embedding_size = 10
input_length = 30
filters = 10

In [19]:
inputdict = {}
grudict = {}
Encoderdict = {}
embeddict = {}
shapelist = (int(64//64),)

inputdict['input1'] = Input(shape=shapelist, dtype='int32')
embeddict['embed'] = Embedding(input_dim=input_size, output_dim=embedding_size, input_length=input_length)(inputdict['input1'])
grudict['gru1'] = GRU(filters,recurrent_activation='sigmoid',activation=None,return_sequences=False)(embeddict['embed'])
grudict['gru1'] =Dropout(0.5)(grudict['gru1'])
Encoderdict['Encoder1'] = Model(inputdict['input1'], grudict['gru1'])


for i in range(2,10):
    
    shapelist = (5,) + shapelist
    inputname = 'input'+str(i)
    Encodernameprev = 'Encoder'+ str(i-1)
    Encodername = 'Encoder'+str(i)
    embedname = 'embed'+str(i)
    gruname = 'gru'+str(i)
    
    inputdict[inputname] = Input(shape=shapelist, dtype='int32')
    embeddict[embedname] = TimeDistributed(Encoderdict[Encodernameprev])(inputdict[inputname])
    grudict[gruname] = GRU(filters,recurrent_activation='sigmoid',activation=None,return_sequences=False)(embeddict[embedname])
    grudict[gruname] =Dropout(0.5)(grudict[gruname])
    Encoderdict[Encodername] = Model(inputdict[inputname],grudict[gruname])

In [20]:
preds = Dense(5, activation='softmax')(grudict['gru3'])
model = Model(inputdict['input3'], preds)

# print(Encoder1.summary())
# print(Encoder2.summary())
print(model.summary())

from keras.optimizers import Adam
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_40 (InputLayer)        (None, 5, 5, 1)           0         
_________________________________________________________________
time_distributed_37 (TimeDis (None, 5, 10)             1360      
_________________________________________________________________
gru_39 (GRU)                 (None, 10)                630       
_________________________________________________________________
dropout_39 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 55        
Total params: 2,045
Trainable params: 2,045
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
savebestmodel = 'SRNN(10).h5'
checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
earlyStopping=EarlyStopping(monitor='val_acc', patience=10, verbose=1, mode='max')

In [22]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(
            self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        _val_recall = recall_score(val_targ, val_predict, average='weighted')
        _val_precision = precision_score(
            val_targ, val_predict, average='weighted')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(' — val_f1: %f — val_precision: %f — val_recall %f' %
              (_val_f1, _val_precision, _val_recall))
        return


metrics = Metrics()


callbacks = [checkpoint, metrics, earlyStopping]

history = model.fit(train_vectorized, y_ohe,
                    epochs=10,
                    batch_size=100,
                    callbacks=callbacks,
                    verbose=1)

ValueError: Error when checking input: expected input_40 to have 4 dimensions, but got array with shape (10000, 562328)

In [27]:
y_ohe

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]])

In [88]:
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    inp = Input(shape = (max_len,))
    x = Embedding(input_dim=input_size, output_dim=embedding_size, input_length=input_length)(inp)
    x1 = SpatialDropout1D(dr)(x)

    x_gru = Bidirectional(CudnnRNN(units, return_sequences = True))(x1)
    x1 = Conv1D(32, kernel_size=3, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x1)
    max_pool1_gru = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(32, kernel_size=2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool3_gru = GlobalAveragePooling1D()(x3)
    max_pool3_gru = GlobalMaxPooling1D()(x3)
    
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences = True))(x1)
    x1 = Conv1D(32, kernel_size=3, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(32, kernel_size=2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)
    
    
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(0.2)(Dense(128,activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(0.2)(Dense(100,activation='relu') (x))
    x = Dense(5, activation = "sigmoid")(x)
    
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 15, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [89]:
model = build_model(lr = 1e-4, lr_d = 0, units = 128, dr = 0.5)

AttributeError: 'int' object has no attribute 'get_config'

In [None]:
pred = model.predict(X_test, batch_size = 1024)

In [None]:
predictions = np.round(np.argmax(pred, axis=1)).astype(int)
sub['Sentiment'] = predictions
sub.to_csv("blend.csv", index=False)