In [80]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [65]:
# --- Tokenizer / padding configuration ---------------------------------
max_features = 20000  # vocabulary cap, passed to the tokenizer as num_words
maxlen = 100          # sequence length, passed to pad_sequences as maxlen
# (the embedding width is chosen inside get_model())

# --- Load the labelled comments and shuffle them reproducibly ----------
train = pd.read_csv(
    "mergedDataSet.csv",
    usecols=['comment_text', 'merged_rating'],
).sample(frac=1, random_state=42)

# Missing comments are replaced by the literal string "None" so that the
# tokenizer only ever sees text values.
list_sentences_train = train["comment_text"].fillna("None").values

# --- Labels: one-hot encode the single rating column -------------------
list_classes = ['merged_rating']
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output`; confirm against the installed version before upgrading.
one_hot_encoder = OneHotEncoder(sparse=False)
y = one_hot_encoder.fit_transform(train[list_classes].values)

# --- Features: comment text -> integer sequences -> fixed-length matrix -
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)


In [81]:
def get_model(embed_size=128, rnn_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=3):
    """Build and compile the bidirectional-GRU text classifier.

    Layer stack: Embedding -> Bidirectional(GRU, return_sequences=True)
    -> GlobalMaxPool1D -> Dropout -> Dense(relu) -> Dropout
    -> Dense(softmax).

    The input length and the embedding vocabulary size are read from the
    module-level ``maxlen`` and ``max_features`` respectively, so those
    must be defined before this function is called.

    Calling ``get_model()`` with no arguments builds exactly the network
    that was previously hard-coded.

    Parameters
    ----------
    embed_size : int
        Dimension of each word-embedding vector.
    rnn_units : int
        Units per GRU direction (the bidirectional output is twice this).
    dense_units : int
        Width of the hidden fully connected layer.
    dropout_rate : float
        Dropout rate applied after pooling and after the hidden layer.
    num_classes : int
        Number of softmax outputs; should equal the number of columns in
        the one-hot encoded target passed to ``fit``.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps the per-timestep outputs so the pooling
    # layer below can take the maximum over the sequence axis.
    x = Bidirectional(GRU(rnn_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(num_classes, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [82]:
# Training hyper-parameters for this run.
batch_size = 32
epochs = 2

# Build a fresh model and fit it; validation_split=0.2 makes Keras hold
# out 20% of the supplied samples for validation.
model = get_model()
model.fit(
    x=X_t,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Train on 128491 samples, validate on 32123 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1e7350319e8>

In [83]:
# Persist the fitted model to the relative path "GRU.h5".
model.save("GRU.h5")

In [76]:
# Print the layer-by-layer summary (output shapes and parameter counts).
# NOTE(review): this cell's execution count (76) is older than the
# model-building/training cells (81-82), so the captured output may
# describe an earlier model instance -- re-run after training to confirm.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 100)          71600     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
__________