In [83]:
# Importing libraries
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D, Conv3D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Our dictionary will contain only of the top 7000 words appearing most frequently

# Now we split our data-set into training and test data

# Looking at the nature of training data

In [6]:
import imports as ii
import functions as f
import preprocessing as pp
import neuralnetworks as nn
import trainRNN as trainRNN

In [92]:
PATH = {}
PATH["dataset_classification"] = "dataset/classification/"
PATH["dataset_labeling"] = "dataset/seq_labeling/"
PATH["music_reviews_train"] = PATH["dataset_classification"] + "music_reviews_train.json.gz"
PATH["music_reviews_dev"] = PATH["dataset_classification"] + "music_reviews_dev.json.gz"
PATH["music_reviews_test"] = PATH["dataset_classification"] + "music_reviews_test.json.gz"
train = f.readJson(PATH["music_reviews_train"])
test = f.readJson(PATH["music_reviews_dev"])
test_true = f.readJson(PATH["music_reviews_test"])

Number of data:  100000
Number of data:  10000
Number of data:  10000


In [105]:
X_train, y_train, train_idx, train_missing_idx = f.json_divide(train)
X_dev, y_dev, test_idx, test_missing_idx = f.json_divide(test)
X_test, y_test, test_idx, test_missing_idx = f.json_divide(test)

In [106]:
combination = [1, 1, 1, 1, 1, 1]
combination = [1, 1, 0, 1, 1, 0]
data_sets, y_train, y_test = f.grid_search_retrain(X_train, X_test, y_train, y_test, combination)

In [107]:
X_train = data_sets[0][1]
X_test = data_sets[0][2]

In [108]:
tokenizer = pp.tokenizer_init(X_train, X_test, X_test)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [109]:
vocabulary_size = len(tokenizer.word_counts)

In [110]:
# Padding the data samples to a maximum review length in words
max_words = 70
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
# Building the CNN Model
model = Sequential()      # initilaizing the Sequential nature for CNN model
# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(vocabulary_size+1, 32, input_length=max_words))
model.add(Conv1D(32, 1, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [111]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 70, 32)            2244480   
                                                                 
 conv1d_9 (Conv1D)           (None, 70, 32)            1056      
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 35, 32)           0         
 1D)                                                             
                                                                 
 flatten_9 (Flatten)         (None, 1120)              0         
                                                                 
 dense_18 (Dense)            (None, 250)               280250    
                                                                 
 dense_19 (Dense)            (None, 1)                 251       
                                                     

In [112]:
sent_dict = {"positive": 1, "negative": 0}
y_train = pp.sentiment_converter(y_train, sent_dict)
y_dev = pp.sentiment_converter(y_dev, sent_dict)
y_test = pp.sentiment_converter(y_test, sent_dict)

In [113]:
import numpy as np
X_test = np.array(X_test)
X_train = np.array(X_train)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [114]:
# Fitting the data onto model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4, batch_size=100, verbose=1)
# Getting score metrics from our model
scores = model.evaluate(X_test, y_test, verbose=1)
# Displays the accuracy of correct sentiment prediction over test data
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 88.79%
