In [2]:
import os
import random
import numpy as np

from collections import namedtuple

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Stanford Sentiment Treebank - movie reviews with fine-grained labels

In [3]:
# Stanford Sentiment Treebank - movie reviews with fine-grained labels
# https://nlp.stanford.edu/sentiment/

ST_sentence = namedtuple("Stanford_Sentiment", "id sentence")
ST_score = namedtuple("Stanford_Sentiment", "id score")

sentences = dict()
scores = dict()
train = []
dev = []
test = []

for filename in ['datasetSentences.txt','datasetSplit.txt', 'sentiment_labels.txt']:
    with open("ST/"+filename,'r') as f_input:
        for line in f_input:
            
            # skip headers
            if line.startswith("sentence_index") or line.startswith('phrase id'):
                continue
            
            # load sentences
            if filename=='datasetSentences.txt':                                
                sent_id, sentence = line.split('\t', 1)
                sentences[sent_id] = sentence.strip()
            
            # load splits
            if filename=='datasetSplit.txt':
                sent_id, split = line.split(',', 1)
                split = int(split.strip())
                if split == 1:                    
                    train.append(sent_id)
                if split == 2:                                        
                    test.append(sent_id)
                if split == 3:
                    dev.append(sent_id)
            
            # sentences_id        
            if filename=='sentiment_labels.txt':                    
                sent_id, sent_score = line.split('|', 1)
                #sent_score = float(sent_score.strip())
                sample = ST_score(sent_id, float(sent_score.strip()))
                scores[sent_id] = sent_score.strip()

# Samples and Classes/Labels

In [4]:
print("Total Nr. Samples: {}".format(len(sentences)))
print("Total Nr. Scores : {}".format(len(scores)))
print()
print("Train  : {}".format(len(train)))
print("Dev    : {}".format(len(dev)))
print("Test   : {}".format(len(test)))

Total Nr. Samples: 11855
Total Nr. Scores : 239232

Train  : 8544
Dev    : 1101
Test   : 2210


In [5]:
# built two lists with sentences and labels
x_train_data = [sentences[x] for x in train]
y_train_data = [scores[x] for x in train]

x_dev_data = [sentences[x] for x in dev]
y_dev_data = [scores[x] for x in dev]

x_test_data = [sentences[x] for x in test]
y_test_data = [scores[x] for x in test]

In [6]:
# convert list of tokens/words to indexes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_data)
sequences_train = tokenizer.texts_to_sequences(x_train_data)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 15337 unique tokens.


In [7]:
# get the max sentence lenght, needed for padding
max_input_lenght = max([len(x) for x in sequences_train])
max_input_lenght

49

In [8]:
# pad all the sequences of indexes to the 'max_input_lenght'
x_train_data_padded = pad_sequences(sequences_train, maxlen=max_input_lenght, padding='post', truncating='post')
x_dev_data_padded = pad_sequences(tokenizer.texts_to_sequences(x_dev_data), maxlen=max_input_lenght, padding='post', truncating='post')
x_test_data_padded = pad_sequences(tokenizer.texts_to_sequences(x_test_data), maxlen=max_input_lenght, padding='post', truncating='post')

## Transform scores to classes as re-labeled by Socher et al. (2013)
- 0 - 2.0    : very negative
- 2.0 - 4.0  : negative
- 4.0 - 6.0  : neutral
- 6.0 - 8.0  : negative
- 8.0 - 10.0 : very positive

In [9]:
def convert_to_categories(y_data):
    y_categories = []
    for score in y_data:
        if 0.0<=float(score)<0.2:
            y_categories.append('very_negative')
        elif 0.2<=float(score)<0.4:
            y_categories.append('negative')
        elif 0.4<=float(score)<0.6:
            y_categories.append('neutral')
        elif 0.6<=float(score)<0.8:
            y_categories.append('positive')
        elif 0.8<=float(score)<=1.0:
            y_categories.append('very positive')
            
    return y_categories

In [10]:
# Convert from scores to categories
y_train_data_categ = convert_to_categories(y_train_data)
y_dev_data_categ = convert_to_categories(y_dev_data)
y_test_data_categ = convert_to_categories(y_test_data)

# Encode the labels, each must be a vector with dim = num. of possible labels
le = LabelEncoder()
le.fit(y_train_data_categ)

labels_encoded_train = le.transform(y_train_data_categ)
labels_encoded_dev = le.transform(y_dev_data_categ)
labels_encoded_test = le.transform(y_test_data_categ)

categorical_labels_train = to_categorical(labels_encoded_train, num_classes=None)
categorical_labels_dev = to_categorical(labels_encoded_dev, num_classes=None)
categorical_labels_test = to_categorical(labels_encoded_test, num_classes=None)

In [11]:
print(x_train_data_padded.shape)
print(categorical_labels_train.shape)

(8544, 49)
(8544, 5)


In [12]:
print(x_dev_data_padded.shape)
print(categorical_labels_dev.shape)

(1101, 49)
(1101, 5)


In [13]:
print(x_test_data_padded.shape)
print(categorical_labels_test.shape)

(2210, 49)
(2210, 5)


In [14]:
from convnets_utils import *

# CNN with random word embeddings

In [15]:
model_1 = get_cnn_rand(200, len(word_index)+1, max_input_lenght, 5, loss='categorical_crossentropy')

In [None]:
history = model_1.fit(x=x_train_data_padded, y=categorical_labels_train,
                      validation_data=(x_dev_data_padded, categorical_labels_dev),
                      batch_size=50, epochs=15)

Train on 8544 samples, validate on 1101 samples
Epoch 1/15

In [None]:
loss, accuracy = model_1.evaluate(x_test_data_padded, categorical_labels_test, verbose=0)
accuracy

In [None]:
raw_predictions = model_1.predict(x_test_data_padded)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test_data_categ, le.inverse_transform(class_predictions)))

# CNN with pre-trained static word embeddings

In [35]:
embeddings_index = load_fasttext_embeddings()
embeddings_matrix = create_embeddings_matrix(embeddings_index, word_index, 100)

embedding_layer_static = get_embeddings_layer(embeddings_matrix, 
                                              'embedding_layer_static', 
                                              max_input_lenght, 
                                              trainable=False)

model_2 = get_cnn_pre_trained_embeddings(embedding_layer_static, 
                                         max_input_lenght, 
                                         5, 
                                         loss='categorical_crossentropy')

Loaded 400000 word vectors.
Matrix shape: (15338, 100)


In [36]:
history = model_2.fit(x=x_train_data_padded, 
                      validation_data=(x_dev_data_padded, categorical_labels_dev),
                      y=categorical_labels_train, batch_size=16, epochs=5)

Train on 8544 samples, validate on 1101 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [37]:
loss, accuracy = model_2.evaluate(x_test_data_padded, categorical_labels_test, verbose=0)
accuracy

0.4941176474094391

In [38]:
raw_predictions = model_2.predict(x_test_data_padded)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test_data_categ, le.inverse_transform(class_predictions)))

               precision    recall  f1-score   support

     negative       0.23      0.02      0.03       405
      neutral       0.52      0.92      0.66      1155
     positive       0.20      0.05      0.08       424
very positive       0.07      0.01      0.02       112
very_negative       0.10      0.02      0.03       114

     accuracy                           0.49      2210
    macro avg       0.22      0.20      0.16      2210
 weighted avg       0.36      0.49      0.37      2210



# CNN with pre-trained dynamic word embeddings

In [34]:
embedding_layer_dynamic = get_embeddings_layer(embeddings_matrix, 'embedding_layer_dynamic', 
                                               max_input_lenght, trainable=True)

model_3 = get_cnn_pre_trained_embeddings(embedding_layer_dynamic, max_input_lenght, 5, 
                                         loss='categorical_crossentropy')

In [35]:
history = model_3.fit(x=x_train_data_padded, y=categorical_labels_train, batch_size=50, epochs=5)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
loss, accuracy = model_3.evaluate(x_test_data_padded, categorical_labels_test, verbose=0)
accuracy

0.483257919549942

In [37]:
raw_predictions = model_3.predict(x_test_data_padded)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test_data_categ, le.inverse_transform(class_predictions)))

               precision    recall  f1-score   support

     negative       0.20      0.07      0.10       405
      neutral       0.52      0.88      0.65      1155
     positive       0.23      0.06      0.10       424
very positive       0.00      0.00      0.00       112
very_negative       0.00      0.00      0.00       114

     accuracy                           0.48      2210
    macro avg       0.19      0.20      0.17      2210
 weighted avg       0.35      0.48      0.38      2210



# CNN multichanell with pre-trained dynamic and static word embeddings

In [21]:
model_4 = get_cnn_multichannel(embedding_layer_static, embedding_layer_dynamic, max_input_lenght, 5,
                               loss='categorical_crossentropy')

NameError: name 'embedding_layer_static' is not defined

In [57]:
history = model_4.fit(x=[x_train_data_padded,x_train_data_padded], y=categorical_labels_train, 
                      batch_size=50, 
                      epochs=5,
                      validation_split=0.33)

Train on 5724 samples, validate on 2820 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [58]:
loss, accuracy = model_4.evaluate(x=[x_test_data_padded,x_test_data_padded], y=categorical_labels_test, verbose=0)
accuracy

0.7857013592353234

In [59]:
raw_predictions = model_4.predict(x=[x_test_data_padded,x_test_data_padded])
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test_data_categ, le.inverse_transform(class_predictions)))

               precision    recall  f1-score   support

     negative       0.17      0.13      0.15       405
      neutral       0.52      0.71      0.60      1155
     positive       0.21      0.14      0.17       424
very positive       0.00      0.00      0.00       112
very_negative       0.00      0.00      0.00       114

    micro avg       0.42      0.42      0.42      2210
    macro avg       0.18      0.20      0.18      2210
 weighted avg       0.34      0.42      0.37      2210

