In [53]:
import re
import os
import random
import numpy as np

from collections import namedtuple

# Stanford Sentiment Treebank - movie reviews with fine-grained labels

In [202]:
# Stanford Sentiment Treebank - movie reviews with fine-grained labels
# https://nlp.stanford.edu/sentiment/
#
# This cell reads three files from the local "ST/" directory and fills:
#   sentences : id (str) -> sentence text         (datasetSentences.txt, tab separated)
#   scores    : id (str) -> score as stripped str (sentiment_labels.txt, '|' separated)
#   train/dev/test : lists of ids                 (datasetSplit.txt, ',' separated)

# Record types for (id, sentence) and (id, score) pairs; not used by the
# loading loop below, kept because other cells may rely on them.
ST_sentence = namedtuple("Stanford_Sentiment", "id sentence")
ST_score = namedtuple("Stanford_Sentiment", "id score")

sentences = dict()
scores = dict()
train = []
dev = []
test = []

# split label found in datasetSplit.txt -> list that collects the matching ids
# (1 = train, 2 = test, 3 = dev; any other label is ignored)
split_lists = {1: train, 2: test, 3: dev}

for filename in ['datasetSentences.txt','datasetSplit.txt', 'sentiment_labels.txt']:
    # Explicit encoding so the result does not depend on the platform default.
    with open("ST/"+filename, 'r', encoding='utf-8') as f_input:
        for line in f_input:

            # skip blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # skip headers
            if line.startswith(("sentence_index", 'phrase id')):
                continue

            # load sentences
            if filename == 'datasetSentences.txt':
                sent_id, sentence = line.split('\t', 1)
                sentences[sent_id] = sentence.strip()

            # load splits
            elif filename == 'datasetSplit.txt':
                sent_id, split = line.split(',', 1)
                split_list = split_lists.get(int(split.strip()))
                if split_list is not None:
                    split_list.append(sent_id)

            # load scores
            # NOTE(review): the header of sentiment_labels.txt starts with
            # 'phrase id', and this file yields far more entries than there are
            # sentences, so these keys look like *phrase* ids rather than the
            # sentence indexes used by the two other files. Looking a score up
            # by sentence index may therefore return the score of an unrelated
            # phrase - confirm against the dataset's dictionary.txt.
            elif filename == 'sentiment_labels.txt':
                sent_id, sent_score = line.split('|', 1)
                sent_score = sent_score.strip()
                float(sent_score)  # fail fast on a malformed score; the string is what gets stored
                scores[sent_id] = sent_score

# Samples and Classes/Labels

In [203]:
# Report the overall corpus sizes first, then the size of every split.
for caption, collection in (("Total Nr. Samples: {}", sentences),
                            ("Total Nr. Scores : {}", scores)):
    print(caption.format(len(collection)))
print()
for caption, collection in (("Train  : {}", train),
                            ("Dev    : {}", dev),
                            ("Test   : {}", test)):
    print(caption.format(len(collection)))

Total Nr. Samples: 11855
Total Nr. Scores : 239232

Train  : 8544
Dev    : 1101
Test   : 2210


In [204]:
# Build, for every split, two parallel lists: the sentences (x) and the raw
# scores (y), both looked up by the ids stored in train / dev / test.
def _lookup(table, ids):
    """Return table[i] for every i in ids, preserving the order of ids."""
    return [table[i] for i in ids]

x_train_data = _lookup(sentences, train)
y_train_data = _lookup(scores, train)

x_dev_data = _lookup(sentences, dev)
y_dev_data = _lookup(scores, dev)

x_test_data = _lookup(sentences, test)
y_test_data = _lookup(scores, test)

In [205]:
# Fit the tokenizer on the training sentences, then turn each training
# sentence into its sequence of word indexes.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_data)
word_index = tokenizer.word_index
sequences_train = tokenizer.texts_to_sequences(x_train_data)
vocabulary_size = len(word_index)
print('Found %s unique tokens.' % vocabulary_size)

Found 15337 unique tokens.


In [206]:
# length of the longest training sequence, needed for padding
max_input_lenght = max(map(len, sequences_train))
max_input_lenght

49

In [207]:
# Bring every sequence of indexes to 'max_input_lenght' entries, using the
# same 'post' padding / 'post' truncating settings for all three splits.
def _pad_post(index_sequences):
    """Apply pad_sequences with the settings shared by train, dev and test."""
    return pad_sequences(index_sequences, maxlen=max_input_lenght, padding='post', truncating='post')

x_train_data_padded = _pad_post(sequences_train)
x_dev_data_padded = _pad_post(tokenizer.texts_to_sequences(x_dev_data))
x_test_data_padded = _pad_post(tokenizer.texts_to_sequences(x_test_data))

## Transform scores to classes as re-labeled by Socher et al. (2013)
- 0.0 - 0.2 : very negative
- 0.2 - 0.4 : negative
- 0.4 - 0.6 : neutral
- 0.6 - 0.8 : positive
- 0.8 - 1.0 : very positive

In [208]:
def convert_to_categories(y_data):
    """Map sentiment scores in [0, 1] to one of five sentiment class names.

    The bins follow the cut-offs published with the Stanford Sentiment
    Treebank: [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0],
    i.e. a score that falls exactly on a cut-off belongs to the lower class.

    Parameters
    ----------
    y_data : iterable
        Scores as floats or as anything ``float()`` accepts (e.g. strings).

    Returns
    -------
    list of str
        One label per input score, in input order, taken from
        'very_negative', 'negative', 'neutral', 'positive', 'very positive'.

    Raises
    ------
    ValueError
        If a score cannot be parsed as a float, is NaN, or lies outside
        [0, 1]. Such a score is never silently dropped, because that would
        shift every following label relative to its input sentence.
    """
    # (inclusive upper bound, label), ordered from the lowest to the highest bin.
    # The label strings are kept exactly as they always were (including the
    # space in 'very positive') so that existing encoded classes stay stable.
    cutoffs = (
        (0.2, 'very_negative'),
        (0.4, 'negative'),
        (0.6, 'neutral'),
        (0.8, 'positive'),
        (1.0, 'very positive'),
    )

    y_categories = []
    for raw_score in y_data:
        score = float(raw_score)  # convert once instead of once per comparison
        # written as a negated chained comparison so that NaN is rejected too
        if not 0.0 <= score <= 1.0:
            raise ValueError(
                "sentiment score outside [0, 1]: {!r}".format(raw_score))
        for upper_bound, label in cutoffs:
            if score <= upper_bound:
                y_categories.append(label)
                break

    return y_categories

In [209]:
# Convert from scores to categories
y_train_data_categ = convert_to_categories(y_train_data)
y_dev_data_categ = convert_to_categories(y_dev_data)
y_test_data_categ = convert_to_categories(y_test_data)

# Encode the labels as integers; the encoder is fitted on the training
# labels only and then reused for dev and test.
le = LabelEncoder()
le.fit(y_train_data_categ)

labels_encoded_train = le.transform(y_train_data_categ)
labels_encoded_dev = le.transform(y_dev_data_categ)
labels_encoded_test = le.transform(y_test_data_categ)

# One-hot encode: each label becomes a vector with dim = num. of possible labels.
# The width is pinned to the number of classes known to the encoder; with
# num_classes=None it would be inferred separately from each split, so a split
# that happens to lack the highest class index would get a narrower matrix.
num_label_classes = len(le.classes_)
categorical_labels_train = to_categorical(labels_encoded_train, num_classes=num_label_classes)
categorical_labels_dev = to_categorical(labels_encoded_dev, num_classes=num_label_classes)
categorical_labels_test = to_categorical(labels_encoded_test, num_classes=num_label_classes)

In [210]:
# shapes of the padded training inputs and of their one-hot labels
for array in (x_train_data_padded, categorical_labels_train):
    print(array.shape)

(8544, 49)
(8544, 5)


In [211]:
# shapes of the padded dev inputs and of their one-hot labels
for array in (x_dev_data_padded, categorical_labels_dev):
    print(array.shape)

(1101, 49)
(1101, 5)


In [212]:
# shapes of the padded test inputs and of their one-hot labels
for array in (x_test_data_padded, categorical_labels_test):
    print(array.shape)

(2210, 49)
(2210, 5)
