In [85]:
# Show the installed Keras version so readers know which release produced the outputs below.
import keras
keras.__version__

'2.1.2'

In [86]:
import os

# Locate the training CSV relative to the directory the notebook is run from.
home_dir = os.getcwd()
fname = os.path.join(home_dir, 'data', 'train_data.csv')

# Read the whole file at once; the context manager closes the handle even
# if read() raises.
with open(fname) as f:
    data = f.read()

# The first row is the column header; every remaining row is one record.
lines = data.split('\n')
header = lines[0].split(',')
lines = lines[1:]
# A file that ends with '\n' yields one trailing empty string. Drop only that
# empty entry, so a file without a final newline keeps its last record.
if lines and lines[-1] == '':
    lines = lines[:-1]

print(header)
print(len(lines))

['sentiment', 'text']
5250


### Exploring the dataset


In [87]:
# Per-class tweet counts; index 0..4 corresponds to sentiment label 1..5.
sents = [0,0,0,0,0]
labels = []
texts = []
for line in lines:
    # Split on the FIRST comma only: the first field is the sentiment and
    # everything after it is the tweet. Splitting on every comma would cut
    # off any tweet that itself contains a comma.
    # NOTE(review): if the file uses CSV quoting for the text column, the
    # `csv` module would be the right parser - confirm against the data.
    fields = line.split(',', 1)
    sent = int(fields[0])
    tweet = fields[1]
    sents[sent - 1] += 1
    texts.append(tweet)
    labels.append(sent - 1)  # data is labeled 1-5, so we shift by 1 so it starts with 0

print(sents)
print(len(labels))
print(len(texts))

print("A sample tweet: " + texts[6] )
print("Has sentiment: " + str(labels[6]))

[76, 473, 3287, 1062, 352]
5250
5250
A sample tweet: Autonomous vehicles could reduce traffic fatalities by 90%...I'm in!
Has sentiment: 4


### Tokenize the data

We first tokenize the tweets into sequences of integer word indices and prepare a training and validation split.

In [88]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# --- Settings for tokenization and the train/validation split ---
# Intended cut-off of 20 words per tweet (most are shorter than this anyway).
# Note: maxlen is defined here but not referenced by the code in this cell.
maxlen = 20
training_samples = 4000
validation_samples = len(labels) - training_samples
max_words = 10000

# Build the vocabulary from the tweets, then map each tweet to a list of
# integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Positional split: the first `training_samples` rows train the model, the
# following `validation_samples` rows validate it.
split_end = training_samples + validation_samples
x_train, y_train = sequences[:training_samples], labels[:training_samples]
x_val, y_val = sequences[training_samples:split_end], labels[training_samples:split_end]
y_train[0]

Found 10982 unique tokens.


4

### Vectorize the data


In [89]:
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: sized iterable whose items are lists of integer indices,
            each index smaller than ``dimension``.
        dimension: number of columns in the returned matrix.

    Returns:
        A float array of shape ``(len(sequences), dimension)`` in which
        ``out[i, j]`` is 1 when index ``j`` appears in ``sequences[i]`` and
        0 otherwise (repeated indices still yield 1).
    """
    shape = (len(sequences), dimension)
    encoded = np.zeros(shape)
    for row, token_ids in enumerate(sequences):
        # Fancy indexing flips every listed column of this row in one step.
        encoded[row, token_ids] = 1
    return encoded


# Our vectorized training data. It is built from the untouched `sequences`
# list rather than from x_train itself, so re-running this cell gives the same
# result instead of trying to vectorize an already-vectorized array. The
# width is tied to the tokenizer's `max_words` vocabulary limit.
x_train = vectorize_sequences(sequences[:training_samples], dimension=max_words)
# Our vectorized validation data
x_val = vectorize_sequences(sequences[training_samples: training_samples + validation_samples], dimension=max_words)
        

In [90]:
# Inspect the first vectorized training sample.
x_train[0]

array([ 0.,  0.,  1., ...,  0.,  0.,  0.])

We one-hot encode the labels using a Keras convenience function.

In [91]:
from keras.utils.np_utils import to_categorical

# One-hot encode from the original integer `labels` list (not from y_train /
# y_val themselves) so that re-running this cell does not encode arrays that
# are already one-hot.
# num_classes is fixed explicitly: otherwise the width is inferred from the
# largest label present in each split, and the training and validation
# matrices could end up with different numbers of columns.
n_sentiment_classes = len(sents)  # one counter per sentiment class
y_train = to_categorical(labels[:training_samples], num_classes=n_sentiment_classes)
y_val = to_categorical(labels[training_samples: training_samples + validation_samples], num_classes=n_sentiment_classes)

In [96]:
from keras import models
from keras import layers

# Small fully connected classifier over the 10000-wide multi-hot tweet
# vectors: two hidden ReLU layers with dropout after the first, and a 5-way
# softmax output (one unit per sentiment class).
model = models.Sequential([
    layers.Dense(32, activation='relu', input_shape=(10000,)),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

In [97]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 32)                320032    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_12 (Dense)             (None, 5)                 85        
Total params: 320,645
Trainable params: 320,645
Non-trainable params: 0
_________________________________________________________________


In [98]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss
# (the labels are one-hot vectors), and accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [99]:
# Train for 30 epochs in mini-batches of 128, evaluating on the validation
# split after every epoch; the returned object keeps the per-epoch metrics.
history = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=128,
    validation_data=(x_val, y_val)
)

Train on 4000 samples, validate on 1250 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x axis

acc = history.history['acc']
val_acc = history.history['val_acc']

# Training accuracy as dots, validation accuracy as a solid line.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # the plotted curves are accuracy, not loss
plt.legend()

plt.show()