## Twitter Sentiment Analysis with Gensim Word2Vec and Keras Convolutional Network
Giuseppe Bonaccorso (https://www.bonaccorso.eu)

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import dill
import keras.backend as K
import multiprocessing
import tensorflow as tf

from gensim.models.word2vec import Word2Vec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

Using TensorFlow backend.


In [3]:
# Set random seed (for reproducibility)
np.random.seed(1000)

In [4]:
# Select whether using Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(), 
                        inter_op_parallelism_threads=multiprocessing.cpu_count(), 
                        allow_soft_placement=True, 
                        device_count = {'CPU' : 1, 
                                        'GPU' : 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)

Download the dataset from: http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [5]:
dataset_location = './dataset.csv'
model_location = './model/'

### Parse tweet corpus and sentiments

In [6]:
corpus = []
labels = []

In [8]:
with open(dataset_location, 'r', encoding='utf-8') as df:
    for i, line in enumerate(df):
        if i == 0:
            # Skip the header
            continue

        parts = line.strip().split(',')
        
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(int(parts[1].strip()))
        
        # Tweet
        tweet = parts[3].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[::-1]
        
        corpus.append(tweet.strip().lower())
        
print('Corpus size: {}'.format(len(corpus)))

Corpus size: 1578627


### Tokenize and remove stopwords

In [9]:
tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()

In [10]:
tokenized_corpus = []

for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('@')]
    tokenized_corpus.append(tokens)

#### Save tokenized corpus

In [None]:
with open(model_location + 'tokenized_corpus.dill', 'wb') as f:
    dill.dump(tokenized_corpus, f)

#### Load tokenized corpus

In [None]:
with open(model_location + 'tokenized_corpus.dill', 'rb') as f:
    tokenized_corpus = dill.load(f)

### Gensim Word2Vec model

In [11]:
vector_size = 512
window_size = 10

In [12]:
# Create Word2Vec
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size, 
                    window=window_size, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

#### Save Word2Vec model

In [None]:
word2vec.save(model_location + 'word2vec.model')

#### Load Word2Vec model

In [None]:
word2vec = Word2Vec.load(model_location + 'word2vec.model')

### Copy word vectors and delete Word2Vec model  and original corpus to save memory

In [13]:
X_vecs = word2vec.wv

del word2vec
del corpus

#### Train subset size (0 < size < len(tokenized_corpus))

In [14]:
train_size = 1000000

#### Test subset size (0 < size < len(tokenized_corpus) - train_size)

In [15]:
test_size = 100000

#### Compute average and max tweet length

In [16]:
avg_length = 0.0
max_length = 0

for tweet in tokenized_corpus:
    if len(tweet) > max_length:
        max_length = len(tweet)
    avg_length += float(len(tweet))
    
print('Average tweet length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max tweet length: {}'.format(max_length))

Average tweet length: 11.0288776259
Max tweet length: 53


#### Tweet max length (number of tokens)

In [17]:
max_tweet_length = 15

### Create train and test sets

In [18]:
# Generate random indexes
indexes = np.random.choice(len(tokenized_corpus), train_size + test_size, replace=False)

X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_train = np.zeros((train_size, 2), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_test = np.zeros((test_size, 2), dtype=np.int32)

for i, index in enumerate(indexes):
    for t, token in enumerate(tokenized_corpus[index]):
        if t >= max_tweet_length:
            break
        
        if token not in X_vecs:
            continue
    
        if i < train_size:
            X_train[i, t, :] = X_vecs[token]
        else:
            X_test[i - train_size, t, :] = X_vecs[token]
            
    if i < train_size:
        Y_train[i, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    else:
        Y_test[i - train_size, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]

### Keras Convolutional model

In [19]:
batch_size = 32
nb_epochs = 100

In [20]:
model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_tweet_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))

In [21]:
# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

In [25]:
# Fit the model
model.fit(X_train, Y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(X_test, Y_test),
          callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])

Train on 1000000 samples, validate on 100000 samples
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x1207dc6d0>