In [1]:
import re
import os
import codecs
import numpy as np
import pandas as pd
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras import backend as K

Using TensorFlow backend.


## Data

We use the "Sentiment140" dataset as our sample. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and can be used to train a sentiment classifier.

In [2]:
# Hyper-parameters used by the cells below.
max_len = 100            # number of leading characters kept from each tweet before tokenization
batch_size = 32          # default batch size of data_generator
drop_out_rate = 0.5      # rate passed to the Dropout layer of the classification head
learning_rate = 1e-5     # learning rate passed to the Adam optimizer
epochs = 5               # number of epochs passed to model.fit_generator

#Path for Google Research pre-trained model
# The directory is expected to hold the three files joined below
# (model configuration, checkpoint weights and vocabulary).
pretrained_path = 'uncased_L-4_H-512_A-8'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
dict_path = os.path.join(pretrained_path, 'vocab.txt')

In [3]:
#Load "Sentiment140" file in a DataFrame.
# header=None is passed because the column names are supplied explicitly
# through `names`; the file is decoded as latin-1.
sentiment140 = 'sentiment140.csv'
df = pd.read_csv(sentiment140, header=None,
                 names=['polarity', 'ids', 'date', 'flag', 'user', 'text'],
                 encoding='latin-1')

def clean_text(sent):
    '''
    Normalise a raw tweet before tokenization.

    - Remove usernames after "@" (including a mention at the very end of the text)
    - Replace punctuation and every other non-word character with a space
    - Collapse runs of spaces and strip leading/trailing spaces

    sent: raw tweet text (str).
    Returns the cleaned text (str).
    '''
    # A mention is "@" + word characters, terminated by one whitespace
    # character OR the end of the string; requiring whitespace only would
    # leave a trailing "@user" in the text as an ordinary word.
    sent = re.sub(r'@\w*(?:\s|$)', '', sent)
    # Everything that is not a letter, digit or underscore becomes a space.
    sent = re.sub(r'\W', ' ', sent)
    # Collapse the runs of spaces produced by the substitutions above.
    sent = re.sub(r' +', ' ', sent)
    return sent.strip()

# Run clean_text over every raw tweet and keep the result next to the
# original text in the DataFrame.
cleaned_text = df['text'].apply(clean_text)
df['cleaned_text'] = cleaned_text

# Only the cleaned text and its polarity label are needed from here on.
cleaned = df.loc[:, ['cleaned_text', 'polarity']]

# Split by label: polarity 0 marks negative tweets, polarity 4 positive ones.
neg = cleaned.loc[cleaned['polarity'].eq(0)]
pos = cleaned.loc[cleaned['polarity'].eq(4)]

In [4]:
# Preview the first rows of the negative (polarity == 0) subset.
neg.head()

Unnamed: 0,cleaned_text,polarity
0,http twitpic com 2y1zl Awww that s a bummer Yo...,0
1,is upset that he can t update his Facebook by ...,0
2,I dived many times for the ball Managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [5]:
# Preview the first rows of the positive (polarity == 4) subset.
pos.head()

Unnamed: 0,cleaned_text,polarity
800000,I LOVE u guys r the best,4
800001,im meeting up with one of my besties tonight C...,4
800002,Thanks for the Twitter add Sunisa I got to mee...,4
800003,Being sick can be really cheap when it hurts t...,4
800004,he has that effect on everyone,4


In [6]:
#Merge negative and positive samples as (text, label) pairs.
# Labels are re-encoded as 0 (negative) / 1 (positive): the classifier ends in
# a single sigmoid unit trained with binary cross-entropy, which needs targets
# in {0, 1}, so the raw polarity value 4 must not be used as the target.
data = [(sent, 0) for sent in neg['cleaned_text']]
data += [(sent, 1) for sent in pos['cleaned_text']]

#Shuffle dataset
# A dedicated, seeded RandomState makes the train/test split reproducible
# across kernel restarts without touching NumPy's global random state.
split_seed = 42
random_order = list(range(len(data)))
np.random.RandomState(split_seed).shuffle(random_order)

#Split training and testing dataset in 9:1
# Every 10th position of the shuffled order goes to the test set.
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
test_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

## Tokenize

Tokenize the text using the vocabulary file (`vocab.txt`) shipped with the Google Research pre-trained model.

In [7]:
# Build the token -> id mapping from the vocabulary file (one token per line).
# Each token receives the current size of the mapping as its id.
token_dict = {}

with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)
    
class OurTokenizer(Tokenizer):
    '''
    Character-level tokenizer: emits exactly one token per input character.
    '''
    def _tokenize(self, text):
        '''
        Redefine Tokenizer to keep the same length as the original text list.

        Each character of `text` is mapped to one token:
        - the character itself when it is in the vocabulary;
        - otherwise its lower-case form when that is in the vocabulary;
        - '[unused1]' for whitespace characters;
        - '[UNK]' for anything else.

        Returns a list of tokens with one entry per character of `text`.
        '''
        tokens = []
        for c in text:
            if c in self._token_dict:
                tokens.append(c)
            elif c.lower() in self._token_dict:
                # This override replaces the base tokenizer's own text
                # normalisation, so with an uncased vocabulary upper-case
                # letters would otherwise all collapse to '[UNK]'.
                tokens.append(c.lower())
            elif self._is_space(c):
                tokens.append('[unused1]')
            else:
                tokens.append('[UNK]')
        return tokens

tokenizer = OurTokenizer(token_dict)

In [8]:
def seq_padding(X, padding=0):
    '''
    Right-pad every sequence in X to the length of the longest one.

    X: list of sequences (lists or 1-D arrays).
    padding: value used to fill the missing trailing positions (default 0).
    Returns a numpy array stacking the padded sequences.
    '''
    longest = max(len(seq) for seq in X)
    padded = []
    for seq in X:
        shortfall = longest - len(seq)
        if shortfall > 0:
            # Only shorter sequences are extended; full-length ones pass through untouched.
            seq = np.concatenate([seq, [padding] * shortfall])
        padded.append(seq)
    return np.array(padded)

class data_generator:
    '''
    Endless batch generator over a list of (text, label) pairs.

    Each pass reshuffles the samples, encodes every text with the module-level
    `tokenizer`, and yields `([X1, X2], Y)` batches padded with `seq_padding`.
    X1 and X2 are the two outputs of `tokenizer.encode` (presumably token ids
    and segment ids - confirm against keras_bert).
    '''
    def __init__(self, data, batch_size=batch_size):
        self.data = data
        self.batch_size = batch_size
        # Number of batches per pass, counting a final partial batch.
        full_batches, leftover = divmod(len(self.data), self.batch_size)
        self.steps = (full_batches + 1) if leftover != 0 else full_batches

    def __len__(self):
        '''Number of batches produced per pass over the data.'''
        return self.steps

    def __iter__(self):
        '''Yield `([X1, X2], Y)` batches forever, reshuffling before every pass.'''
        while True:
            order = list(range(len(self.data)))
            np.random.shuffle(order)
            last_position = len(order) - 1
            first_inputs, second_inputs, labels = [], [], []
            for position, sample_idx in enumerate(order):
                text, label = self.data[sample_idx][0], self.data[sample_idx][1]
                # Texts are truncated to max_len characters before encoding.
                encoded_first, encoded_second = tokenizer.encode(first=text[:max_len])
                first_inputs.append(encoded_first)
                second_inputs.append(encoded_second)
                labels.append([label])
                # Emit a batch when it is full or when the pass is over.
                if len(first_inputs) == self.batch_size or position == last_position:
                    yield [seq_padding(first_inputs), seq_padding(second_inputs)], seq_padding(labels)
                    first_inputs, second_inputs, labels = [], [], []
                    
# Batch generators over the training and testing splits.
actual = data_generator(train_data)
valid = data_generator(test_data)

#Load pre-trained model
# seq_len=None and trainable=True are passed to the loader; presumably this
# keeps the sequence length variable and lets the pre-trained weights be
# fine-tuned - confirm against the keras_bert documentation.
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None, trainable=True)

# Two variable-length inputs, matching the [X1, X2] pair yielded by data_generator.
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
 
x = bert_model([x1_in, x2_in])
# Keep only the output at sequence position 0 of every sample
# (presumably the [CLS] position added by tokenizer.encode - confirm).
x = Lambda(lambda x: x[:, 0])(x)
x = Dropout(drop_out_rate)(x)
# Single sigmoid unit: the model outputs one value in (0, 1) per sample.
p = Dense(1, activation='sigmoid')(x)
 
model = Model([x1_in, x2_in], p)
# NOTE(review): a sigmoid output trained with binary_crossentropy presumes
# 0/1 targets - confirm the label encoding fed by the generators.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate),
    metrics=['accuracy']
)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 512)    28500992    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 512)          0           model_2[1][0]              

In [None]:
# Train on the endless training-batch stream and validate on the test-batch
# stream; len() of each generator gives the number of batches in one pass.
model.fit_generator(
    iter(actual),
    steps_per_epoch=len(actual),
    epochs=epochs,
    validation_data=iter(valid),
    validation_steps=len(valid)
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
