In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import torch
import transformers
import inspect
import time
import logging

from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui
from datetime import datetime
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_constant_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.model_selection import cross_val_score


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


'GeForce RTX 2080 Ti'

# Load data & pre-processing

In [2]:
def preprocessing(df):
    """
    Preprocessing step
    As above dataframe heads, there is a lot of <br /> character 
    """
    df.sentence = df.sentence.str.replace('<br />','')
    df.sentence = df.sentence.str.lower()
    return df

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train = preprocessing(train)
test = preprocessing(test)

print('Train data:\n{}\n\nTest data:\n{}'.format(train.head(5), test.head(5)))

Train data:
                                            sentence  sentiment  polarity
0  this is a very bland and inert production of o...          2         0
1  i've seen this film in avant-premiere at imagi...          7         1
2  revolt of the zombies (2 outta 5 stars) no, th...          4         0
3  may contain minor spoilers.dressed to kill, ha...          7         1
4  (spoilers)i shoulda figured. the dvd didn't ev...          2         0

Test data:
                                            sentence  sentiment  polarity
0  i loved this movie so much. i'm a big fan of a...         10         1
1  the stark, cold landscape of big sky country, ...          9         1
2  this cheapo exploitation flick is some genuine...          2         0
3  this movie has been promoting in everywhere in...          1         0
4  this is a great off-the-wall romantic comedy a...          8         1


In [5]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Embedding, Dense, Dropout, CuDNNGRU, CuDNNLSTM, Bidirectional
from keras.layers import Conv1D, Flatten, Activation, BatchNormalization, MaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical, Sequence


embedded_inputs = Input(shape=(512,768,), 
                        dtype='float32')
conv1 = Conv1D(filters=64, 
               kernel_size=3, 
               activation='relu',
               name='CNN1'
              )(embedded_inputs)
pooling1 = MaxPooling1D(3, name='Pool1')(conv1)
Flat1 = Flatten(name='Flatten1')(pooling1)
Dense1 = Dense(1024, activation='relu', name='Dense1')(Flat1)
main_outputs = Dense(2, activation='sigmoid', name='output')(Dense1)

"""
model = Model(inputs = embedded_inputs, outputs=main_outputs)
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])
"""

model = load_model('bert_cnn64-3.model')

model.summary()



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 512, 768)          0         
_________________________________________________________________
CNN1 (Conv1D)                (None, 510, 64)           147520    
_________________________________________________________________
Pool1 (MaxPooling1D)         (None, 170, 64)           0         
_________________________________________________________________
Flatten1 (Flatten)           (None, 10880)             0         
_________________________________________________________________
Dense1 (Dense)               (None, 1024)              11142144  
_________________________________________________________________
output (Dense)               (None, 2)                 2050      
Total params: 11,291,714
Trai

In [3]:
class BertEmbedding():
    def __init__(self, load_model=None, load_config=None, model='bert-base-uncased', max_len=512, batch_size=6):
        self.pre_trained_model = model
        self.max_len = max_len
        self.batch_size = batch_size
        self.model = BertModel.from_pretrained(self.pre_trained_model)
        self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_model)
        #self.optimizer = AdamW(params = self.model.parameters(), lr=1e-5)
    
    def create_ids(self, sentences):
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) #Disable tokenizer logs, it's really annoy
        input_ids = []
        for sen in tqdm_notebook(sentences, desc="Create Ids"):
            tmp = self.tokenizer.encode(sen)
            input_ids.append(tmp)
        input_ids = pad_sequences(input_ids, 
                                  maxlen=self.max_len, 
                                  dtype='int64', 
                                  truncating='post', 
                                  padding='post')
        return input_ids
    
    def generate(self, inputs):
        test_ids = self.create_ids(inputs)
        test_dataloader = DataLoader(torch.tensor(test_ids), batch_size=self.batch_size)
        
        embedding = []
        self.model.to(device)
        self.model.eval()
        for input_ids in tqdm_notebook(test_dataloader, desc="Generating"):
            with torch.no_grad():
                last_state = self.model(input_ids.to(device))[0]
            last_state = last_state.detach().cpu().numpy()
            embedding.extend(last_state)
        return embedding

bert_embedding = BertEmbedding(batch_size=6)

In [8]:
embedding = bert_embedding.generate(train.sentence)

HBox(children=(IntProgress(value=0, description='Create Ids', max=25000, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Generating', max=250, style=ProgressStyle(description_width='…




In [4]:
test_embedding = bert_embedding.generate(test.sentence.values)

HBox(children=(IntProgress(value=0, description='Create Ids', max=25000, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Generating', max=4167, style=ProgressStyle(description_width=…




In [6]:
class BonzSequence(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array(batch_x), np.array(batch_y)
    
#training_generator = BonzSequence(embedding, to_categorical(train.polarity.values), batch_size=1000)
test_generator = BonzSequence(test_embedding, to_categorical(test.polarity.values), batch_size=1000)

In [33]:
model.fit_generator(
    generator=training_generator, 
    epochs=20,
    verbose=2
)

# model.save('bert_cnn64-3.model')

Epoch 1/20
 - 20s - loss: 0.1386 - accuracy: 0.9503
Epoch 2/20
 - 20s - loss: 0.1125 - accuracy: 0.9618
Epoch 3/20
 - 20s - loss: 0.0822 - accuracy: 0.9769
Epoch 4/20
 - 20s - loss: 0.0606 - accuracy: 0.9853
Epoch 5/20
 - 20s - loss: 0.0494 - accuracy: 0.9887
Epoch 6/20
 - 20s - loss: 0.0332 - accuracy: 0.9951
Epoch 7/20
 - 19s - loss: 0.0207 - accuracy: 0.9978
Epoch 8/20
 - 20s - loss: 0.0144 - accuracy: 0.9991
Epoch 9/20
 - 20s - loss: 0.0105 - accuracy: 0.9994
Epoch 10/20
 - 20s - loss: 0.0068 - accuracy: 0.9996
Epoch 11/20
 - 20s - loss: 0.0050 - accuracy: 0.9998
Epoch 12/20
 - 20s - loss: 0.0038 - accuracy: 0.9999
Epoch 13/20
 - 20s - loss: 0.0030 - accuracy: 1.0000
Epoch 14/20
 - 20s - loss: 0.0023 - accuracy: 1.0000
Epoch 15/20
 - 20s - loss: 0.0019 - accuracy: 1.0000
Epoch 16/20
 - 20s - loss: 0.0016 - accuracy: 1.0000
Epoch 17/20
 - 20s - loss: 0.0013 - accuracy: 1.0000
Epoch 18/20
 - 20s - loss: 0.0012 - accuracy: 1.0000
Epoch 19/20
 - 19s - loss: 0.0010 - accuracy: 1.0000
Ep

<keras.callbacks.callbacks.History at 0x2562f400860>

In [8]:
y = model.predict_generator(
    generator=test_generator,
    verbose=1
)



In [15]:
pred = np.argmax(y, axis=1)
print(classification_report(pred, test.polarity.values))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12502
           1       0.86      0.86      0.86     12498

   micro avg       0.86      0.86      0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [22]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                         train: 33.0 MiB
                          test: 32.9 MiB
                           _14:  1.0 MiB
                test_embedding: 200.0 KiB
                            _7: 195.4 KiB
                             y: 195.4 KiB
                           _17: 195.4 KiB
                          pred: 195.4 KiB
                           _10: 195.4 KiB
                           _16: 195.4 KiB


# TF-IDF

In [10]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(train.sentence)

def NB_model():
    model = MultinomialNB()
    rs = cross_val_score(model, 
                         tfidf,
                         train.polarity,
                         cv=10,
                         scoring='f1_macro'
                        )
    return rs
    
NB_rs = NB_model()

from sklearn.neighbors import KNeighborsClassifier

def KNN_model():
    model = KNeighborsClassifier()
    rs = cross_val_score(
        model,
        tfidf,
        train.polarity,
        cv=10,
        scoring='f1_macro'
    )
    return rs

KNN_rs = KNN_model()

ML_rs = {
    'NB': NB_rs,
    'KNN': KNN_rs
}

ML_rs

{'NB': array([0.8503249 , 0.86712346, 0.86394985, 0.86839071, 0.87356083,
        0.86196287, 0.86422793, 0.86069694, 0.86586606, 0.86996503]),
 'KNN': array([0.77115767, 0.78928929, 0.7716737 , 0.76163096, 0.78165532,
        0.76871059, 0.79583065, 0.77128274, 0.78892033, 0.77502598])}