In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import torch
import transformers
import inspect
import time
import logging

from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui
from datetime import datetime
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_constant_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.model_selection import cross_val_score

from sys import getsizeof as gs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


'GeForce RTX 2080 Ti'

# Load data & pre-processing

In [2]:
def preprocessing(df):
    """
    Preprocessing step
    As above dataframe heads, there is a lot of <br /> character 
    """
    df.sentence = df.sentence.str.replace('<br />','')
    df.sentence = df.sentence.str.lower()
    return df

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train = preprocessing(train)
test = preprocessing(test)

print('Train data:\n{}\n\nTest data:\n{}'.format(train.head(5), test.head(5)))

Train data:
                                            sentence  sentiment  polarity
0  this is a very bland and inert production of o...          2         0
1  i've seen this film in avant-premiere at imagi...          7         1
2  revolt of the zombies (2 outta 5 stars) no, th...          4         0
3  may contain minor spoilers.dressed to kill, ha...          7         1
4  (spoilers)i shoulda figured. the dvd didn't ev...          2         0

Test data:
                                            sentence  sentiment  polarity
0  i loved this movie so much. i'm a big fan of a...         10         1
1  the stark, cold landscape of big sky country, ...          9         1
2  this cheapo exploitation flick is some genuine...          2         0
3  this movie has been promoting in everywhere in...          1         0
4  this is a great off-the-wall romantic comedy a...          8         1


# Create Embedding Layer from BERT

In [3]:

class BertEmbedding():
    def __init__(self, load_model=None, load_config=None, model='bert-base-uncased', max_len=512, batch_size=6):
        self.pre_trained_model = model
        self.max_len = max_len
        self.batch_size = batch_size
        self.model = BertModel.from_pretrained(self.pre_trained_model)
        self.model.to(device)
        self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_model)
    
    def create_ids(self, sentences):
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) #Disable tokenizer logs, it's really annoy
        input_ids = []
        for sen in tqdm_notebook(sentences, desc='Ids'):
            tmp = self.tokenizer.encode(sen)
            input_ids.append(tmp)
        input_ids = pad_sequences(input_ids, 
                                  maxlen=self.max_len, 
                                  dtype='int64', 
                                  truncating='post', 
                                  padding='post')
        return torch.tensor(input_ids)
    
    def generate(self, inputs):
        test_ids = self.create_ids(inputs)
        test_dataloader = DataLoader(test_ids, batch_size=self.batch_size)
        embedding = []       
        for input_ids in tqdm_notebook(test_dataloader, desc='Embedding'):
            with torch.no_grad():
                last_state = self.model(input_ids.to(device))[0]
            embedding.extend(last_state.detach().cpu().numpy())
        return embedding

bert_embedding = BertEmbedding(batch_size=200)

#train_embedding = bert_embedding.generate(train.sentence.values)
    
test_embedding = bert_embedding.generate(test.sentence.values)

torch.cuda.empty_cache()



HBox(children=(IntProgress(value=0, description='Ids', max=25000, style=ProgressStyle(description_width='initi…




HBox(children=(IntProgress(value=0, description='Embedding', max=125, style=ProgressStyle(description_width='i…




# Load dataset by sequence

In [4]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Embedding, Dense, Dropout, CuDNNGRU, CuDNNLSTM, Bidirectional, Concatenate
from keras.layers import Conv1D, Flatten, Activation, BatchNormalization, MaxPooling1D, GlobalMaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical, Sequence
from keras.optimizers import Adam

class BonzSequence(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array(batch_x), np.array(batch_y)
    
'''
training_generator = BonzSequence(train_embedding, 
                                  to_categorical(train.polarity.values), 
                                  batch_size=5)
'''

test_generator = BonzSequence(test_embedding, to_categorical(test.polarity.values), batch_size=25)


torch.cuda.empty_cache()

# Create/Load model

In [7]:
"""
sess = tf.Session()
K.set_session(sess)

import resource
class MemoryCallback(Callback):
    def on_epoch_end(self, epoch, log={}):
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
"""


def build_model():
    embedded_inputs = Input(shape=(512,768,), 
                            dtype='float32')
    convs = []
    kernel_size = [2,3,4]
    for k_size in kernel_size:
        conv = Conv1D(filters=768, kernel_size=k_size)(embedded_inputs)
        max_pool = MaxPooling1D(k_size)(conv)
        batch_normal = BatchNormalization()(max_pool)
        dropout = Dropout(0.1)(batch_normal)
        convs.append(dropout)
    merge = Concatenate(axis=1)(convs)
    max_pool_extra = MaxPooling1D(128)(merge)
    flat = Flatten()(max_pool_extra)
    dense1 = Dense(768, activation='relu', name='Dense1')(flat)
    dropout2 = Dropout(0.1)(dense1)
    main_outputs = Dense(2, activation='sigmoid', name='output')(dropout2)

    model = Model(inputs = embedded_inputs, outputs=main_outputs)
    model.compile(loss='binary_crossentropy',
                 optimizer=Adam(learning_rate=1e-5),
                 metrics=['accuracy']
                 #callbacks = [MemoryCallback()]
                 )
    return model

model = build_model()

#model = load_model('bert_cnn128-3.model')

model.summary()


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 512, 768)     0                                            
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 511, 768)     1180416     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 510, 768)     1770240     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 509, 768)     2360064     input_2[0][0]                    
____________________________________________________________________________________________

# Training model

In [8]:
for i in range(10):
    filename = 'bertcnn_3cnn_f768_k234_e'+str(i)+'.model'
    model.fit_generator(
        generator=training_generator, 
        shuffle=True,
        epochs=1,
        verbose=2
    )
    model.save(filename)

Epoch 1/1
 - 129s - loss: 0.5144 - accuracy: 0.7506
Epoch 1/1
 - 129s - loss: 0.3002 - accuracy: 0.8721
Epoch 1/1
 - 130s - loss: 0.2144 - accuracy: 0.9161
Epoch 1/1
 - 129s - loss: 0.1433 - accuracy: 0.9495
Epoch 1/1
 - 129s - loss: 0.0812 - accuracy: 0.9772
Epoch 1/1
 - 130s - loss: 0.0419 - accuracy: 0.9919
Epoch 1/1
 - 130s - loss: 0.0206 - accuracy: 0.9973
Epoch 1/1
 - 130s - loss: 0.0110 - accuracy: 0.9986
Epoch 1/1
 - 131s - loss: 0.0094 - accuracy: 0.9983
Epoch 1/1
 - 131s - loss: 0.0034 - accuracy: 0.9997


In [7]:
for i in range(10):
    filename = 'bertcnn_3cnn_f768_k234_e'+str(i)+'.model'
    model = load_model(filename)
    y = model.predict_generator(
        generator=test_generator,
        verbose=0
    )
    pred = np.argmax(y, axis=1)
    print('Predict file: {}\n\n{}\n-------------------------------\n'.format(filename,
                                                                             classification_report(pred, test.polarity.values, digits=4)))



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Predict file: bertcnn_3cnn_f768_k234_e0.model

              precision    recall  f1-score   support

           0     0.9474    0.7757    0.8530     15266
           1     0.7261    0.9324    0.8164      9734

   micro avg     0.8367    0.8367    0.8367     25000
   macro avg     0.8367    0.8541    0.8347     25000
weighted avg     0.8612    0.8367    0.8387     25000

-------------------------------

Predict file: bertcnn_3cnn_f768_k234_e1.model

              precision    recall  f1-score   support

           0     0.8193    0.9334    0.8726     10972
           1     0.9415    0.8390    0.8873     14028

   micro avg     0.8804    0.8804    0.8804     25000
   macro avg     0.8804    0.8862    0.8800     25000
weighted avg     0.8879    0.8804    0.8808     25000

-------------------------------

Predict file: bertcnn_3cnn_f768_k234_e2.model

              precision    recall  f1-score

In [4]:
import tmunlp as nlp


result = nlp.get_label_term_weighting('tmunlp_file.txt', ['0','1'])
x1 =  nlp.get_keyword('0', result, 50)
x2 =  nlp.get_keyword('1', result, 50)

In [5]:
print(x1)
print(x2)

{'worst': 1706.1483176343, 'bad': 1449.2746364960954, 'waste': 1136.4784421247887, 'awful': 521.4571917637535, 'no': 521.221067047155, 'nothing': 515.4468047933894, 'stupid': 455.32281954525683, 'worse': 449.4413441540029, 'supposed': 443.6013667654656, 'poorly': 434.2027654745826, 'poor': 412.15603143573367, 'even': 401.36642708681666, 'bad.': 400.17435870085217, 'terrible': 383.329121129757, 'minutes': 367.6493424998815, 'boring': 357.52410862298154, 'horrible': 338.31785966231627, 'bad,': 338.1440528186075, 'lame': 320.45889514498947, 'awful.': 307.4845480495725, 'plot': 302.46288289098493, 'just': 291.6127316752045, 'wasted': 281.4846716715197, "don't": 278.8628030503114, 'money': 262.4749268983362, 'pointless': 262.4589064076291, 'badly': 249.60850130337656, 'script': 247.32662970117644, 'redeeming': 246.49952975680648, 'any': 241.97959852023632, 'thing': 237.84714135414515, 'movie': 226.01361227543293, 'acting': 225.076180321601, 'terrible.': 219.12721438235317, 'crap': 214.73818

In [3]:
new_df = train[['polarity', 'sentence']]
new_df.to_csv('tmunlp_file.txt', sep='\t', header=None, index=False)

# TF-IDF

In [10]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(train.sentence)

def NB_model():
    model = MultinomialNB()
    rs = cross_val_score(model, 
                         tfidf,
                         train.polarity,
                         cv=10,
                         scoring='f1_macro'
                        )
    return rs
    
NB_rs = NB_model()

from sklearn.neighbors import KNeighborsClassifier

def KNN_model():
    model = KNeighborsClassifier()
    rs = cross_val_score(
        model,
        tfidf,
        train.polarity,
        cv=10,
        scoring='f1_macro'
    )
    return rs

KNN_rs = KNN_model()

ML_rs = {
    'NB': NB_rs,
    'KNN': KNN_rs
}

ML_rs

{'NB': array([0.8503249 , 0.86712346, 0.86394985, 0.86839071, 0.87356083,
        0.86196287, 0.86422793, 0.86069694, 0.86586606, 0.86996503]),
 'KNN': array([0.77115767, 0.78928929, 0.7716737 , 0.76163096, 0.78165532,
        0.76871059, 0.79583065, 0.77128274, 0.78892033, 0.77502598])}