In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import torch
import transformers
import inspect
import time
import logging

from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui
from datetime import datetime
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_constant_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Pick the compute device once; later cells move the model and batches to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show which accelerator is in use. get_device_name(0) raises when no CUDA
# device is present, so fall back to a plain label on CPU-only machines.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


'GeForce RTX 2080 Ti'

# Load data & pre-processing

In [2]:
def preprocessing(df):
    """
    Clean the review text of a dataframe.

    The raw reviews contain many literal ``<br />`` HTML line-break tags.
    These are removed and the text is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``sentence``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``sentence`` column is cleaned. ``df`` itself is not
        modified, so re-running the calling cell gives the same result.
    """
    # NOTE(review): tags are removed without inserting a space, so words on
    # either side of a tag are joined. Kept as-is to preserve existing outputs.
    cleaned = (
        df['sentence']
        .str.replace('<br />', '', regex=False)  # literal match, not a regex
        .str.lower()
    )
    return df.assign(sentence=cleaned)

# Read each CSV split and clean its text in a single step.
train = preprocessing(pd.read_csv('./data/train.csv'))
test = preprocessing(pd.read_csv('./data/test.csv'))

# Preview the first rows of both splits as a quick sanity check.
preview_template = 'Train data:\n{}\n\nTest data:\n{}'
print(preview_template.format(train.head(5), test.head(5)))

Train data:
                                            sentence  sentiment  polarity
0  this is a very bland and inert production of o...          2         0
1  i've seen this film in avant-premiere at imagi...          7         1
2  revolt of the zombies (2 outta 5 stars) no, th...          4         0
3  may contain minor spoilers.dressed to kill, ha...          7         1
4  (spoilers)i shoulda figured. the dvd didn't ev...          2         0

Test data:
                                            sentence  sentiment  polarity
0  i loved this movie so much. i'm a big fan of a...         10         1
1  the stark, cold landscape of big sky country, ...          9         1
2  this cheapo exploitation flick is some genuine...          2         0
3  this movie has been promoting in everywhere in...          1         0
4  this is a great off-the-wall romantic comedy a...          8         1


# Tokenization & model input creation

In [3]:
class BertModelBonz():
    """
    Small convenience wrapper around ``BertForSequenceClassification``.

    It bundles the model, its tokenizer and an AdamW optimizer, and offers
    helpers to turn raw sentences into padded id tensors, to train, and to
    predict. Tensors and the model are moved to the module-level ``device``.
    """

    def __init__(self, load_model=None, load_config=None, model='bert-base-uncased', max_len=512, batch_size=6):
        """
        Parameters
        ----------
        load_model : str or None
            Path to a model previously stored with ``torch.save``. Takes
            precedence over ``load_config``.
        load_config : BertConfig or None
            Configuration used to build a freshly initialised model. Its
            ``max_position_embeddings`` replaces ``max_len``.
        model : str
            Name of the pre-trained checkpoint. Used for the tokenizer, and for
            the model weights when neither ``load_model`` nor ``load_config``
            is given.
        max_len : int
            Length every sequence is padded or truncated to.
        batch_size : int
            Batch size of the data loaders created by this object.
        """
        self.pre_trained_model = model
        self.max_len = max_len
        self.batch_size = batch_size
        # Setting model
        if load_model is not None:
            # NOTE(review): torch.load unpickles arbitrary objects; only load
            # checkpoints from a trusted source.
            # map_location lets a checkpoint saved on GPU be opened on CPU.
            self.model = torch.load(load_model, map_location=device)
        elif load_config is not None:
            self.model = BertForSequenceClassification(load_config)
            self.max_len = load_config.max_position_embeddings
        else:
            self.model = BertForSequenceClassification.from_pretrained(self.pre_trained_model)
        self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_model)
        # Use the effective length (possibly overridden by load_config), not the
        # raw constructor argument, so tokenizer and padding agree.
        self.tokenizer.max_len = self.max_len
        self.optimizer = AdamW(params=self.model.parameters(), lr=1e-5)

    def create_ids(self, sentences):
        """
        Tokenize ``sentences`` and return one int64 array of shape
        ``(len(sentences), self.max_len)``, padded/truncated at the end.
        """
        # Disable tokenizer logs; it warns for every over-long sentence.
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)
        input_ids = [self.tokenizer.encode(sen)
                     for sen in tqdm_notebook(sentences, desc="Create Ids")]
        # NOTE(review): 'post' truncation cuts from the end of an over-long
        # sequence, which presumably drops the trailing separator token.
        return pad_sequences(input_ids,
                             maxlen=self.max_len,
                             dtype='int64',
                             truncating='post',
                             padding='post',
                             value=self.tokenizer.pad_token_id)

    def prepare_data(self, input_ids, input_labels=None):
        """
        Build a ``DataLoader`` from raw sentences (and optional labels).

        Despite its name, ``input_ids`` holds the raw sentences; they are
        tokenized here. Without labels each batch is a 1-tuple ``(ids,)``,
        with labels it is ``(ids, labels)``.
        """
        input_ids = torch.tensor(self.create_ids(input_ids))
        if input_labels is None:
            return DataLoader(TensorDataset(input_ids),
                              batch_size=self.batch_size)
        # A pandas Series is indexed by label, so converting it directly can
        # fail for a non-default index; use its underlying array instead.
        input_labels = torch.tensor(getattr(input_labels, 'values', input_labels))
        return DataLoader(TensorDataset(input_ids, input_labels),
                          batch_size=self.batch_size)

    def _attention_mask(self, input_ids):
        """Return a mask that is 1 on real tokens and 0 on padding positions."""
        return (input_ids != self.tokenizer.pad_token_id).long()

    def flat_accuracy(self, preds, labels):
        """Fraction of rows in ``preds`` whose arg-max equals the matching label."""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    def train(self, dataloader, epochs=4, eval_dataloader=None):
        """
        Train the model and print loss and accuracy after every epoch.

        Parameters
        ----------
        dataloader : DataLoader
            Yields ``(input_ids, labels)`` batches used for optimisation.
        epochs : int
            Number of passes over ``dataloader``.
        eval_dataloader : DataLoader or None
            Batches used for the accuracy report. When omitted, accuracy is
            measured on ``dataloader`` itself (the previous behaviour), so the
            printed "Validation Accuracy" is then a training-set accuracy.

        The per-batch loss values are collected as floats in
        ``self.train_loss_set``.
        """
        if eval_dataloader is None:
            eval_dataloader = dataloader
        self.train_loss_set = []
        self.model.to(device)
        for _ in trange(epochs, desc="Epoch"):
            # Training model
            self.model.train()
            tr_loss, nb_tr_steps = 0.0, 0
            for input_ids, input_labels in tqdm_notebook(dataloader):
                input_ids = input_ids.to(device)
                input_labels = input_labels.to(device)
                self.optimizer.zero_grad()
                loss = self.model(input_ids=input_ids,
                                  attention_mask=self._attention_mask(input_ids),
                                  labels=input_labels)[0]
                loss.backward()
                self.optimizer.step()

                # Store a plain float: keeping the tensor would pin it (and
                # its device memory) for the whole run.
                batch_loss = loss.item()
                self.train_loss_set.append(batch_loss)
                tr_loss += batch_loss
                nb_tr_steps += 1
            print("Train loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

            # Evaluation
            self.model.eval()
            eval_accuracy, nb_eval_steps = 0, 0
            for input_ids, input_labels in eval_dataloader:
                input_ids = input_ids.to(device)
                with torch.no_grad():
                    logits = self.model(input_ids,
                                        attention_mask=self._attention_mask(input_ids))[0]
                logits = logits.detach().cpu().numpy()
                label_ids = input_labels.to('cpu').numpy()
                eval_accuracy += self.flat_accuracy(logits, label_ids)
                nb_eval_steps += 1
            print("Validation Accuracy: {}".format(eval_accuracy / max(nb_eval_steps, 1)))

    def predict(self, test_data, test_labels):
        """
        Predict a class for every sentence in ``test_data``, print a
        classification report against ``test_labels`` and return the
        predicted class indices (also stored in ``self.predictions``).
        """
        test_inputs = torch.tensor(self.create_ids(test_data))
        test_dataloader = DataLoader(test_inputs, batch_size=self.batch_size)

        # Prediction
        self.model.to(device)
        self.model.eval()
        batch_logits = []
        for input_ids in tqdm_notebook(test_dataloader, desc="Predicting"):
            input_ids = input_ids.to(device)
            with torch.no_grad():
                logits = self.model(input_ids,
                                    attention_mask=self._attention_mask(input_ids))[0]
            batch_logits.append(logits.detach().cpu().numpy())
        self.predictions = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)
        # classification_report expects (y_true, y_pred); the reversed order
        # swaps precision and recall in the printed table.
        print(classification_report(test_labels, self.predictions))
        return self.predictions



In [4]:
# Build the classifier from a config rather than a pre-trained checkpoint.
# 1024 position embeddings matches the printed model and the checkpoint name below.
bert_model = BertModelBonz(load_config=BertConfig(max_position_embeddings=1024, 
                                                  num_labels=2), 
                           batch_size=2)


#Train model
# Keep the loader in its own variable; setting an attribute on the DataFrame
# triggers a pandas warning and hides state inside `train`.
train_dataloader = bert_model.prepare_data(input_ids=train['sentence'],
                                           input_labels=train['polarity'].values)
bert_model.train(train_dataloader)
#torch.save(bert_model.model, 'bert_eb1024_1e5_e4.pth')



#Predict model
#bert_model.predict(test_data=test.sentence, test_labels=test.polarity)

HBox(children=(IntProgress(value=0, description='Create Ids', max=25000, style=ProgressStyle(description_width…




  
Epoch:   0%|                                                                                     | 0/4 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


Train loss: 0.7058546250605583
Validation Accuracy: 0.5


Epoch:  25%|█████████████████▊                                                     | 1/4 [1:20:43<4:02:09, 4843.28s/it]

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))

KeyboardInterrupt: 

In [5]:
# Display the model's module tree to check its layer configuration
# (e.g. the size of the position-embedding table).
bert_model.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [6]:
# Minimal forward pass of the pre-trained base model on a single sentence.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Wrap the token ids in an outer list so the tensor has a leading batch
# dimension of size 1.
input_ids = torch.tensor(
    [tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)]
)

# The first element of the model output is the last layer's hidden states.
outputs = model(input_ids)
last_hidden_states = outputs[0]

In [17]:
from sys import getsizeof
import numpy as np
import pandas as pd

# Small experiment: compare what sys.getsizeof reports for the same 5x10 block
# of integers held as a nested list, a NumPy array and a pandas column.
a = [[1,2,3,4,5,6,7,8,9,10],[1,2,3,4,5,6,7,8,9,10],[1,2,3,4,5,6,7,8,9,10],[1,2,3,4,5,6,7,8,9,10],[1,2,3,4,5,6,7,8,9,10]]
b = np.array(a)
# A column whose cells are lists comes back as a 1-D array with one entry per
# row (see the printed shape) -- presumably object dtype; TODO confirm.
c = pd.DataFrame({'c': a}).c.values
# NOTE(review): getsizeof only counts the object itself, not the objects it
# refers to, so the figures for `a` and `c` exclude the inner lists.
print(getsizeof(a))
print(getsizeof(b))
print(getsizeof(c))
print(c.shape)

104
312
96
(5,)
