In [5]:
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tensorflow as tf 

from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui

# Only let ERROR-level (and above) records from the tokenizer utilities through.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


# Checkpoint name passed to the `from_pretrained` calls in this notebook.
PRE_TRAINED = 'bert-base-uncased'
# Device string used when moving the model and batches: 'cuda' if available, else 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class MyDataset(Dataset):
    """Torch dataset that turns raw sentences into fixed-length BERT input ids.

    Args:
        x: indexable collection of sentences (e.g. a 1-D numpy array of str).
        y: optional indexable collection of numeric labels aligned with ``x``.
            When omitted, each item is the input-id tensor alone.
    """

    # Every item is padded / truncated to exactly this many token ids.
    MAX_LEN = 512
    # Filler id for short sequences (the value the previous keras
    # ``pad_sequences`` call used by default).
    PAD_ID = 0

    def __init__(self, x, y=None):
        super(MyDataset, self).__init__()
        self.x = x
        self.y = y
        self.tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)

    def __getitem__(self, i):
        """Return the input ids (and the label, when ``y`` was given) of item ``i``.

        Returns:
            A ``LongTensor`` of shape ``(MAX_LEN,)``, or a tuple of that tensor
            and a ``FloatTensor`` of shape ``(1,)`` holding the label.
        """
        token_ids = list(self.tokenizer.encode(self.x[i]))
        # Truncate from the END: the first token must survive because NewBert
        # reads position 0 as the sentence representation. (pad_sequences
        # truncates from the front by default, which dropped it for long texts.)
        token_ids = token_ids[:self.MAX_LEN]
        token_ids += [self.PAD_ID] * (self.MAX_LEN - len(token_ids))

        # Token ids must be integer-typed in BOTH branches; nn.Embedding does
        # not accept the float ids the unlabeled branch used to return.
        input_ids = torch.LongTensor(token_ids)
        if self.y is None:
            return input_ids
        return input_ids, torch.FloatTensor([self.y[i]])

    def __len__(self):
        """Number of sentences; ``len`` works for arrays, lists and Series alike."""
        return len(self.x)
    

class NewBert(nn.Module):
    """BERT encoder followed by a one-unit sigmoid head for binary classification.

    Args:
        config: configuration object used to build the encoder. The encoder is
            constructed directly from it, so no pretrained weights are loaded.
    """

    def __init__(self, config):
        super(NewBert, self).__init__()
        # To start from published weights instead, use:
        #self.bert = BertModel.from_pretrained(PRE_TRAINED)
        self.bert = BertModel(config)
        # Size the head from the config so non-768 encoders work too.
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.activation = nn.Sigmoid()
        # Stateless, so building it once is equivalent to rebuilding per call.
        self.loss_fn = nn.BCELoss()

    def forward(self, x, y=None):
        """Score a batch of token ids.

        Args:
            x: integer tensor of token ids, ``(batch, seq_len)``; id 0 is
                treated as padding (the filler value MyDataset pads with).
            y: optional float targets with the same shape as the output,
                ``(batch, 1)``.

        Returns:
            The probabilities ``(batch, 1)`` when ``y`` is None, otherwise the
            tuple ``(loss, probabilities)``.
        """
        # Mask padded positions so real tokens do not attend to filler.
        attention_mask = (x != 0).long()
        last_hidden_state = self.bert(x, attention_mask=attention_mask)[0]
        # The hidden state at position 0 is used as the sentence summary.
        cls_vector = last_hidden_state[:, 0]
        probs = self.activation(self.classifier(cls_vector))
        if y is None:
            return probs
        return self.loss_fn(probs, y), probs
    
    
def get_data(path='data.csv'):
    """Load the tab-separated review file and derive the model's columns.

    Args:
        path: location of the TSV file; defaults to ``'data.csv'``.

    Returns:
        The DataFrame with two added columns:
        ``sentiment`` - 1 when "Review's Star Rating" is above 3, else 0;
        ``num_words`` - number of space-separated pieces in
        "Review's Content" (0 when the content is missing).
    """
    try:
        # 'warn' keeps the old behavior: report malformed rows and skip them.
        data = pd.read_csv(path, sep='\t', encoding='utf8', on_bad_lines='warn')
    except TypeError:
        # Older pandas releases only know the previous spelling of the option.
        data = pd.read_csv(path, sep='\t', encoding='utf8', error_bad_lines=False)

    data['sentiment'] = (data["Review's Star Rating"] > 3).astype('int64')
    # The .str accessor passes missing reviews through as NaN instead of
    # raising the way calling .split on a float NaN did.
    data['num_words'] = (
        data["Review's Content"].str.split(' ').str.len().fillna(0).astype('int64')
    )

    return data


def get_model():
    """Build a single-label BertForSequenceClassification seeded with pretrained encoder weights.

    Returns:
        ``(config, model)``: the config the classifier was built from, and the
        classifier whose ``bert.*`` parameters were copied from the
        ``PRE_TRAINED`` BertModel checkpoint. Parameters outside ``bert.*``
        keep their fresh initialisation.
    """
    # Read the architecture from the checkpoint rather than library defaults,
    # so the shapes still line up if PRE_TRAINED is pointed at another model.
    config = BertConfig.from_pretrained(PRE_TRAINED, num_labels=1)
    model = BertForSequenceClassification(config=config)

    pretrained_weights = BertModel.from_pretrained(PRE_TRAINED).state_dict()

    # The encoder lives under the 'bert.' prefix inside the classifier.
    merged = model.state_dict()
    merged.update({'bert.' + name: tensor for name, tensor in pretrained_weights.items()})
    model.load_state_dict(merged)

    return config, model


def train(model=None, epochs=None):
    """Fine-tune ``model`` on the module-level ``data`` frame and print a report.

    Reads the global ``data`` ("Review's Content" as input, ``sentiment`` as
    target), trains with AdamW (lr=1e-5, batch size 4), then prints a
    classification report computed on the SAME data it trained on.

    Args:
        model: module called as ``model(x=..., y=...)`` returning a tuple whose
            first element is the loss, and as ``model(x=...)`` returning
            probabilities.
        epochs: number of passes over the data.

    Returns:
        The trained model, left in eval mode.

    Raises:
        ValueError: if ``model`` or ``epochs`` is not supplied.
    """
    if model is None or epochs is None:
        raise ValueError('train() requires both `model` and `epochs`')

    optimizer = AdamW(model.parameters(), lr=1e-5)
    model.train()
    model.to(DEVICE)

    # Build these once: re-creating them per epoch reloaded the tokenizer every
    # time and left `dataloader` undefined for the evaluation when epochs == 0.
    # shuffle=True still reshuffles on every pass over the loader.
    my_dataset = MyDataset(x=data['Review\'s Content'].values, y=data.sentiment.values)
    dataloader = DataLoader(my_dataset, batch_size=4, shuffle=True)

    for epoch in range(epochs):
        s = time.time()
        for x, y in dataloader:
            optimizer.zero_grad()
            loss = model(x=x.to(DEVICE), y=y.to(DEVICE))[0]
            loss.backward()
            optimizer.step()
        print('Finish epoch {}, running time {}'.format(epoch+1, time.time()-s))

    model.eval()
    predicts = []
    y_true = []
    with torch.no_grad():
        for x, y in dataloader:
            predict = model(x=x.to(DEVICE)).cpu().numpy()
            # Threshold the probabilities at 0.5 to get hard predictions.
            predicts.extend((predict > 0.5).tolist())
            y_true.extend(y.numpy().tolist())

    print(classification_report(y_true, predicts))
    return model
        
        
if __name__ == '__main__':
    # Fixed seed so the 100-row subset and the random model initialisation
    # are the same on every Restart & Run All.
    SEED = 42
    torch.manual_seed(SEED)

    data = get_data()
    # Draw the 100 rows directly instead of shuffling the whole frame and
    # slicing; min() keeps this valid for files shorter than 100 rows.
    data = data.sample(n=min(100, len(data)), random_state=SEED).reset_index(drop=True)

    # Config model
    config = BertConfig(num_labels=1)
    model = NewBert(config)

    # Train model
    #model = train(model, 3)
    #torch.save(model.state_dict(), 'weights.h5')

    

b'Skipping line 252160: expected 12 fields, saw 13\n'


In [99]:
class BertBonz(BertModel):
    """BertModel variant whose embedding sum includes an extra ``llr`` embedding.

    NOTE: ``forward`` covers the embedding stage only. It returns the
    layer-normalised, dropped-out embedding sum and does not run the encoder.
    """

    def __init__(self, config):
        super(BertBonz, self).__init__(config)
        # 3 llr ids with id 0 as the padding index. The width follows the
        # config so it always matches the embeddings it is added to.
        self.embeddings.add_module('llr_embeddings', nn.Embedding(3, config.hidden_size, 0))

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, llr_ids=None, inputs_embeds=None):
        """Compute word + position + token-type + llr embeddings.

        Args:
            input_ids: token ids ``(batch, seq_len)``; may be omitted when
                ``inputs_embeds`` is given.
            token_type_ids: segment ids; defaults to all zeros.
            position_ids: position ids; defaults to ``0..seq_len-1`` per row.
            llr_ids: ids into the llr embedding table; defaults to all zeros
                (the padding index).
            inputs_embeds: precomputed word embeddings used instead of looking
                up ``input_ids``.

        Returns:
            Tensor of embeddings after LayerNorm and dropout.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            raise ValueError('Either input_ids or inputs_embeds must be provided')

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if llr_ids is None:
            # The signature advertises llr_ids as optional; without this
            # default the embedding lookup below failed on None.
            llr_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        if inputs_embeds is None:
            inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)
        llr_embeddings = self.embeddings.llr_embeddings(llr_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings + llr_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)
        return embeddings
    
    
# Built straight from `config` (no from_pretrained call), so no checkpoint weights are loaded here.
model1 = BertBonz(config)

In [100]:
# One sequence of three token ids (`a`) with one llr id per token (`b`).
a = torch.tensor([[101, 102, 103]])
b = torch.tensor([[0, 1, 2]])

# eval mode so the embedding dropout is inactive for this check.
model1.eval()
# Positional order matches BertBonz.forward: input_ids, token_type_ids, position_ids, llr_ids.
model1(a, None, None, b)

tensor([[[-0.8617, -0.6445,  1.7016,  ...,  1.0401, -1.2517,  0.6840],
         [ 0.1193,  1.1274,  0.5836,  ...,  0.2164, -0.7302,  1.2341],
         [ 0.5983, -0.5080,  0.3349,  ..., -1.1406, -1.3928,  1.2663]]],
       grad_fn=<NativeLayerNormBackward>)

In [97]:
# For each parameter name in the pretrained BertModel checkpoint, print whether
# model1's state dict has an entry under the same name.
# NOTE: `df` is a state dict here, not a DataFrame.
df = BertModel.from_pretrained(PRE_TRAINED).state_dict()
sd = model1.state_dict()
for k, v in df.items():
    print(k, ' ', k in sd.keys())

embeddings.word_embeddings.weight   True
embeddings.position_embeddings.weight   True
embeddings.token_type_embeddings.weight   True
embeddings.LayerNorm.weight   True
embeddings.LayerNorm.bias   True
encoder.layer.0.attention.self.query.weight   True
encoder.layer.0.attention.self.query.bias   True
encoder.layer.0.attention.self.key.weight   True
encoder.layer.0.attention.self.key.bias   True
encoder.layer.0.attention.self.value.weight   True
encoder.layer.0.attention.self.value.bias   True
encoder.layer.0.attention.output.dense.weight   True
encoder.layer.0.attention.output.dense.bias   True
encoder.layer.0.attention.output.LayerNorm.weight   True
encoder.layer.0.attention.output.LayerNorm.bias   True
encoder.layer.0.intermediate.dense.weight   True
encoder.layer.0.intermediate.dense.bias   True
encoder.layer.0.output.dense.weight   True
encoder.layer.0.output.dense.bias   True
encoder.layer.0.output.LayerNorm.weight   True
encoder.layer.0.output.LayerNorm.bias   True
encoder.layer.1

In [50]:
# NOTE(review): the saved output below shows a numpy array, while the cell that
# now defines `a` builds a torch tensor - this output looks stale; re-run to confirm.
a

array([[101, 102, 103]])

In [63]:
# Show four architecture settings carried by `config` as one tuple.
(
    config.num_attention_heads,
    config.hidden_size,
    config.hidden_act,
    config.intermediate_size,
)


(12, 768, 'gelu', 3072)

In [17]:
# NOTE(review): `model2` is not defined in any cell visible here; the saved output
# shows a TFBertModel - presumably created in a cell that was since removed. Confirm
# this cell still works after Restart & Run All.
model2

<transformers.modeling_tf_bert.TFBertModel at 0x1a42ee53898>

In [66]:
# Look up the first four word-embedding rows. The ids are created on the same
# device as the embedding table, so this works on CPU and GPU alike (the
# hard-coded .cuda() failed without a GPU or while the model was still on CPU).
word_embeddings = model.bert.embeddings.word_embeddings
word_embeddings(torch.tensor([0, 1, 2, 3], dtype=torch.long, device=word_embeddings.weight.device))

tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        [-0.0185, -0.0574, -0.0384,  ..., -0.0191, -0.0387, -0.0146]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)

In [47]:
# Display the position-embedding weight parameter of the wrapped BERT encoder.
model.bert.embeddings.position_embeddings.weight

Parameter containing:
tensor([[ 0.0181, -0.0256, -0.0371,  ..., -0.0006,  0.0007,  0.0157],
        [ 0.0071,  0.0024, -0.0192,  ...,  0.0283,  0.0304, -0.0059],
        [-0.0114, -0.0020, -0.0119,  ...,  0.0147,  0.0186, -0.0074],
        ...,
        [ 0.0174,  0.0034, -0.0097,  ...,  0.0030,  0.0005, -0.0273],
        [ 0.0210, -0.0065,  0.0149,  ..., -0.0056, -0.0124, -0.0288],
        [ 0.0029, -0.0238,  0.0055,  ...,  0.0175,  0.0274, -0.0784]],
       device='cuda:0', requires_grad=True)

In [60]:
# Shape check for batched matmul: the leading (2, 1) dims are carried through
# and the trailing (5, 5) x (5, 6) matrices multiply to (5, 6).
t = torch.rand(2, 1, 5, 5)
t2 = torch.rand(2, 1, 5, 6)
(t @ t2).shape

torch.Size([2, 1, 5, 6])

In [61]:
# Display the module tree of `model` (the saved output shows a NewBert instance).
model

NewBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    