In [1]:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import time
import json
import numpy as np
import pandas as pd
import copy
import math
import os

In [48]:
from optimization import *
from tqdm import tqdm, trange

In [50]:
import logging
# Root logging configuration: timestamped records at INFO level and above.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s'
LOG_DATE_FORMAT = '%m/%d/%Y %H:%M:%S'
logging.basicConfig(level=logging.INFO, datefmt=LOG_DATE_FORMAT, format=LOG_FORMAT)

# Logger used by the training and evaluation cells.
logger = logging.getLogger(__name__)


In [2]:
# Sizes of the synthetic data set generated in the next cell.
n_samples = 10000      # number of generated rows
n_month = 31           # number of positions in every transaction sequence
num_fea_count = 125    # number of numeric feature columns per row
# Exclusive upper bounds of the randomly drawn integer ids.
n_cate, n_price_bins, n_mask = 23, 20, 2

In [3]:
# Fixed seed so the synthetic data is identical on every Restart & Run All.
np.random.seed(0)

# Numeric side features: random integers in [0, n_cate) scaled by 0.1.
num_fea = np.random.randint(n_cate, size=(n_samples, num_fea_count)) * 0.1
# Per-position integer ids for the sequence inputs, one row per sample.
txn_list = np.random.randint(n_cate, size=(n_samples, n_month))
txn_mask = np.random.randint(n_mask, size=(n_samples, n_month))
txn_price = np.random.randint(n_price_bins, size=(n_samples, n_month))
# Binary target, kept as a column vector of shape (n_samples, 1).
labels = np.random.randint(2, size=(n_samples, 1))
# Every sample gets the same descending positions n_month-1 ... 0. Built as
# one contiguous 2-D array: a Python list of n_samples small arrays is very
# slow to convert with torch.tensor().
txn_pos = np.tile(np.arange(n_month)[::-1], (n_samples, 1))

In [4]:
class InputFeatures(object):
    """Simple record that stores its six constructor arguments as attributes."""

    def __init__(self, input_ids, input_pos, input_mask, input_price, num_fea, label_id):
        # Sequence inputs.
        self.input_ids, self.input_pos = input_ids, input_pos
        self.input_mask, self.input_price = input_mask, input_price
        # Numeric features and target.
        self.num_fea, self.label_id = num_fea, label_id

In [5]:
# Convert the generated arrays to tensors: integer ids as int64, numeric
# features as float32.
# np.asarray first: txn_pos may be a Python list of per-sample arrays, and
# torch.tensor() on a list of ndarrays is extremely slow.
input_pos = torch.tensor(np.asarray(txn_pos), dtype=torch.long)
input_ids = torch.tensor(txn_list, dtype=torch.long)
input_mask = torch.tensor(txn_mask, dtype=torch.long)
input_price = torch.tensor(txn_price, dtype=torch.long)
label_id = torch.tensor(labels, dtype=torch.long)
# This line rebinds num_fea from ndarray to tensor. as_tensor keeps the cell
# re-runnable: it accepts either type, whereas torch.tensor(tensor) warns.
num_fea = torch.as_tensor(num_fea, dtype=torch.float32)

In [6]:
#train_data=train_data.apply(lambda x:InputFeatures(input_ids=x.input_ids,\
#                                                   input_pos=x.input_pos,\
#                                                   input_mask=x.input_mask,\
#                                                   input_price=x.input_price,\
#                                                   num_fea=x.num_fea, \
#                                                   label_id=x.label_id),axis=1).tolist()

In [7]:
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [8]:
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.

    Every constructor argument is stored as an attribute. Note that
    `vocab_size_or_config_json_file` is stored as `vocab_size` and
    `price_bin_size` is the size of the price-embedding table.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 price_bin_size=20,
                 initializer_range=0.02):
        self.vocab_size = vocab_size_or_config_json_file
        self.hidden_size = hidden_size
        self.price_bin_size = price_bin_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a config from a Python dictionary of parameters.

        Starts from the constructor defaults (with `vocab_size` set to -1) and
        then copies every key of `json_object` onto the instance, including
        keys that are not constructor arguments.

        Returns:
            An instance of the class the method was called on.
        """
        # cls(...) rather than BertConfig(...): subclasses get their own type.
        config = cls(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config


In [9]:
# Model hyper-parameters. The embedding-table sizes are taken from the
# data-generation constants, so changing those cannot silently produce ids
# that fall outside a table.
config_json_object = {
    'hidden_size': 120,
    'vocab_size': n_cate,
    'price_bin_size': n_price_bins,
    'num_attention_heads': 12,
    'num_hidden_layers': 6,
    'max_position_embeddings': 32,
    # The token-type table is indexed with the mask values (see BertModel.forward).
    'type_vocab_size': n_mask,
    'hidden_dropout_prob': 0.1,
    'sigmoid': False,
}
# Position ids run from 0 to n_month - 1 and must fit in the position table.
assert n_month <= config_json_object['max_position_embeddings'], \
    "max_position_embeddings is smaller than the sequence length n_month"

In [10]:
# Constructor defaults overlaid with the values of config_json_object.
config = BertConfig.from_dict(json_object=config_json_object)

In [12]:
class BertLayerNorm(nn.Module):
    """Layer normalisation over the last dimension, TF style (epsilon inside the square root)."""

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        # Learnable per-feature scale (initialised to 1) and shift (initialised to 0).
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalise `x` along its last dimension, then apply scale and shift."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: mean of the squared deviations.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias

In [13]:
class BertEmbeddings(nn.Module):
    """Sum of word, position, token-type and price embeddings, then LayerNorm and dropout."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)
        self.price_embeddings = nn.Embedding(config.price_bin_size, hidden)
        # The attribute keeps its TensorFlow-style name (not snake-cased) so
        # that TensorFlow checkpoint variable names can be matched.
        self.LayerNorm = BertLayerNorm(hidden, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, position_ids, token_type_ids, input_price):
        """Look up the four embeddings, add them, and return the normalised, dropped-out sum."""
        summed = self.word_embeddings(input_ids)
        summed = summed + self.position_embeddings(position_ids)
        summed = summed + self.token_type_embeddings(token_type_ids)
        summed = summed + self.price_embeddings(input_price)
        return self.dropout(self.LayerNorm(summed))

In [14]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Raises:
        ValueError: If `config.hidden_size` is not a multiple of
            `config.num_attention_heads`.
    """

    def __init__(self, config):
        super().__init__()
        hidden, heads = config.hidden_size, config.num_attention_heads
        if hidden % heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden, heads))
        self.num_attention_heads = heads
        self.attention_head_size = int(hidden / heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Separate linear projections for queries, keys and values.
        self.query = nn.Linear(hidden, self.all_head_size)
        self.key = nn.Linear(hidden, self.all_head_size)
        self.value = nn.Linear(hidden, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into (heads, head_size) and swap dims 1 and 2."""
        split_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(*split_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Return the per-head attention outputs merged back into one last dimension.

        `attention_mask` is added to the raw attention scores before the softmax.
        """
        queries, keys, values = (
            self.transpose_for_scores(projection(hidden_states))
            for projection in (self.query, self.key, self.value)
        )

        # Raw scores: query-key dot products scaled by sqrt(head_size).
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        # Additive mask, then normalise over the key dimension.
        probs = torch.softmax(scores + attention_mask, dim=-1)

        # Dropout on the attention probabilities drops whole tokens to attend
        # to, as in the original Transformer paper.
        probs = self.dropout(probs)

        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
        merged_shape = context.size()[:-2] + (self.all_head_size,)
        return context.view(*merged_shape)


In [15]:
class BertSelfOutput(nn.Module):
    """Dense projection, dropout, then a residual connection followed by LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


In [16]:
def gelu(x):
    """GELU activation in its exact erf form: x * 0.5 * (1 + erf(x / sqrt(2)))."""
    erf_term = torch.erf(x / math.sqrt(2.0))
    return 0.5 * x * (erf_term + 1.0)

def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return gate * x

# Lookup table from the `hidden_act` config string to the activation callable.
ACT2FN = dict(
    gelu=gelu,
    relu=torch.nn.functional.relu,
    swish=swish,
)


In [17]:

class BertAttention(nn.Module):
    """Self-attention followed by its output projection with residual and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        """Attend over `input_tensor`; the output block adds `input_tensor` back as the residual."""
        attended = self.self(input_tensor, attention_mask)
        return self.output(attended, input_tensor)


class BertIntermediate(nn.Module):
    """Linear layer from hidden_size to intermediate_size followed by an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string names an entry of ACT2FN; anything else is used as the callable itself.
        if isinstance(activation, str):
            self.intermediate_act_fn = ACT2FN[activation]
        else:
            self.intermediate_act_fn = activation

    def forward(self, hidden_states):
        """Return activation(dense(hidden_states))."""
        return self.intermediate_act_fn(self.dense(hidden_states))

class BertOutput(nn.Module):
    """Projects intermediate_size back to hidden_size, then dropout, residual and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


In [18]:
class BertLayer(nn.Module):
    """One encoder layer: attention block followed by the feed-forward block."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        """Run attention, then the feed-forward block with the attention output as residual."""
        attended = self.attention(hidden_states, attention_mask)
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)

In [19]:
class BertEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` BertLayer modules applied in sequence."""

    def __init__(self, config):
        super().__init__()
        # Every layer is a deep copy of one freshly built layer, so all layers
        # start from the same initial weights.
        prototype = BertLayer(config)
        copies = [copy.deepcopy(prototype) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(copies)

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """Return a list with every layer's output, or with only the final
        hidden states when `output_all_encoded_layers` is false."""
        per_layer = []
        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask)
            if output_all_encoded_layers:
                per_layer.append(hidden_states)
        return per_layer if output_all_encoded_layers else [hidden_states]


In [20]:
class BertPooler(nn.Module):
    """Pools a sequence by passing its first position through a dense layer and tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return tanh(dense(x)) where x is index 0 along dimension 1 of `hidden_states`."""
        first_position = hidden_states[:, 0]
        return self.activation(self.dense(first_position))

In [21]:
class MLP(nn.Module):
    """Stack of fully connected layers followed by one final activation.

    Args:
        config: Not used by this module; kept so the constructor matches the
            other modules in this notebook.
        layers_list: Layer widths, either a comma-separated string such as
            '768,96,48' or a sequence of ints. The first entry is the expected
            input width and the last entry the output width.
        sigmoid: If true the stack ends with `nn.Sigmoid`, otherwise `nn.ReLU`.

    Raises:
        ValueError: If `layers_list` is empty or cannot be parsed into integers.
    """

    def __init__(self, config, layers_list, sigmoid=False):
        super(MLP, self).__init__()
        layers_list = self._parse_layer_sizes(layers_list)
        layers = []
        # NOTE(review): only ONE activation is appended, after the last Linear.
        # Consecutive Linear layers therefore have no non-linearity between
        # them. Left as is, because adding activations would change the
        # architecture and the `apply_mlp.<idx>` keys of saved checkpoints --
        # confirm whether this is intended.
        for pre_layer, next_layer in zip(layers_list[:-1], layers_list[1:]):
            fully_connected_layers = nn.Linear(pre_layer, next_layer, bias=True)
            # Normal initialisation drawn from NumPy's global RNG:
            # weights ~ N(0, 2 / (fan_in + fan_out)), biases ~ N(0, 1 / fan_out).
            W = np.random.normal(0.0, np.sqrt(2 / (next_layer + pre_layer)),
                                 size=(next_layer, pre_layer)).astype(np.float32)
            b = np.random.normal(0.0, np.sqrt(1 / next_layer),
                                 size=next_layer).astype(np.float32)
            # Copy into the existing Parameters instead of replacing `.data`.
            with torch.no_grad():
                fully_connected_layers.weight.copy_(torch.from_numpy(W))
                fully_connected_layers.bias.copy_(torch.from_numpy(b))
            layers.append(fully_connected_layers)
        layers.append(nn.Sigmoid() if sigmoid else nn.ReLU())
        self.apply_mlp = torch.nn.Sequential(*layers)
        self.first_layer_mlp = layers_list[0]

    @staticmethod
    def _parse_layer_sizes(layers_list):
        """Return the layer widths as a non-empty list of ints, or raise ValueError."""
        hint = ("Please input layer structure as a string separated with ',', "
                "e.g. '768,96,48'")
        try:
            if isinstance(layers_list, str):
                sizes = [int(m) for m in layers_list.split(',')]
            else:
                sizes = [int(m) for m in layers_list]
        except (TypeError, ValueError) as err:
            raise ValueError(hint) from err
        if not sizes:
            raise ValueError(hint)
        return sizes

    def forward(self, input_fea):
        """Apply the stack to `input_fea`.

        Raises:
            ValueError: If `input_fea.shape[1]` differs from the first layer width.
        """
        if input_fea.shape[1] == int(self.first_layer_mlp):
            return self.apply_mlp(input_fea)
        else:
            raise ValueError('Please make the first MLP layer neuron number match with input layer, '+\
                             'the input fea is '+str(input_fea.shape[1])+', the MLP fist layer is '+str(self.first_layer_mlp))
            

In [23]:
class BertModel(nn.Module):
    """Transformer encoder over the sequence inputs plus an MLP over numeric features.

    Args:
        config: Configuration object read by the embedding, encoder and pooler modules.
        layers_list: Layer widths of the MLP applied to `num_fea` (see `MLP`).
    """
    def __init__(self, config, layers_list):
        super(BertModel, self).__init__()
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.mlp = MLP(config, layers_list)

    def forward(self, input_ids, input_pos=None, input_mask=None, input_price=None,
                num_fea=None, output_all_encoded_layers=True, arch_interaction_op='cat'):
        """Encode the sequence and concatenate its pooled output with the MLP output.

        Returns:
            A tuple `(encoded_layers, cat_output)`. `encoded_layers` is the
            list of all encoder layer outputs, or only the last layer's tensor
            when `output_all_encoded_layers` is false. `cat_output` is
            `[mlp(num_fea), pooled_output]` concatenated along dimension 1.

        Raises:
            ValueError: If a required input is None, or if
                `arch_interaction_op` is anything other than 'cat'.
        """
        # 'cat' is the only implemented interaction. Fail here instead of
        # hitting an unbound `cat_output` at the return statement.
        if arch_interaction_op != 'cat':
            raise ValueError("Unsupported arch_interaction_op %r; only 'cat' is implemented"
                             % (arch_interaction_op,))
        # The None defaults exist only in the signature; every input is needed.
        missing = [name for name, value in (('input_pos', input_pos),
                                            ('input_mask', input_mask),
                                            ('input_price', input_price),
                                            ('num_fea', num_fea)) if value is None]
        if missing:
            raise ValueError('Missing required inputs: ' + ', '.join(missing))

        # input_mask is also passed as the token-type ids of the embedding layer.
        embedding_output = self.embeddings(input_ids, input_pos, input_mask, input_price)
        # Insert two singleton dimensions so the mask broadcasts against the
        # 4-D attention scores.
        extended_attention_mask = input_mask.unsqueeze(1).unsqueeze(2)
        # Cast the integer mask to the activation dtype so the additive bias
        # below is floating point regardless of type-promotion rules.
        extended_attention_mask = extended_attention_mask.to(dtype=embedding_output.dtype)
        # 0 where the mask is 1, -10000 where it is 0; added to the scores
        # before the softmax.
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        encoded_layers = self.encoder(embedding_output, extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)
        fcl_output = self.mlp(num_fea)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        cat_output = torch.cat([fcl_output, pooled_output], dim=1)
        return encoded_layers, cat_output

In [24]:
64+120  # 64 (bottom-MLP output) + 120 (hidden size) = 184, the first width passed as top_layers_list below

184

In [25]:
class BertForSequenceClassification(nn.Module):
    """BertModel followed by a top MLP, dropout and a linear classification head.

    Args:
        config: Configuration object; `hidden_size` is the classifier input width.
        num_labels: Number of output classes.
        bottom_layers_list: Layer widths of the MLP inside `BertModel`.
        top_layers_list: Layer widths of the MLP applied to BertModel's
            concatenated output. Its first width must equal the last bottom
            width plus `config.hidden_size`, and its last width must equal
            `config.hidden_size`.
            NOTE(review): the default '125,256,120' does not satisfy this
            together with the default bottom list (64 + hidden_size != 125);
            callers have to pass a matching value.
    """
    def __init__(self, config, num_labels=2, bottom_layers_list='125,256,64', top_layers_list='125,256,120'):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel(config, bottom_layers_list)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.mlp = MLP(config, top_layers_list)

    def forward(self, input_ids, input_pos, input_mask, input_price, num_fea, output_all_encoded_layers=True, arch_interaction_op='cat', labels=None):
        """Return the cross-entropy loss when `labels` is given, otherwise the logits.

        Raises:
            ValueError: If the top MLP output width differs from the
                classifier's input width.
        """
        # Forward the two options instead of silently dropping them.
        _, pooled_output = self.bert(input_ids, input_pos, input_mask, input_price, num_fea,
                                     output_all_encoded_layers=output_all_encoded_layers,
                                     arch_interaction_op=arch_interaction_op)
        pooled_output = self.mlp(pooled_output)
        pooled_output = self.dropout(pooled_output)
        # Read the width from the module itself, not from a notebook-global
        # `config` that may not match the config this model was built with.
        hidden_size = self.classifier.in_features
        if hidden_size == pooled_output.shape[1]:
            logits = self.classifier(pooled_output)
        else:
            raise ValueError('The pooled_out has size '+str(pooled_output.shape[1])+'which mismatched with hidden size '+str(hidden_size))
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits

In [26]:
# Per-sample view of the input tensors: one row per sample, each cell holding
# that sample's 1-D array. The 2-D tensors are split into rows first, because
# a DataFrame column needs one value per row.
# NOTE(review): `train_data` is rebound to a TensorDataset in the next cell,
# so this frame is never consumed -- consider deleting this cell.
train_data = pd.DataFrame({
    'input_pos': list(input_pos.cpu().numpy()),
    'input_ids': list(input_ids.cpu().numpy()),
    'input_mask': list(input_mask.cpu().numpy()),
    'input_price': list(input_price.cpu().numpy()),
    'label_id': list(label_id.cpu().numpy()),
    'num_fea': list(num_fea.cpu().numpy()),
})

In [27]:
# Dataset yielding one 6-tuple per sample, in the field order that the
# training and evaluation loops unpack.
train_tensors = (input_ids, input_pos, input_mask, input_price, num_fea, label_id)
train_data = TensorDataset(*train_tensors)

In [28]:
# Training batches of 10 samples, drawn through a RandomSampler.
train_batch_size = 10
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=train_batch_size,
    sampler=train_sampler,
)

In [41]:
num_labels = 2
# Width of the bottom MLP's output. The top MLP consumes
# [bottom-MLP output | pooled BERT output] and must emit hidden_size features
# for the classifier, so its widths are derived here rather than hard-coded.
bottom_out_size = 64
model = BertForSequenceClassification(
    config,
    num_labels,
    bottom_layers_list='%d,256,%d' % (num_fea_count, bottom_out_size),
    top_layers_list='%d,125,%d' % (bottom_out_size + config.hidden_size, config.hidden_size),
)

In [42]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [43]:
# Number of CUDA devices reported by PyTorch; later cells compare it against 1
# to decide on DataParallel wrapping and loss averaging.
n_gpu = torch.cuda.device_count()

In [3]:
# Move the parameters to the selected device. Assigning the result keeps the
# cell from printing the entire module tree as its output.
model = model.to(device)

In [45]:
# Replicate the model across GPUs when more than one is available. The
# isinstance guard keeps the cell idempotent: re-running it must not wrap an
# already wrapped model a second time.
if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)

In [46]:
learning_rate = 5e-5
warmup_proportion = 0.01
num_train_epochs = 1000
gradient_accumulation_steps = 1

# Take the batch size from the DataLoader that actually feeds the training
# loop. Setting a different number here would leave the loader untouched and
# make the step count below (and the schedule built from it) wrong.
train_batch_size = train_dataloader.batch_size

# Optimizer updates over the whole run: the loop performs one update every
# `gradient_accumulation_steps` batches, and the batch counter restarts at
# every epoch.
num_train_steps = (len(train_dataloader) // gradient_accumulation_steps) * int(num_train_epochs)
t_total = num_train_steps

In [47]:
# Two optimizer groups: parameters whose name contains one of the `no_decay`
# markers get weight_decay 0.0, all others 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    is_exempt = any(marker in param_name for marker in no_decay)
    (no_decay_params if is_exempt else decay_params).append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

In [49]:
# BertAdam comes from the star import of `optimization`. It receives the base
# learning rate together with the warm-up fraction and the total step count.
optimizer_kwargs = dict(lr=learning_rate, warmup=warmup_proportion, t_total=t_total)
optimizer = BertAdam(optimizer_grouped_parameters, **optimizer_kwargs)

In [None]:
do_train = True
global_step = 0
if do_train:
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    # Report the batch size the loader really uses.
    logger.info("  Batch size = %d", train_dataloader.batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    model.train()
    # Drop gradients left over from an interrupted or earlier run of this cell.
    optimizer.zero_grad()
    for epoch in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            # Batch-local names: unpacking into input_ids/input_pos/... would
            # overwrite the full-dataset tensors of the same name.
            (batch_input_ids, batch_input_pos, batch_input_mask,
             batch_input_price, batch_num_fea, batch_label_id) = batch
            loss = model(batch_input_ids, batch_input_pos, batch_input_mask,
                         batch_input_price, batch_num_fea, labels=batch_label_id)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += batch_input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                # Progress is clamped to 1.0 so the schedule is never evaluated
                # past the end of the run if t_total under-counts the updates.
                # NOTE(review): BertAdam is also constructed with
                # warmup/t_total. If it already applies the warm-up schedule
                # internally, this override applies it a second time --
                # confirm against optimization.py.
                progress = min(global_step / t_total, 1.0) if t_total > 0 else 1.0
                lr_this_step = learning_rate * warmup_linear(progress, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        # tr_loss was accumulated but never reported; log the epoch mean.
        logger.info("  Epoch %d: mean training loss = %.6f over %d examples",
                    epoch + 1, tr_loss / max(nb_tr_steps, 1), nb_tr_examples)

In [None]:
# Directory that receives the saved model weights (current working directory).
output_dir = './'

In [None]:
# Save only the weights (state dict). When the model carries a `.module`
# attribute, the inner module is the one that gets saved.
output_model_file = os.path.join(output_dir, "pytorch_model.bin")
model_to_save = getattr(model, 'module', model)
torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
def accuracy(out, labels):
    """Count the predictions that match the labels.

    Args:
        out: Array of per-class scores; the predicted class is the argmax
            along axis 1.
        labels: Integer class labels, one per row of `out`. May be flat or a
            column vector of shape (batch, 1).

    Returns:
        The number of correct predictions (a count, not a ratio).
    """
    outputs = np.argmax(out, axis=1)
    # Flatten first: comparing (batch,) predictions with (batch, 1) labels
    # would broadcast to a (batch, batch) matrix and count wrong pairs.
    labels = np.asarray(labels).reshape(-1)
    return np.sum(outputs == labels)

In [None]:
do_eval = True
# NOTE: evaluation runs on the training set, so these numbers measure fit,
# not generalisation.
eval_data = train_data
eval_batch_size = 10
if do_eval:
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)
    model.eval()
    eval_loss_fct = CrossEntropyLoss()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for step, batch in enumerate(tqdm(eval_dataloader, desc="prediction")):
        batch = tuple(t.to(device) for t in batch)
        # Batch-local names so the full-dataset tensors are not overwritten.
        (batch_input_ids, batch_input_pos, batch_input_mask,
         batch_input_price, batch_num_fea, batch_label_id) = batch
        with torch.no_grad():
            # One forward pass: the loss is computed from the same logits
            # instead of running the model a second time with labels.
            logits = model(batch_input_ids, batch_input_pos, batch_input_mask,
                           batch_input_price, batch_num_fea)
            tmp_eval_loss = eval_loss_fct(logits.view(-1, logits.size(-1)),
                                          batch_label_id.view(-1))
        logits = logits.detach().cpu().numpy()
        # Flatten the labels so they line up one-to-one with the argmax
        # predictions inside accuracy().
        label_ids = batch_label_id.view(-1).to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += batch_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss_avg = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy_avg = eval_accuracy / max(nb_eval_examples, 1)
    logger.info("  eval_loss = %.6f, eval_accuracy = %.4f", eval_loss_avg, eval_accuracy_avg)

In [None]:
# Load a trained model that you have fine-tuned
# map_location remaps the stored tensors to the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model_state_dict = torch.load(output_model_file, map_location=device)
# The checkpoint was written from the unwrapped model, so load it into the
# unwrapped model as well; a wrapper with `.module` expects prefixed keys.
model_to_load = model.module if hasattr(model, 'module') else model
model_to_load.load_state_dict(model_state_dict)
# Assigning the result keeps the cell from printing the whole module tree.
model = model.to(device)