In [1]:
import os
import sys
import itertools
import pickle
from glob import glob

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from tqdm import tqdm_notebook as tqdm

from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

import torch
from torch import nn, optim
import transformers
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM#, BertLayer, BertEmbeddings
from transformers import XLNetModel, GPT2Model, RobertaModel
from transformers.modeling_bert import BertLayer, BertEmbeddings

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# re-load functions
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

In [3]:
# Device identifier string for this notebook — presumably intended for
# model/tensor placement; it is not consumed in the cells shown here, so
# confirm where it is used.
DEVICE = 'cpu'

## Save pretrained model configs

In [4]:
# Instantiate the four pretrained base models via `from_pretrained`, in the
# order BERT, RoBERTa, XLNet, GPT-2. Their configs are pickled and inspected
# further down in the notebook.
bert_model, roberta_model, xlnet_model, gpt2_model = (
    model_cls.from_pretrained(checkpoint)
    for model_cls, checkpoint in (
        (BertModel, 'bert-base-uncased'),
        (RobertaModel, 'roberta-base'),
        (XLNetModel, 'xlnet-base-cased'),
        (GPT2Model, 'gpt2'),
    )
)

In [7]:
ls ../mnt/datasets/model_configs

bert-model-uncased-config.pkl  dataset-metadata.json


In [5]:
# Pickle each pretrained model's config into the `model_configs` directory.
# This is the directory the notebook lists and later reads the RoBERTa config
# from; writing one level up (as before) left the load cell unable to find the
# file on a fresh run.
CONFIG_DIR = '../mnt/datasets/model_configs'
os.makedirs(CONFIG_DIR, exist_ok=True)  # no-op when the directory already exists

# (file name, config object) pairs; file names are unchanged from the originals.
configs_to_save = (
    ('bert-model-uncased-config.pkl', bert_model.config),
    ('roberta-model-base-config.pkl', roberta_model.config),
    ('xlnet-model-base-cased-config.pkl', xlnet_model.config),
    ('gpt2-model-config.pkl', gpt2_model.config),
)
for config_filename, model_config in configs_to_save:
    with open(os.path.join(CONFIG_DIR, config_filename), 'wb') as fout:
        pickle.dump(model_config, fout)

In [12]:
# Read the pickled RoBERTa config back from disk into `config`.
# NOTE: unpickling executes arbitrary code — only load files this notebook wrote.
roberta_config_path = '../mnt/datasets/model_configs/roberta-model-base-config.pkl'
with open(roberta_config_path, 'rb') as config_file:
    config = pickle.load(config_file)

In [13]:
# Build a RobertaModel from the unpickled config alone (plain constructor call,
# not `from_pretrained`) — presumably the weights are freshly initialised
# rather than pretrained; confirm.
model = RobertaModel(config)

In [14]:
# Move the model to the GPU when CUDA is available; otherwise fall back to the
# notebook-wide DEVICE instead of raising on a CPU-only machine.
# Kept as the cell's last expression so the module repr is still displayed.
model.to('cuda' if torch.cuda.is_available() else DEVICE)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [16]:
# Show the installed transformers version for reproducibility.
# (`transformers` is imported with the other imports at the top of the notebook.)
transformers.__version__

'2.3.0'

## Sandbox: ad-hoc inspection of models and tensors

In [25]:
# Number of rows in RoBERTa's token-type embedding matrix,
# i.e. the first dimension of the stored weight tensor.
token_type_weight = roberta_model.state_dict()['embeddings.token_type_embeddings.weight']
token_type_weight.shape[0]

1

In [15]:
# Display the complete state dict of the pretrained RoBERTa model
# (an ordered mapping of parameter name -> tensor); the output is large.
roberta_model.state_dict()

OrderedDict([('embeddings.word_embeddings.weight',
              tensor([[ 0.1476, -0.0365,  0.0753,  ..., -0.0023,  0.0172, -0.0016],
                      [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
                      [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
                      ...,
                      [ 0.0304,  0.0504, -0.0307,  ...,  0.0377,  0.0096,  0.0084],
                      [ 0.0623, -0.0596,  0.0307,  ..., -0.0920,  0.1080, -0.0183],
                      [ 0.1259, -0.0145,  0.0332,  ...,  0.0121,  0.0342,  0.0168]])),
             ('embeddings.position_embeddings.weight',
              tensor([[-0.0115,  0.0204,  0.0197,  ...,  0.0050, -0.0274, -0.0439],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.0346, -0.0169, -0.0895,  ..., -0.0542,  0.0291,  0.0173],
                      ...,
                      [ 0.1191, -0.0587, -0.0396,  ..., -0.0179,  0.1249,  0.0205

In [10]:
# Display the configuration object attached to the pretrained XLNet model.
xlnet_model.config

{
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "n_head": 12,
  "n_layer": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "torchscript": false,
  "untie_r": true,
  "use_bfloat16": false,
  "vocab_size": 32000
}

In [27]:
# 1-D tensor of 412 zeros with 64-bit integer dtype (`torch.int64` is the same
# dtype as `torch.long`). NOTE(review): the length 412 is unexplained here —
# confirm what it corresponds to.
torch.zeros(412, dtype=torch.int64)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,