In [54]:
import pandas as pd
import torch
from datasets import load_dataset, list_datasets, list_metrics, load_from_disk, Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel

In [6]:
# Load the raw data from data.csv (path is relative to the working directory)
# into a pandas DataFrame.
df = pd.read_csv('data.csv')

In [10]:
# Wrap the DataFrame in a `datasets.Dataset`; the bare `ds` on the last line
# displays its summary as the cell output.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['sentence_a', 'sentence_b', 'label_id'],
    num_rows: 247568
})

In [13]:
# Hold out 20% of the rows as a test split. The fixed seed makes the shuffle,
# and therefore the train/test membership, reproducible across kernel restarts.
# NOTE: `ds` is rebound from a Dataset to a DatasetDict here, so this cell must
# not be re-run without first re-running the cell that builds `ds`.
ds = ds.train_test_split(test_size=0.2, seed=42)

In [14]:
# Display the splits and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['sentence_a', 'sentence_b', 'label_id'],
        num_rows: 198054
    })
    test: Dataset({
        features: ['sentence_a', 'sentence_b', 'label_id'],
        num_rows: 49514
    })
})

In [17]:
# Load the BertTokenizer that belongs to the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [47]:
def encode(e, max_length=50):
    """Tokenize a batch of rows on the fly, for use with `set_transform`.

    Args:
        e: Batch in columnar form, i.e. a dict mapping a column name to a list
            of values. Only the 'sentence_a' and 'label_id' columns are read.
        max_length: Length every token sequence is padded or truncated to.

    Returns:
        A columnar batch with one entry per input row: 'inputs' is a list of
        dicts mapping each tokenizer output field to a tensor, and 'label' is
        the list of label ids.
    """
    sentences = list(e['sentence_a'])
    labels = list(e['label_id'])

    # Nothing to tokenize; return an empty batch of the same shape.
    if not sentences:
        return {'inputs': [], 'label': []}

    # Encode every row of the batch, not just the first one: the transform can
    # be handed several rows at once (slicing, batched fetching), and returning
    # a single row for a multi-row request silently drops data.
    encoded = tokenizer(sentences, max_length=max_length, truncation=True, padding='max_length')

    # Re-shape the tokenizer's column-wise output into one dict of tensors per row.
    inputs = [
        {field: torch.tensor(values[row]) for field, values in encoded.items()}
        for row in range(len(sentences))
    ]

    # NOTE(review): 'sentence_b' exists in the dataset but is not encoded here;
    # confirm that single-sentence input is intended.
    return {'inputs': inputs, 'label': labels}

In [48]:
# Register `encode` as the transform for every split of `ds`; per the
# `datasets` API it is applied when rows are read, not up front.
ds.set_transform(encode)

In [50]:
# Sanity check: print the first training example as produced by the transform.
print(ds['train'][0])

{'inputs': {'input_ids': tensor([ 101, 6369, 5050, 3322, 1724, 5277,  680, 6369, 5050, 3322, 5440, 4777,
        1525,  702, 7410, 8043,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}, 'label': 0}


In [51]:
# Batch the transformed training split two examples at a time.
# `DataLoader` is the class from `torch.utils.data`.
loader = DataLoader(ds['train'], batch_size=2)

In [56]:
# Pull the first batch from the loader and keep only its 'inputs' entry.
inputs = next(iter(loader))['inputs']

In [55]:
# Load the pretrained BertModel for the same 'bert-base-chinese' checkpoint
# that the tokenizer was loaded from.
bert = BertModel.from_pretrained('bert-base-chinese')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
# Run the batch through BERT; the entries of `inputs` are passed as keyword arguments.
# NOTE(review): this call is not wrapped in `torch.no_grad()`; if this is
# inference only, wrapping it would avoid tracking gradients. Confirm intent.
outputs = bert(**inputs)

In [63]:
# Display the model output object as the cell result.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 4.8468e-01, -1.5569e-01, -1.7471e-01,  ...,  1.6859e-01,
          -5.5856e-01,  3.8753e-01],
         [ 4.4400e-01,  5.8182e-01,  1.2014e+00,  ..., -5.0518e-02,
          -6.3436e-01, -2.6499e-02],
         [ 6.0337e-01, -1.6424e-01,  8.4813e-02,  ...,  2.6409e-01,
           2.1916e-01, -1.2433e-01],
         ...,
         [ 7.6617e-01,  2.4411e-02, -4.8762e-01,  ...,  7.2499e-01,
          -3.7644e-02,  4.8437e-02],
         [ 6.4181e-01,  1.0708e-01, -6.2491e-01,  ...,  6.9181e-01,
          -2.4639e-01,  1.4376e-01],
         [ 2.0560e-01,  4.8218e-01, -8.6308e-01,  ...,  9.8600e-03,
          -1.0057e-01,  1.1442e-01]],

        [[-1.2120e-03,  3.7300e-01,  4.8251e-02,  ..., -7.1353e-02,
           3.9020e-02, -1.9349e-01],
         [ 6.2819e-01,  2.9261e-01,  8.9257e-01,  ..., -7.8841e-01,
          -6.4216e-01,  1.4592e-01],
         [ 6.7330e-01,  3.3981e-01,  1.4284e+00,  ..., -1.9897e-01,
           4.