## Fine Turning Sentiment Analysis

In [1]:
from transformers import pipeline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
from datasets import load_dataset

In [3]:
raw_datasets =  load_dataset("glue", "sst2")

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [6]:
dir(raw_datasets['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_wi

In [7]:
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [8]:
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [9]:
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [10]:
raw_datasets['train'][5000:5003]

{'sentence': ['entirely stale concept ',
  'will amuse or entertain them ',
  'wobbly premise work '],
 'label': [0, 1, 0],
 'idx': [5000, 5001, 5002]}

In [11]:
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
from transformers import AutoTokenizer

In [13]:
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
tokenizer_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
pprint(tokenizer_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [34]:
# tokenizer function
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], truncation=True)

In [35]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

In [36]:
from transformers import TrainingArguments

In [37]:
training_args = TrainingArguments(
    'my_trainer',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1,
)

In [38]:
from transformers import AutoModelForSequenceClassification

In [39]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [41]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [42]:
!pip install torchinfo



In [43]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [44]:
summary(model, input_size=(16,512), dtypes=['torch.IntTensor'], device='cpu')

Layer (type:depth-idx)                                  Output Shape              Param #
DistilBertForSequenceClassification                     [16, 2]                   --
├─DistilBertModel: 1-1                                  [16, 512, 768]            --
│    └─Embeddings: 2-1                                  [16, 512, 768]            --
│    │    └─Embedding: 3-1                              [16, 512, 768]            23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-3                              [16, 512, 768]            1,536
│    │    └─Dropout: 3-4                                [16, 512, 768]            --
│    └─Transformer: 2-2                                 [16, 512, 768]            --
│    │    └─ModuleList: 3-5                             --                        42,527,232
├─Linear: 1-2                                           [16, 768]                 590,592
├─Dropout: 1-3                 

In [45]:
# save the initial parameters of the model before using, when fine-turning, we end up training all
# the parameters in the network
params_before = []
for name, p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy())

In [46]:
from transformers import Trainer

In [47]:
from datasets import load_metric

In [48]:
metric = load_metric("glue", "sst2")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [49]:
# pass in dummy data
metric.compute(predictions=[1,0,1], references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [50]:
# define custome compute_metrics function
def compute_metrics(logits_and_labels):
    # metric = load("glue", "sst2")
    logits, labels = logits_and_labels
    predictions = npm.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [51]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [52]:
trainer.train()

Epoch,Training Loss,Validation Loss


NameError: name 'npm' is not defined

In [53]:
trainer.save_model('my_saved_model')

In [54]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [55]:
# load model
newmodel = pipeline('text-classification', model='my_saved_model', device=0)

In [56]:
newmodel('This movie is great!')

[{'label': 'LABEL_1', 'score': 0.9990463852882385}]

In [57]:
newmodel('This movie sucks')

[{'label': 'LABEL_0', 'score': 0.9967321157455444}]

In [58]:
import json

In [63]:
config_path = 'my_saved_model/config.json'
with open(config_path) as f:
    j = json.load(f)

j['id2label'] = {0: 'negative', 1: 'positive'}

with open(config_path, 'w') as f:
    json.dump(j, f, indent=2)

In [65]:
newmodel = pipeline('text-classification', model='my_saved_model', device=0)

In [66]:
newmodel('This movie is great!')

[{'label': 'positive', 'score': 0.9990463852882385}]

In [68]:
params_after = []
for name, p in model.named_parameters():
    params_after.append(p.detach().cpu().numpy())

In [70]:
for p1, p2 in zip(params_before, params_after):
    print(np, sum(np.abs(p1 - p2)))

<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> [16.838657 17.374462 16.467867 17.518108 19.365883 20.695784 17.219893
 17.600428 16.72181  16.928194 16.396194 17.736319 16.436676 17.261923
 16.697052 17.362549 16.586794 17.823784 16.839054 16.435575 16.217005
 16.68218  17.359646 16.595253 16.421598 17.370405 17.326672 17.140968
 16.268656 20.046347 17.089739 16.914143 16.628374 16.44189  17.06618
 17.086155 16.97792  18.149485 19.506922 18.11872  17.351877 18.752085
 17.426916 17.113272 18.795778 19.395184 16.50269  18.209927 19.48037
 18.319157 16.799143 17.101822 19.893597 16.348206 16.405632 17.731937
 16.60084  18.121758 19.622362 17.134548 18.399366 16.593405 16.28056
 16.723726 17.872194 18.347198 16.848799 16.524311 20.011345 16.813583
 16.732458 17.02069  16.471708 16.546862 19.166025 20.346783 16.647612
 17.597153 17.447863 16.920732 16.734444 18.605034 18.374483 16.739454
 16.592447 20.664219 17.214676 17.888172 18.19235  17.69122

<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> [0.9621704  1.0044731  0.9062946  ... 1.0020876  1.1927615  0.81316274]
<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> 0.9408429823379265
<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> 1.3160230070352554
<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> 0.6369855376251508
<module 'numpy' from 'C:\\Users\\kaizi\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'> [1.5169121  1.4469173  0.9590959  1.2248399  1.0137361  1.0947111
 1.0360843  0.8576314  0.8548131  0.9803916  0.90253997 1.2142256
 1.0232309  0.957461   0.9163346  1.065058   1.0796217  1.5531063
 0.8504425  1.3050941  0.9382553  0.99054855 1.6986247  1.0440612
 1.1006984  0.73271245 1.3817399  1.535639   1.4691976  1.0718545
 1.2173321  1.2479494  1.6196114  1.0552602  1.084013   0.7570366
 1.0007368  0.9

In [71]:
# all weights are changed