<a href="https://colab.research.google.com/github/gnudennis/applied-image-processing-with-deep-learning/blob/main/fine_tune_transformers_on_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece] accelerate

In [None]:
import torch
from torch import nn
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Report the installed torch and transformers versions, one per line.
print(torch.__version__, transformers.__version__, sep='\n')

In [None]:
import matplotlib as mpl

# Render figures at 200 DPI.
mpl.rcParams.update({'figure.dpi': 200})

## text classification

- 也叫 sequence classification
- sentiment analysis
  - 情感分析，也是文本/序列分类
    - 电商评价
    - social web: weibo/tweet

#### emotions数据集

In [None]:
from datasets import load_dataset

In [None]:
# Load the 'emotion' dataset with Hugging Face `datasets` and keep it as `emotions`.
emotions = load_dataset('emotion')

In [None]:
# Display the loaded object. Per the original note it is a DatasetDict
# whose splits are in an 8:1:1 ratio.
emotions

In [None]:
# Show the keys of `emotions` (the split names, e.g. 'train').
emotions.keys()

In [None]:
# Inspect the training split: the split object and its type, then the first
# five texts, the first five labels, and finally the first five rows together.
print(emotions['train'], type(emotions['train']))
print(emotions['train']['text'][:5])
print(emotions['train']['label'][:5])
print(emotions['train'][:5])

In [None]:
# Inspect the schema of the training split and its type.
print(emotions['train'].features, type(emotions['train'].features))
# The 'label' feature on its own.
print(emotions['train'].features['label'])
# Name of class id 0, followed by the full list of class names.
print(emotions['train'].features['label'].int2str(0))
print(emotions['train'].features['label'].names)

In [None]:
def int2str(idx):
    """Return the class name for the integer label ``idx``.

    The names are read from the 'label' feature of the training split of the
    module-level ``emotions`` dataset.

    Raises:
        ValueError: If ``idx`` is negative or not smaller than the number of
            classes.
    """
    class_names = emotions['train'].features['label'].names
    if idx < 0 or idx >= len(class_names):
        raise ValueError(f'Invalid integer class label {idx}')
    return class_names[idx]

In [None]:
# Quick check of the helper: look up the name of class id 2.
int2str(2)

### data visualization analysis

- dataset ==> DataFrame
- label analysis: label freq
- text length

#### dataset to dataframe

In [None]:
# Convert the training split to a pandas DataFrame. Use the dataset's own
# pandas export rather than `DataFrame.from_dict`, which is meant for plain
# dicts and not for a `datasets.Dataset`.
emotions_df = emotions['train'].to_pandas()
# Show the frame's shape and column names, then the frame itself.
print(emotions_df.shape, emotions_df.columns)
emotions_df

In [None]:
# Add a human-readable class name next to each integer label.
emotions_df['label_name'] = emotions_df['label'].apply(int2str)
# Preview the first five rows.
emotions_df.head(5)

#### label analysis

In [None]:
# Count how many training rows carry each integer label.
emotions_df.label.value_counts()

In [None]:
# Same frequency count keyed by class name; the type of the result is printed first.
print(type(emotions_df.label_name.value_counts()))
emotions_df.label_name.value_counts()

In [None]:
# Horizontal bar chart of class frequencies, sorted ascending.
plt.figure(figsize=(4, 3))
emotions_df['label_name'].value_counts(ascending=True).plot.barh()
plt.title('freq of labels')

#### text length analysis

In [None]:
# Number of whitespace-separated words in each text.
emotions_df['words per tweet'] = emotions_df['text'].str.split().apply(len)
# DataFrame.boxplot opens its own figure when no `ax` is given, so the size is
# passed to it directly. A preceding plt.figure(figsize=...) call would only
# leave an empty extra figure behind without sizing this plot.
emotions_df.boxplot('words per tweet', by='label_name',
                    # showfliers=False,  # uncomment to hide outliers
                    grid=False,
                    color='black',
                    figsize=(4, 3))
# Clear the automatic "grouped by" super-title and the x-axis label.
plt.suptitle('')
plt.xlabel('')

In [None]:
# Word count of the longest text, then the index label of that row.
print(emotions_df['words per tweet'].max())
print(emotions_df['words per tweet'].idxmax())

In [None]:
# idxmin() returns an index *label*, so the row is looked up with .loc
# (label-based) rather than .iloc (position-based); the two only agree while
# the frame keeps its default 0..n-1 index.
shortest_row = emotions_df.loc[emotions_df['words per tweet'].idxmin()]
# Print the whole row, then just its text.
print(shortest_row)
print(shortest_row.text)

### text => tokens

数据集转换为模型接受的输入类型

- Subword Tokenization
  - WordPiece
    - BERT and DistilBERT
- hugging face:
  - ~/.cache/huggingface/
- tokenizer
  - tokenizer.vocab_size
- model config
  - tokenizer.model_max_length
  - tokenizer.model_input_names

#### tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# The checkpoint is uncased, so every casing of the phrase is expected to
# encode to the same ids.
print(
    *(tokenizer.encode(phrase) for phrase in ('hello world', 'Hello world', 'HELLO WORLD')),
    sep='\n',
)

In [None]:
# Display the tokenizer object's repr.
tokenizer

In [None]:
# Print the tokenizer's `model_max_length` and `model_input_names` attributes.
print(tokenizer.model_max_length)
print(tokenizer.model_input_names)

In [None]:
# Print each special token id next to the result of decoding it.
for special_id in tokenizer.all_special_ids:
    print(special_id, tokenizer.decode(special_id))

#### tokenize the whole dataset

In [None]:
# Tokenize every split. batched=True with batch_size=None hands each split to
# the tokenizer as one batch, so padding=True pads all of its sequences to a
# common length. In map's default one-example-at-a-time mode there is nothing
# to pad against, so rows would keep different lengths (and the pass is slower).
emotions_encoded = emotions.map(
    lambda batch: tokenizer(batch['text'], padding=True, truncation=True),
    batched=True,
    batch_size=None,
)

In [None]:
# Display the tokenized dataset.
emotions_encoded

In [None]:
# Type of the 'input_ids' column before the torch format is applied, then
# its first three entries.
print(type(emotions_encoded['train']['input_ids']))
emotions_encoded['train']['input_ids'][:3]

In [None]:
# list to tensor: set the 'torch' format on the three listed columns,
# then display the dataset again.
emotions_encoded.set_format('torch', columns=['label', 'input_ids', 'attention_mask'])
emotions_encoded

In [None]:
# Re-check the 'input_ids' column type now that the torch format is set, then
# look at the first three attention masks.
print(type(emotions_encoded['train']['input_ids']))
emotions_encoded['train']['attention_mask'][:3]