# Step 1: Install And Import Python Libraries

In [1]:

import sys

# NOTE(review): machine-specific absolute path; adjust it (or install the
# project as a package) before running this notebook on another machine.
MODELS_DIR = '/Users/johanneswidera/Uni/bachelorarbeit/Code/models/'

# Put the project's models directory at the front of the import path,
# presumably so that the `Sentiment` package imported in the next cell
# resolves. The membership check keeps repeated runs of this cell from
# stacking duplicate entries onto sys.path.
if MODELS_DIR not in sys.path:
    sys.path.insert(0, MODELS_DIR)


In [2]:
# Data processing
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import contractions
import re
import string

from Sentiment.pipeline.helper import download_data, read_imdb_split

# Modeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Step 2: Download And Read Data

In [3]:
# Fetch the dataset through the project helper. The call is the last
# expression of the cell, so whatever it returns is shown as the cell output.
download_data()

'Already downloaded and extracted.'

In [4]:
# https://www.aboutdatablog.com/post/how-to-successfully-add-large-data-sets-to-google-drive-and-use-them-in-google-colab

# Read the train and test splits from the extracted aclImdb directory.
# read_imdb_split is a project helper; each call is unpacked into two values,
# presumably the review texts and their parallel labels (verify against the helper).
corpus_train, y_train = read_imdb_split('../data/aclImdb/train')
corpus_test, y_test = read_imdb_split('../data/aclImdb/test')


  soup = BeautifulSoup(text, 'html.parser')


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/validation split (and therefore every metric
# computed on it) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of the training reviews as a validation set.
# NOTE: this rebinds corpus_train / y_train, so re-running this cell without
# re-running the loading cell first would split the already reduced set again.
corpus_train, corpus_val, y_train, y_val = train_test_split(
    corpus_train, y_train, test_size=.2, random_state=SPLIT_SEED
)

# Number of reviews left for training after the hold-out.
len(corpus_train)

20000

# Step 4: Convert Pandas Dataframe To Hugging Face Dataset


Hugging Face Dataset objects are memory-mapped on disk, so they are not limited by the available RAM, which is very helpful for processing large datasets.

In [6]:
from transformers import DistilBertTokenizerFast
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint; it is used
# below to encode the train, validation and test texts.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [7]:
# Tokenize the three splits with identical settings (truncation and padding
# enabled), in the order train -> validation -> test.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (corpus_train, corpus_val, corpus_test)
)

In [8]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to an indexable collection holding
            one entry per example (it must support ``.items()``).
        labels: indexable collection with one label per example.

    Indexing returns a dict with one tensor per encoding field plus a
    ``'labels'`` tensor for the requested example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        # Convert the idx-th entry of every encoding field into a tensor.
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset
# (train, validation, test - built in that order).
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)

# Step 6: Load Pretrained Model




- AutoModelForSequenceClassification loads the pretrained BERT model and adds a sequence classification head on top of it.
- The method from_pretrained() loads the weights from the pretrained model into the new model, so the weights in the new model are not randomly initialized. Note that the new weights for the new sequence classification head are going to be randomly initialized.
- distilbert-base-uncased is the name of the pretrained model used in this notebook. We can change it to a different model based on the nature of the project.
- num_labels indicates the number of classes. Our dataset has two classes, positive and negative, so num_labels=2.


In [9]:
# Load model
# DistilBERT with a sequence-classification head, initialised from the same
# checkpoint name as the tokenizer. num_labels is not passed here, so the
# library/config default applies (presumably 2 - confirm).
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

# Step 7: Set Training Arguments

Hugging Face's TrainingArguments has 96 parameters, which provide a lot of flexibility in fine-tuning the transfer learning model.

In [10]:
!pip install transformers[torch]
!pip install tensorboard

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
zsh:1: no matches found: transformers[torch]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
zsh:1: /Users/johanneswidera/Uni/bachelorarbeit/Code/models/HUGGING_ENV/bin/pip: bad interpreter: /Users/johanneswidera/Uni/bachelorarbeit/Code/hugging-face/HUGGING_ENV/bin/python3: no such file or directory


In [11]:
# for visualizing the train/val loss for optimal training

In [12]:
# Function to compute the metric
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair that unpacks into ``(logits, labels)``; the logits are
            reduced with an argmax over axis 1 to obtain predicted class ids.

    Returns:
        The result of the ``evaluate`` accuracy metric's ``compute`` call
        (presumably a dict such as ``{"accuracy": ...}`` - confirm).
    """
    # Load the metric once and reuse it: this function is called on every
    # evaluation, and reloading the metric each time is wasted work.
    metric = getattr(compute_metrics, "_accuracy_metric", None)
    if metric is None:
        metric = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric

    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
# Load the TensorBoard notebook extension and open it on ./logs, the same
# directory that the training arguments below use as logging_dir.
# Requires the `tensorboard` package to be importable in the kernel environment.
%load_ext tensorboard
%tensorboard --logdir ./logs

ModuleNotFoundError: No module named 'tensorboard'

In [14]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Experiment notes from earlier runs, kept verbatim ("abbruch" = aborted).
"""
During the training process, the model is trained on the training set (20k samples) and validated
on the validation set (5k samples). The model is trained for 3 epochs, with a batch size of 16
 for training and 64 for evaluation. The training process also includes a warm-up phase of 500
steps, where the learning rate is gradually increased, and weight decay of 0.01. The evaluation
strategy is set to "steps," meaning the model is evaluated at every logging step

try to find the best property for batch_size
1st try:
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,

    abbruch:
      - to slow
      -  [ 591/3750 1:27:44 < 7:50:36, 0.11 it/s, Epoch 0.47/3]

2nd try:
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

"""

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Logging interval and step-based evaluation.
    logging_steps=100,
    evaluation_strategy="steps",
)

# The model is re-created from the pretrained checkpoint inside this cell.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Wire the model, arguments, datasets and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,    # split the model is fitted on
    eval_dataset=val_dataset,       # split used for evaluation during training
)

trainer.train()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 1/3750 [00:08<9:09:36,  8.80s/it]

KeyboardInterrupt: 

## Explain training

During the training process, the model is trained on the training set (20k samples) and validated
on the validation set (5k samples). The model is trained for 3 epochs, with a batch size of 16
for training and 64 for evaluation. The training process also includes a warm-up phase of 500
steps, where the learning rate is gradually increased, and a weight decay of 0.01. The evaluation
strategy is set to "steps", meaning the model is evaluated at every logging step (every 100 training steps, as set by logging_steps).

## Find the best parameters
The evaluation number 625 is the number of evaluation batches.
It is calculated by dividing the number of samples in the validation dataset (5,000) by the per_device_eval_batch_size (8 in the second attempt): 5,000 / 8 = 625. To speed up the evaluation process, increase the following parameters:

 - per_device_eval_batch_size (fewer batches per evaluation run)
 - logging_steps (fewer evaluation runs during training)


# Step 8: Set Evaluation Metrics

In step 8, we will set the evaluation metric because Hugging Face Trainer does not evaluate the model performance automatically during the training process.

# Step 9: Train Model Using Transformer Trainer

# Step 11: Evaluate Model Performance

In [None]:
# Trainer evaluate
# Run the trainer's evaluation on the held-out test dataset; the returned
# value is displayed as the cell output.
trainer.evaluate(test_dataset)

# Step 12: Save and Load The Model

In [None]:
# Save tokenizer
# Tokenizer and model are written to the same directory.
tokenizer.save_pretrained('./sentiment_transfer_learning_transformer/')

# Save model
trainer.save_model('./sentiment_transfer_learning_transformer/')

In [None]:
# !zip -r sentiment_transfer_learning_transformer.zip sentiment_transfer_learning_transformer/

In [None]:


# `pipeline` is not part of the notebook's import cell; import it here so this
# cell does not raise NameError on a fresh kernel.
from transformers import pipeline

# Load the tokenizer that was saved with the fine-tuned model.
# NOTE: this rebinds `tokenizer`, replacing the base tokenizer loaded earlier.
tokenizer = AutoTokenizer.from_pretrained("../../FineTunedBERT/Sentiment/24060142")
# Load model
loaded_model = AutoModelForSequenceClassification.from_pretrained("../../FineTunedBERT/Sentiment/24060142")
# Build a sentiment-analysis pipeline around the fine-tuned model. Inputs are
# truncated to at most 512 tokens; top_k=None is passed so that scores for all
# labels are returned instead of only the best one.
model_bert = pipeline('sentiment-analysis', model=loaded_model, tokenizer=tokenizer, max_length=512, truncation=True, top_k=None)

# Step 13: Analysis with SHAP