In [1]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from transformers import AutoModelForSequenceClassification

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [8]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

## Load and prepare the data

In [9]:
data = {'description': ['Das weiße Tshirt.', 'Ein Kleid mit Blumen.', 'Eine lange Jeanshose.',
                 'Tshirt mit Muster.', 'Hose mit Print und Taschen.', 'Ausfallendes Kleid in lila.',
                 'Lang geschnittenes Shirt aus Wolle.', 'Edles Kleid aus Satin.', 'Culotte mit weitem Bein.',
                 'Kurze Ärmel und weite Passform machen das Shirt zum Allrounder.',
                 'Dicker Stoff macht die Hose zum idealen Winterbegleiter.', 'Kleid mit dünnen Trägern.',
                 'Top mit Print.', 'Schlaghose aus den 70ern.', 'Blaue Jeans mit Blumenapplikationen.',
                 'Gestricktes Kleid mit V-Ausschnitt.', 'Langärmliges Shirt mit Streifen.',
                 'Kurze Shorts mit Taschen.', 'Trägertop in rosa.', 'Hemdblusenkleid aus Strickstoff.', 'Kurze Jeans mit Strassapllikation.', 'Blaues Basic-Shirt mit Rundhalsausschnitt.'],
        'labels': ['Tshirt', 'Kleid', 'Hose', 'Tshirt', 'Hose', 'Kleid', 'Tshirt', 'Kleid', 'Hose', 'Tshirt',
                  'Hose', 'Kleid', 'Tshirt', 'Hose', 'Hose', 'Kleid', 'Tshirt', 'Hose', 'Tshirt', 'Kleid', 'Hose', 'Tshirt']}
df = pd.DataFrame(data)

print(df)


                                          description  labels
0                                   Das weiße Tshirt.  Tshirt
1                               Ein Kleid mit Blumen.   Kleid
2                               Eine lange Jeanshose.    Hose
3                                  Tshirt mit Muster.  Tshirt
4                         Hose mit Print und Taschen.    Hose
5                         Ausfallendes Kleid in lila.   Kleid
6                 Lang geschnittenes Shirt aus Wolle.  Tshirt
7                              Edles Kleid aus Satin.   Kleid
8                            Culotte mit weitem Bein.    Hose
9   Kurze Ärmel und weite Passform machen das Shir...  Tshirt
10  Dicker Stoff macht die Hose zum idealen Winter...    Hose
11                          Kleid mit dünnen Trägern.   Kleid
12                                     Top mit Print.  Tshirt
13                          Schlaghose aus den 70ern.    Hose
14               Blaue Jeans mit Blumenapplikationen.    Hose
15      

### Encode the labels

In [10]:
possible_labels = df.labels.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'Tshirt': 0, 'Kleid': 1, 'Hose': 2}

In [11]:
df['labels'] = df.labels.replace(label_dict)

In [12]:
df

Unnamed: 0,description,labels
0,Das weiße Tshirt.,0
1,Ein Kleid mit Blumen.,1
2,Eine lange Jeanshose.,2
3,Tshirt mit Muster.,0
4,Hose mit Print und Taschen.,2
5,Ausfallendes Kleid in lila.,1
6,Lang geschnittenes Shirt aus Wolle.,0
7,Edles Kleid aus Satin.,1
8,Culotte mit weitem Bein.,2
9,Kurze Ärmel und weite Passform machen das Shir...,0


### Train and Validation split

In [14]:
train_ratio = 0.5
validation_ratio = 0.2
test_ratio = 0.3

# train is now 75% of the entire data set
X_train, X_val, y_train, y_val = train_test_split(df.description.values, df.labels.values, test_size=1 - train_ratio, random_state=42, stratify=df.labels.values)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42, stratify=y_val) 

In [22]:
# convert array to list
np.ndarray.tolist(X_train)

['Eine lange Jeanshose.',
 'Das weiße Tshirt.',
 'Blaue Jeans mit Blumenapplikationen.',
 'Kleid mit dünnen Trägern.',
 'Tshirt mit Muster.',
 'Hemdblusenkleid aus Strickstoff.',
 'Blaues Basic-Shirt mit Rundhalsausschnitt.',
 'Hose mit Print und Taschen.',
 'Langärmliges Shirt mit Streifen.',
 'Ein Kleid mit Blumen.',
 'Kurze Shorts mit Taschen.']

In [24]:
# convert array to list
np.ndarray.tolist(X_val)

['Culotte mit weitem Bein.',
 'Trägertop in rosa.',
 'Schlaghose aus den 70ern.',
 'Edles Kleid aus Satin.']

In [25]:
# convert array to list
np.ndarray.tolist(X_test)

['Kurze Ärmel und weite Passform machen das Shirt zum Allrounder.',
 'Top mit Print.',
 'Dicker Stoff macht die Hose zum idealen Winterbegleiter.',
 'Gestricktes Kleid mit V-Ausschnitt.',
 'Ausfallendes Kleid in lila.',
 'Kurze Jeans mit Strassapllikation.',
 'Lang geschnittenes Shirt aus Wolle.']

In [27]:
# convert array to list
np.ndarray.tolist(y_train)

[2, 0, 2, 1, 0, 1, 0, 2, 0, 1, 2]

In [28]:
# convert array to list
np.ndarray.tolist(y_val)

[2, 0, 2, 1]

In [29]:
# convert array to list
np.ndarray.tolist(y_test)

[0, 0, 2, 1, 1, 2, 0]

### Tokenization and data encoding

In [31]:
# initilalize the tokenizer
checkpoint = "deepset/gbert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer

Downloading:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/362 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/240k [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='deepset/gbert-base', vocab_size=31102, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [32]:
# define a function for encoding
def encode(docs):
    '''
    This function takes list of texts and returns input_ids and attention_mask of texts
    '''
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, max_length=20, padding='max_length',
                            return_attention_mask=True, truncation=True, return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return input_ids, attention_masks

In [33]:
# usage of the 'encode' function
train_input_ids, train_att_masks = encode(X_train.tolist())
valid_input_ids, valid_att_masks = encode(X_val.tolist())
test_input_ids, test_att_masks = encode(X_test.tolist())

In [35]:
# create tensors of labels
labels_train = torch.tensor(y_train)

In [37]:
labels_val = torch.tensor(y_val)

In [39]:
labels_test = torch.tensor(y_test)

In [41]:
# put the dataset together
dataset_train = TensorDataset(train_input_ids, train_att_masks, labels_train)
dataset_val = TensorDataset(valid_input_ids, valid_att_masks, labels_val)
dataset_test = TensorDataset(test_input_ids, test_att_masks, labels_test)

#### Get the tokens from the token ids to use later

In [44]:
# Get the tokens for each sample in the training set
train_tokens = []
for input_id in train_input_ids:
    train_tokens.append(tokenizer.convert_ids_to_tokens(input_id.tolist()))

# Get the tokens for each sample in the validation set
valid_tokens = []
for input_id in valid_input_ids:
    valid_tokens.append(tokenizer.convert_ids_to_tokens(input_id.tolist()))
    
# Get the tokens for each sample in the test set
test_tokens = []
for input_id in test_input_ids:
    test_tokens.append(tokenizer.convert_ids_to_tokens(input_id.tolist()))

## Transformer model

In [48]:
len(label_dict)

3

In [50]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                      num_labels=len(label_dict),
                                                      output_attentions=True,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

## Data Loaders

In [51]:
#from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), #Doku lesen ob bei seq sampler die reihenfolge gleich
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), #Doku lesen ob bei seq sampler die reihenfolge gleich
                                   batch_size=batch_size)

## Optimizers and schedulers

In [54]:
#from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters())#,
                  #lr=5e-5, #1e-5
                  #eps=1e-8)
                  
epochs = 5

#scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            #num_warmup_steps=0,
                                           # num_training_steps=len(dataloader_train)*epochs)

#This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead

## Define Performance Metrics

In [55]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [56]:
from sklearn.metrics import accuracy_score

def overallaccuracy(labels, preds):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)


In [57]:
# MCC
from sklearn.metrics import matthews_corrcoef

def mcc_score_func(labels, preds):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return matthews_corrcoef(labels_flat, preds_flat)


In [58]:

def accuracy_per_class_df(preds, labels, label_dict):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    acc_df = pd.DataFrame(columns=["class", "correct_preds", "total_preds"])

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        class_name = label_dict_inverse[label]
        correct_preds = len(y_preds[y_preds==label])
        total_preds = len(y_true)
        acc_df = acc_df.append({"class": class_name, "correct_preds": correct_preds, "total_preds": total_preds}, ignore_index=True)
    
    return acc_df

## Training Loop

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [60]:
model = model.cuda()

In [61]:
from collections import defaultdict
misclassified_examples_dict = defaultdict(list)

In [62]:
len(misclassified_examples_dict)

0

In [63]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val): 

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    global attention_weights
    attention_weights = []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0], 
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
        #ATTENTION WEIGHTS    
        attention_weights_final_layer = outputs.attentions[11].cpu().detach().numpy()
        for sampleindex in range(len(inputs['input_ids'])):
            sample_attention_weights = []
            for keyindex in range(len(attention_weights_final_layer[sampleindex, 0, 0])):
                x = attention_weights_final_layer[sampleindex, :, 0, keyindex].mean() 
                sample_attention_weights.append(x)
            attention_weights.append(sample_attention_weights)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

 
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    
    preds = np.argmax(predictions, axis=1)
    misclassified = np.where(preds != true_vals)[0]
    

    
    texts = X_val.tolist()
    
    global misclassified_examples
    misclassified_examples = []
    
    global misclassified_examples_dict

    misclassified_examples_dict = defaultdict(list)
    val_indices = np.arange(len(X_val))

    for idx in misclassified: 
        text = texts[idx]
        true_label = true_vals[idx]
        pred_label = preds[idx]
        val_index = val_indices[idx]
        misclassified_examples.append({
            'text': text,
            'true_label': true_label,
            'pred_label': pred_label,
            'val_index': val_index
        })

            
    return loss_val_avg, predictions, true_vals

step_count = 0
train_loss_per_step = [] 
val_loss_per_step = [] 
val_acc_per_step = []

train_losses = [] 
val_losses = []
overall_accuracy = []


for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        
#

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        #scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
        step_count += 1  
        
        if step_count % 3 == 0:  
            loss_train_avg_per_step = loss_train_total/len(dataloader_train) 
            train_loss_per_step.append((step_count,loss_train_avg_per_step)) 
            
    
    loss_train_avg = loss_train_total/len(dataloader_train) 
    train_losses.append(loss_train_avg) 

    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation) 
    
    if step_count % 3 == 0:  
            loss_val_per_step = val_loss 
            val_loss_per_step.append((step_count,loss_val_per_step)) 
            
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    
    val_overallaccuracy = overallaccuracy(true_vals, predictions)
    tqdm.write(f'Overall Accuracy: {val_overallaccuracy}')
    
    if step_count % 3 == 0:  
            acc_per_step = val_loss 
            val_acc_per_step.append((step_count,val_overallaccuracy))
    
    val_mcc = mcc_score_func(true_vals, predictions)
    tqdm.write(f'MCC: {val_mcc}')
    
    val_acc = accuracy_per_class(predictions, true_vals)
    tqdm.write(f'Accuracy per class: {val_acc}')
    
    val_losses.append(val_loss) 
    overall_accuracy.append(val_overallaccuracy) 
    
    
    global val_acc_df
    val_acc_df = accuracy_per_class_df(predictions, true_vals, label_dict)



  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4 [00:00<?, ?it/s]


Epoch 1
Training loss: 2.0209629237651825
Validation loss: 1.0202228128910065
F1 Score (Weighted): 0.1
Overall Accuracy: 0.25
MCC: 0.0
Class: Tshirt
Accuracy: 0/1

Class: Kleid
Accuracy: 1/1

Class: Hose
Accuracy: 0/2

Accuracy per class: None


Epoch 2:   0%|          | 0/4 [00:00<?, ?it/s]


Epoch 2
Training loss: 2.479090854525566
Validation loss: 1.3147750794887543
F1 Score (Weighted): 0.3333333333333333
Overall Accuracy: 0.5
MCC: 0.0
Class: Tshirt
Accuracy: 0/1

Class: Kleid
Accuracy: 0/1

Class: Hose
Accuracy: 2/2

Accuracy per class: None


Epoch 3:   0%|          | 0/4 [00:00<?, ?it/s]


Epoch 3
Training loss: 1.695324033498764
Validation loss: 1.726254403591156
F1 Score (Weighted): 0.1
Overall Accuracy: 0.25
MCC: 0.0
Class: Tshirt
Accuracy: 1/1

Class: Kleid
Accuracy: 0/1

Class: Hose
Accuracy: 0/2

Accuracy per class: None


Epoch 4:   0%|          | 0/4 [00:00<?, ?it/s]


Epoch 4
Training loss: 1.1695346981287003
Validation loss: 1.174277514219284
F1 Score (Weighted): 0.3333333333333333
Overall Accuracy: 0.5
MCC: 0.0
Class: Tshirt
Accuracy: 0/1

Class: Kleid
Accuracy: 0/1

Class: Hose
Accuracy: 2/2

Accuracy per class: None


Epoch 5:   0%|          | 0/4 [00:00<?, ?it/s]


Epoch 5
Training loss: 1.053799569606781
Validation loss: 1.0717858672142029
F1 Score (Weighted): 0.3333333333333333
Overall Accuracy: 0.5
MCC: 0.0
Class: Tshirt
Accuracy: 0/1

Class: Kleid
Accuracy: 0/1

Class: Hose
Accuracy: 2/2

Accuracy per class: None


### Attention weights

In [70]:
# transform list to df
misclassifications = pd.DataFrame(misclassified_examples)

# set the maximum column width to a large number to display full text
pd.set_option('display.max_colwidth', None) #alternativ statt none eine Zahl um die Anzahl an Zeichen die geprintet werden sollen festzulegen

# add a new column to the DataFrame with the token information
tokens_series = pd.Series(valid_tokens, name='tokens')

# only join tokens to the rows that have a matching val_index in df
misclassifications = pd.merge(misclassifications, tokens_series, left_on='val_index', right_index=True, how='left')

# you can set the index column to be the val_index column
misclassifications.set_index('val_index', inplace=True)

# add a new column to the DataFrame with the attention weights
weights_series = pd.Series(attention_weights, name='attention_weights')

# only join weights to the rows that have a matching index in df
misclassifications = pd.merge(misclassifications, weights_series, left_index=True, right_index=True, how='left')

# view the resulting dataframe
misclassifications

Unnamed: 0_level_0,text,true_label,pred_label,tokens,attention_weights
val_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Trägertop in rosa.,0,2,"[[CLS], Träger, ##top, in, ro, ##sa, ., [SEP], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD]]","[0.124999814, 0.12500016, 0.12499993, 0.1250001, 0.12500006, 0.12499984, 0.12500003, 0.12500009, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,Edles Kleid aus Satin.,1,2,"[[CLS], Ed, ##les, Kleid, aus, Sat, ##in, ., [SEP], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PAD]]","[0.111110836, 0.11111111, 0.11111096, 0.11111129, 0.111111276, 0.111110985, 0.1111111, 0.11111119, 0.111111216, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
