In [None]:
from datasets import load_from_disk, load_metric
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, Trainer
import numpy as np
import pyarrow as pa
import pandas as pd

import os
import sys
sys.path.insert(0, '/zhome/a6/6/127219/Speciale/master_project')
from src.models.transformers_modeling_roberta import RobertaForSequenceClassification_fromTransformersLinear, RobertaForSequenceClassification_fromTransformers

In [None]:
# Locations on disk: the SST-2 dataset directory and the two saved
# checkpoints evaluated in this notebook (directory names refer to a
# "linear" and an "original" head).
checkpoint_ori = "/work3/s174498/final/Prob_original_head/checkpoint-1500"
checkpoint_lin = "/work3/s174498/final/Prob_linear_head/checkpoint-2500"
datadir = '/work3/s174498/sst2_dataset/'

# Test split that both models are scored on
test_dataset = load_from_disk(f"{datadir}test_dataset")

# Tokenizers, each restored from the directory of its own checkpoint.
# The length limit lives in `model_max_length`; the former `model_max_len`
# assignment only created an unrelated attribute and had no effect on the
# tokenizer.
tokenizer_lin = RobertaTokenizer.from_pretrained(checkpoint_lin)
tokenizer_lin.model_max_length = 512

tokenizer_ori = RobertaTokenizer.from_pretrained(checkpoint_ori)
tokenizer_ori.model_max_length = 512

# Configs are read from the checkpoints; hidden states are switched on, so
# the model output carries more than one element (later cells index
# `predictions.predictions[0]`).
config_lin = RobertaConfig.from_pretrained(checkpoint_lin)
config_lin.output_hidden_states = True

config_ori = RobertaConfig.from_pretrained(checkpoint_ori)
config_ori.output_hidden_states = True

# Each checkpoint is restored with the class matching its head. The
# linear-head checkpoint was previously loaded with the original-head class,
# leaving the imported `..._fromTransformersLinear` class unused.
# NOTE(review): assumes the `Prob_linear_head` run was trained with
# `RobertaForSequenceClassification_fromTransformersLinear` — confirm.
model_lin = RobertaForSequenceClassification_fromTransformersLinear.from_pretrained(checkpoint_lin, config=config_lin)
model_ori = RobertaForSequenceClassification_fromTransformers.from_pretrained(checkpoint_ori, config=config_ori)

In [None]:
# Turn raw sentences into model inputs (tokenizer of the linear-head checkpoint)
def preprocess_function(examples):
    """Tokenize ``examples["sentence"]`` with ``tokenizer_lin``.

    Truncation is enabled with a maximum length of 512. Returns whatever
    the tokenizer call returns.
    """
    sentences = examples["sentence"]
    encoded = tokenizer_lin(sentences, max_length=512, truncation=True)
    return encoded

In [None]:
# Tokenize the test split in batches with the preprocessing function
# that is currently defined.
tokenized_test_lin = test_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Trainer built with no extra arguments; in this notebook it is only
# used to call `predict` on the linear-head model.
trainer = Trainer(tokenizer=tokenizer_lin, model=model_lin)

In [None]:
# Run the linear-head model over the tokenized test split
predictions = trainer.predict(test_dataset=tokenized_test_lin)


In [None]:
# Gold labels, and the predicted class taken as the arg-max over the last
# axis of the first element of the prediction output
# (presumably the logits, with hidden states after them — TODO confirm).
true_dataset_test = predictions.label_ids
class_scores = predictions.predictions[0]
pred_dataset_test = list(np.argmax(class_scores, axis=-1))

In [None]:
# Test-set accuracy of the linear-head model; the bare last expression
# is what the cell displays.
accuracy_metric = load_metric("accuracy")
accuracy_metric.compute(
    references=true_dataset_test,
    predictions=pred_dataset_test,
)

In [None]:
# Turn raw sentences into model inputs (tokenizer of the original-head checkpoint).
# This rebinds the name `preprocess_function` defined in an earlier cell.
def preprocess_function(examples):
    """Tokenize ``examples["sentence"]`` with ``tokenizer_ori``.

    Truncation is enabled with a maximum length of 512. Returns whatever
    the tokenizer call returns.
    """
    sentences = examples["sentence"]
    encoded = tokenizer_ori(sentences, max_length=512, truncation=True)
    return encoded

In [None]:
# Tokenize the test split in batches with the preprocessing function
# that is currently defined.
tokenized_test_ori = test_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Trainer built with no extra arguments; in this notebook it is only
# used to call `predict` on the original-head model.
trainer = Trainer(tokenizer=tokenizer_ori, model=model_ori)

In [None]:
# Run the original-head model over the tokenized test split
predictions = trainer.predict(test_dataset=tokenized_test_ori)

In [None]:
# Gold labels, and the predicted class taken as the arg-max over the last
# axis of the first element of the prediction output
# (presumably the logits, with hidden states after them — TODO confirm).
true_dataset_test = predictions.label_ids
class_scores = predictions.predictions[0]
pred_dataset_test = list(np.argmax(class_scores, axis=-1))

In [None]:
# Test-set accuracy of the original-head model; the bare last expression
# is what the cell displays.
accuracy_metric = load_metric("accuracy")
accuracy_metric.compute(
    references=true_dataset_test,
    predictions=pred_dataset_test,
)

## SST2 Data description

In [None]:
# Load all three SST-2 splits from the dataset directory
datadir = '/work3/s174498/sst2_dataset/'

test_dataset = load_from_disk(f"{datadir}test_dataset")
validation_dataset = load_from_disk(f"{datadir}validation_dataset")
train_dataset = load_from_disk(f"{datadir}train_dataset")

In [None]:
# Training sentences plus a per-sentence count of whitespace-separated words
df_train = pd.DataFrame({'Sentences': train_dataset['sentence']})

word_in_sentence = [len(sentence.split()) for sentence in df_train['Sentences']]
df_train['words in sentences'] = word_in_sentence

In [None]:
# Validation sentences plus a per-sentence count of whitespace-separated words
df_validation = pd.DataFrame({'Sentences': validation_dataset['sentence']})

word_in_sentence = [len(sentence.split()) for sentence in df_validation['Sentences']]
df_validation['words in sentences'] = word_in_sentence

In [None]:
# Test sentences plus a per-sentence count of whitespace-separated words
df_test = pd.DataFrame({'Sentences': test_dataset['sentence']})

word_in_sentence = [len(sentence.split()) for sentence in df_test['Sentences']]
df_test['words in sentences'] = word_in_sentence

In [None]:
# Tokenizer loaded by the hub name 'roberta-base' (not from one of the
# checkpoint directories); used below to count tokens per sentence.
from transformers import RobertaTokenizer

tokenizer_pretrained = RobertaTokenizer.from_pretrained(
    'roberta-base'
)

In [None]:
# Number of ids returned by `tokenizer_pretrained.encode` for every sentence.
# The variable names do not mention the split they describe:
#   tokens_random -> training sentences
#   tokens_woman  -> validation sentences
#   tokens_man    -> test sentences
# They are kept unchanged because the cells below read them.
tokens_random = [
    len(tokenizer_pretrained.encode(sentence))
    for sentence in df_train['Sentences']
]

tokens_woman = [
    len(tokenizer_pretrained.encode(sentence))
    for sentence in df_validation['Sentences']
]

tokens_man = [
    len(tokenizer_pretrained.encode(sentence))
    for sentence in df_test['Sentences']
]


TRAIN

In [None]:
# Mean, minimum and maximum token count over the training sentences
for statistic in (np.mean, np.min, np.max):
    print(statistic(tokens_random))

VALIDATION

In [None]:
# Mean, minimum and maximum token count over the validation sentences
for statistic in (np.mean, np.min, np.max):
    print(statistic(tokens_woman))

TEST

In [None]:
# Mean, minimum and maximum token count over the test sentences
for statistic in (np.mean, np.min, np.max):
    print(statistic(tokens_man))