# Training script

### Imports

In [1]:
import pandas as pd
from xgboost import XGBRFClassifier
from sklearn.multioutput import MultiOutputClassifier
from transformers import AutoTokenizer, RobertaModel
from datasets import load_dataset
from torch import nn
import spacy
import nltk
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

nlp = spacy.load("en_core_web_sm")

### Constants

In [2]:
TRANSFORMER_MODEL_NAME = 'roberta-base'

### Loading the dataset

In [3]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service == 'restaurant':
            services.add('restaurant')
        elif service == 'hotel':
            services.add('hotel')
        else:
            services.add('other')
        if len(services) == 3:
            break
    return list(services)

def preprocess_split(dataset, split):
    df = dataset[split].to_pandas()
    new_df = pd.DataFrame(columns = df.columns)
    for i in range(len(df)):
        # Taken from notebook, to know which lines to skip
        row = df.loc[i]
        if not any(set(row.turns['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(row.turns['utterance'])):
            continue
        new_df.loc[len(new_df)] = row
        # new_df.loc[len(new_df) - 1]['services'] = process_service_list(new_df.loc[len(new_df) - 1]['services'])
        # for i, frame_service in [frame['service'] for frame in df.loc[i].turns['frames']]:
            # df.loc[i].turns['frames']
    return new_df

def extract_utterance_features(utterance):
    doc = nlp(utterance)
    
    for token in doc:
        print(token)
    
    while True:
        pass

def extract_feature_df(dataset, default_feats = {}):
    feats_list = []
    act_types = []
    
    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        # print(dataset.loc[i].turns['utterance'])
        # print([frame['service'] for frame in dataset.loc[i].turns['frames']])
        for utterance, speaker, dialogue_act in zip(turns['utterance'], turns['speaker'], turns['dialogue_acts']):
            if speaker == 0: # if it's the user's turn
                act_type = dialogue_act['dialog_act']['act_type']
                
                token_text_list = []
                token_pos_list = []
                
                doc = nlp(utterance)
                for token in doc:
                    if not token.is_punct and not token.is_stop:
                        token_text_list.append(token.text)
                        token_pos_list.append(token.pos_)
                
                # USING FREQDIST
                feats = {}
                feats.update(nltk.FreqDist(token_text_list))
                # feats.update(nltk.FreqDist(nltk.bigrams(token_text_list)))
                # feats.update(nltk.FreqDist(nltk.trigrams(token_text_list)))
                feats.update(nltk.FreqDist(token_pos_list))
                # feats.update(nltk.FreqDist(nltk.bigrams(token_pos_list)))
                # feats.update(nltk.FreqDist(nltk.trigrams(token_pos_list)))
                act_types.append(act_type)
                
                if len(default_feats) != 0:
                    feats = {k : (feats[k] if k in feats else 0) for k in default_feats.keys()}
                
                feats_list.append(feats)

                # USING LISTS
                # df['unigrams'].append(token_text_list)
                # act_types.append(act_type)
                
    return pd.DataFrame.from_records(feats_list).fillna(0), act_types

In [4]:
dataset = load_dataset('multi_woz_v22')
print(dataset)

train = preprocess_split(dataset, 'train')
test = preprocess_split(dataset, 'test')
val = preprocess_split(dataset, 'validation')

da_labels = sorted(list(set([act_type for i in range(len(train))
                                        for da, speaker in zip(train.loc[i].turns['dialogue_acts'], train.loc[i].turns['speaker'])
                                            for act_type in da['dialog_act']['act_type']
                                                if speaker == 0])))

train_feats, train_act_type = extract_feature_df(train)
default_feats = {k : 0 for k in train_feats.columns}
print(len(default_feats))

test_feats, test_act_type = extract_feature_df(test, default_feats)

# train = pd.DataFrame(columns=dataset['train'].to_pandas().columns)
# test = pd.DataFrame()

# for _, row in dataset['train'].to_pandas().iterrows():
#     # print(row['services'])
#     if len(row['services']) != 0 and set(row['services']) <= set(['restaurant', 'hotel']):
#         train.loc[len(train)] = row
        
# print(train.loc[0].turns['frames'][0])
# print(train.loc[0])
train_feats

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/home/adrian/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 8437
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
})


100%|██████████| 6321/6321 [02:33<00:00, 41.23it/s]


4124


100%|██████████| 745/745 [00:23<00:00, 31.98it/s]


Unnamed: 0,need,place,dine,center,s,expensive,VERB,NOUN,ADJ,sort,...,roles,reversed-,ID.please,brassiere,lastly,nature,peple,certre,common,accomadation
0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45789,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45791,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45792,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test_feats

Unnamed: 0,need,place,dine,center,s,expensive,VERB,NOUN,ADJ,sort,...,roles,reversed-,ID.please,brassiere,lastly,nature,peple,certre,common,accomadation
0,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
5784,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5785,0,0,0,0,0,0,3,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5786,1,0,0,0,0,0,4,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
rfc = XGBRFClassifier(n_estimators = 100, max_depth = 7, learning_rate = 0.001)
rfc = MultiOutputClassifier(rfc)
mlb = MultiLabelBinarizer().fit(train_act_type)
rfc.fit(train_feats.to_numpy(), mlb.transform(train_act_type))

In [7]:
test_act_type_pred = rfc.predict(test_feats.to_numpy())
from sklearn.metrics import accuracy_score

accuracy_score(mlb.transform(test_act_type), test_act_type_pred)

0.6330338631651693

### Roberta TEST

In [8]:
class TransformerClassifier(nn.Module):
    def __init__(self, base_model_name, num_classes):
        super().__init__()
        self.transformer = RobertaModel.from_pretrained(base_model_name)
        self.pre_classifier = nn.Linear(768, 768)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_classes)
    
    def forward(self, input):
        cls = self.transformer(**input).last_hidden_state[:, 0, :]
        x = self.pre_classifier(cls)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.classifier(x)
        return x
    

In [9]:
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
classifier = TransformerClassifier(TRANSFORMER_MODEL_NAME, 2)


inputs = tokenizer("We love doing NLI!", return_tensors="pt")
output = classifier(inputs)
print(output.shape)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([1, 2])
