# Training script

### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from torch import cuda
import random
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report, multilabel_confusion_matrix, ConfusionMatrixDisplay
import numpy as np
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
from datasets import load_dataset
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from transformers import AutoTokenizer, RobertaModel
from datasets import load_dataset
from torch import nn
import spacy
import nltk
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from os.path import exists

sys.path.append(os.path.abspath('../..'))
from util import generate_metrics_latex_table

nlp = spacy.load("en_core_web_sm")

if exists('../../word2vec.model'):
    word2vec = Word2Vec.load('../../word2vec.model')
else:
    word2vec = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
    word2vec.save('../../word2vec.model')

In [2]:
def set_seed(seed_value):
    """Set seed for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # for multi-GPU
    torch.cuda.manual_seed_all(seed_value) 
    # or starting nondeterministic operations in PyTorch
    if seed_value is not None:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed = 13
set_seed(seed)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(device)

mps


### Loading the dataset

In [4]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service == 'restaurant':
            services.add('restaurant')
        elif service == 'hotel':
            services.add('hotel')
        else:
            services.add('other')
        if len(services) == 3:
            break
    return sorted(services)

def preprocess_split(dataset, split):
    df = dataset[split].to_pandas()
    new_df = pd.DataFrame(columns = df.columns)
    for i in range(len(df)):
        # Taken from notebook, to know which lines to skip
        row = df.loc[i]
        if not any(set(row.turns['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(row.turns['utterance'])):
            continue
        new_df.loc[len(new_df)] = row
        # new_df.loc[len(new_df) - 1]['services'] = process_service_list(new_df.loc[len(new_df) - 1]['services'])
        # for i, frame_service in [frame['service'] for frame in df.loc[i].turns['frames']]:
            # df.loc[i].turns['frames']
    return new_df

def extract_utterance_features(utterance):
    doc = nlp(utterance)
    
    for token in doc:
        print(token)
    
    while True:
        pass

def extract_feature_df(dataset, default_freqs = {}):
    feats_list = []
    act_types = []
    utterance_list = []
    
    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        # print(dataset.loc[i].turns['utterance'])
        # print([frame['service'] for frame in dataset.loc[i].turns['frames']])
        for utterance, speaker, dialogue_act in zip(turns['utterance'], turns['speaker'], turns['dialogue_acts']):
            if speaker == 0: # if it's the user's turn
                act_type = dialogue_act['dialog_act']['act_type']
                token_pos_list = []
                
                doc = nlp(utterance)
                for token in doc:
                    if not token.is_punct and not token.is_stop:
                        token_pos_list.append(token.pos_)
                
                # USING FREQDIST
                feats = {}
                # feats.update(nltk.FreqDist(nltk.bigrams(token_text_list)))
                # feats.update(nltk.FreqDist(nltk.trigrams(token_text_list)))
                feats.update(nltk.FreqDist(nltk.bigrams(token_pos_list)))
                # feats.update(nltk.FreqDist(nltk.bigrams(token_pos_list)))
                # feats.update(nltk.FreqDist(nltk.trigrams(token_pos_list)))
                act_types.append(act_type)
                
                if len(default_freqs) != 0:
                    feats = {k : (feats[k] if k in feats else 0) for k in default_freqs.keys()}
                feats_list.append(feats)
                utterance_list.append(utterance)
    
    tf_idf = TfidfVectorizer().fit(utterance_list)
                    
    return tf_idf, utterance_list, pd.DataFrame.from_records(feats_list).fillna(0), act_types

In [5]:
dataset = load_dataset('multi_woz_v22')

train = preprocess_split(dataset, 'train')
test = preprocess_split(dataset, 'test')
val = preprocess_split(dataset, 'validation')


tf_idf, train_utterance_list, train_freqs, train_act_type = extract_feature_df(train)
default_freqs = {k : 0 for k in train_freqs.columns}

print(len(set(act for act_type in train_act_type for act in act_type))) 

_, test_utterance_list, test_freqs, test_act_type = extract_feature_df(test, default_freqs)

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/Users/pepe/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 6321/6321 [05:42<00:00, 18.48it/s]  


17


100%|██████████| 745/745 [00:23<00:00, 31.27it/s]


In [6]:
rfc = XGBClassifier(n_estimators = 100, max_depth = 7, learning_rate = 0.001)
rfc = MultiOutputClassifier(rfc)
mlb = MultiLabelBinarizer().fit(train_act_type)

train_tf_idf_feats = tf_idf.transform(train_utterance_list)
train_feats = np.concatenate([train_tf_idf_feats.toarray(), train_freqs.to_numpy()], axis = 1)
# train_feats = train_tf_idf_feats.toarray()
# train_feats = train_freqs.to_numpy()
print(train_feats.shape)

rfc.fit(train_feats, mlb.transform(train_act_type))

(45794, 3420)


In [7]:
test_tf_idf_feats = tf_idf.transform(test_utterance_list)
test_feats = np.concatenate([test_tf_idf_feats.toarray(), test_freqs.to_numpy()], axis = 1)
# test_feats = test_tf_idf_feats.toarray()
# test_feats = test_freqs.to_numpy()

test_act_type_pred = rfc.predict(test_feats)
test_act_type_pred_probs = rfc.predict_proba(test_feats)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

acc = accuracy_score(mlb.transform(test_act_type), test_act_type_pred)
precision, recall, f_score, _ = precision_recall_fscore_support(mlb.transform(test_act_type), test_act_type_pred, average = 'macro')
print(f'acc = {acc}, precision = {precision}, recall = {recall}, f_score = {f_score}')

acc = 0.01624049758120249, precision = 0.0, recall = 0.0, f_score = 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Save the model
if not os.path.isdir('model'):
    os.mkdir('model')
torch.save(rfc.state_dict(), f'model/01_intent_xgbclassifier.pt')

AttributeError: 'MultiOutputClassifier' object has no attribute 'state_dict'

In [None]:
generate_metrics_latex_table(model_name='XGBClassifier', task_number=1, true_labels=mlb.transform(test_act_type), binary_predictions=test_act_type_pred, prediction_probs=test_act_type_pred_probs, target_names=mlb.classes_)