# Training script

### Imports

In [25]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from transformers import AutoTokenizer
from transformers import RobertaForTokenClassification
from datasets import load_dataset
from torch import nn
import spacy
import nltk
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.feature_selection import f_classif, SelectKBest
import string
import fasttext
import fasttext.util
from sklearn.svm import SVC
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import normalize
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import seaborn as sns
import os
import sys
import random
import pickle

sys.path.append(os.path.abspath('../..'))
from util import generate_metrics_latex_table


In [26]:
def set_seed(seed_value):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    The same value is handed, in order, to ``random``, ``numpy.random``,
    ``torch`` and all CUDA devices. When a seed is given, cuDNN is also
    switched to deterministic mode with benchmark autotuning disabled.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for seeder in seeders:
        seeder(seed_value)

    if seed_value is None:
        return
    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fixed seed applied once at start-up so repeated runs draw the same random numbers.
seed = 13
set_seed(seed_value=seed)

In [27]:
# Load the English fastText vectors. A copy in the parent directory is
# preferred; otherwise fall back to fasttext's downloader.
fasttext_model_path = '../cc.en.300.bin'
if not os.path.exists(fasttext_model_path):
    # download_model() stores the file in the *current working directory* and
    # returns that file name, so load from the returned path instead of '../'.
    # With if_exists='ignore' an already-downloaded local copy is reused.
    fasttext_model_path = fasttext.util.download_model('en', if_exists='ignore')
embedder = fasttext.load_model(fasttext_model_path)
nlp = spacy.load("en_core_web_lg")




In [28]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _device_name = 'cuda'
elif torch.backends.mps.is_available():
    _device_name = 'mps'
else:
    _device_name = 'cpu'
device = torch.device(_device_name)
print(f'{device=}')

device=device(type='mps')


### Models

In [29]:
# Number of top-scoring features kept after ranking them by ANOVA F-score.
nr_features = 300
# Base estimator; it is wrapped in a MultiOutputClassifier before fitting.
# Replace this instantiation with another classifier class (XGBClassifier,
# for example) to test other models.
model = SVC()
# Whether each user utterance is prefixed with the previous user/bot
# utterances and their dialogue acts.
use_history = False


### Loading the dataset

In [30]:
def process_intent_list(intent_list):
    """Collapse raw dialogue-act names into the label set used for training.

    Names starting with 'Restaurant', 'Hotel' or 'general' are kept unchanged;
    every other name is replaced by 'other'. An empty input also yields
    'other'. The distinct labels are returned as a sorted list.
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    labels = {
        name if name.startswith(kept_prefixes) else 'other'
        for name in intent_list
    }
    # len() rather than truthiness so array-like inputs are handled as well.
    if len(intent_list) == 0:
        labels.add('other')
    return sorted(labels)

def _touches_hotel_or_restaurant(turns):
    """Return True if any turn's frame lists 'hotel' or 'restaurant' as a service."""
    wanted = {'hotel', 'restaurant'}
    frames = turns['frames']
    return any(
        wanted.intersection(frames[turn_id]['service'])
        for turn_id in range(len(turns['utterance']))
    )


def preprocess_split(dataset, split):
    """Return the dialogues of ``split`` that involve the hotel or restaurant service.

    Args:
        dataset: mapping from split name to an object exposing ``to_pandas()``.
        split: name of the split to convert and filter.

    Returns:
        A DataFrame with the same columns as the converted split, containing
        only the rows whose ``turns`` mention 'hotel' or 'restaurant' in at
        least one frame, re-indexed from 0 in their original order.
    """
    df = dataset[split].to_pandas()
    # Build one boolean mask and filter once, instead of growing a DataFrame
    # row by row (quadratic, and it degrades every column to object dtype).
    # The explicit Series keeps an empty split from being read as a column list.
    keep = pd.Series(
        [_touches_hotel_or_restaurant(turns) for turns in df['turns']],
        index=df.index,
        dtype=bool,
    )
    return df.loc[keep].reset_index(drop=True)

def extract_feature_df(dataset):
    """Collect text, sentence embeddings and intent labels for every user turn.

    Iterates over the ``turns`` of each dialogue in ``dataset`` and keeps only
    turns whose speaker id is 0 (the user). When the module-level
    ``use_history`` flag is set, the utterance is prefixed with the previous
    user utterance, its dialogue acts, the previous bot utterance and its
    dialogue acts, joined by ' | '.

    Args:
        dataset: DataFrame with a ``turns`` column, indexed 0..n-1, whose
            entries provide 'utterance', 'speaker' and 'dialogue_acts'.

    Returns:
        A tuple ``(tf_idf, utterance_list, embedding_list, act_types)``:
        a TfidfVectorizer fitted on the collected utterances, the (possibly
        history-augmented) utterances, their fastText sentence vectors from
        the module-level ``embedder``, and the processed intent labels.
    """
    act_types = []
    utterance_list = []
    embedding_list = []

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        utterances = turns['utterance']
        dialogue_acts = turns['dialogue_acts']

        for j, (utterance, speaker, dialogue_act) in enumerate(zip(utterances, turns['speaker'], dialogue_acts)):
            if speaker != 0:  # only the user's turns are classified
                continue

            if use_history:
                # Look back only at turns that exist: a negative index would
                # silently wrap around to the end of the dialogue.
                prev_user_utterance, prev_user_acts = '', []
                prev_bot_utterance, prev_bot_acts = '', []
                if j >= 1:
                    prev_bot_utterance = utterances[j - 1]
                    prev_bot_acts = dialogue_acts[j - 1]['dialog_act']['act_type']
                if j >= 2:
                    prev_user_utterance = utterances[j - 2]
                    prev_user_acts = dialogue_acts[j - 2]['dialog_act']['act_type']
                composed_utterance = ' | '.join([
                    prev_user_utterance,
                    ', '.join(prev_user_acts),
                    prev_bot_utterance,
                    ', '.join(prev_bot_acts),
                    utterance,
                ])
            else:
                composed_utterance = utterance

            embedding_list.append(embedder.get_sentence_vector(composed_utterance))
            act_types.append(process_intent_list(dialogue_act['dialog_act']['act_type']))
            utterance_list.append(composed_utterance)

    tf_idf = TfidfVectorizer().fit(utterance_list)

    return tf_idf, utterance_list, embedding_list, act_types

In [31]:
dataset = load_dataset('multi_woz_v22')

# Re-run guard for the notebook: the expensive preprocessing is skipped when
# `train` already exists in the kernel. Only NameError is caught so that real
# failures inside the preprocessing are not swallowed.
try:
    train
except NameError:
    train = preprocess_split(dataset, 'train')
    test = preprocess_split(dataset, 'test')
    val = preprocess_split(dataset, 'validation')
    # Only the vectorizer fitted on the training split is kept.
    tf_idf, train_utterance_list, train_embedding_list, train_act_type = extract_feature_df(train)
    _, test_utterance_list, test_embedding_list, test_act_type = extract_feature_df(test)
    # Validation features must come from the validation split (was `test`).
    _, val_utterance_list, val_embedding_list, val_act_type = extract_feature_df(val)
    # Drop the reference to the fastText model once all embeddings are computed.
    del embedder
else:
    print("Dataset already loaded, moving on")

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/Users/pepe/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset already loaded, moving on


In [32]:
# Fit one binary classifier per intent label. The isinstance guard keeps this
# cell re-runnable: wrapping an already wrapped model would nest
# MultiOutputClassifier instances.
if not isinstance(model, MultiOutputClassifier):
    model = MultiOutputClassifier(model)
mlb = MultiLabelBinarizer().fit(train_act_type)

# Features = dense TF-IDF vector concatenated with the sentence embedding.
train_tf_idf_feats = tf_idf.transform(train_utterance_list)
train_feats = np.concatenate([train_tf_idf_feats.toarray(), np.stack(train_embedding_list)], axis = 1)
print(train_feats.shape)
# train_feats = np.stack(train_embedding_list) UNCOMMENT TO ONLY USE EMBEDDINGS

train_labels = mlb.transform(train_act_type)
print(train_labels)
print(train_labels[0])

# Rank features by their ANOVA F-score averaged over all labels.
feature_scores = []
for i in range(train_labels.shape[1]):
    label_scores, _ = f_classif(train_feats, train_labels[:, i])
    # f_classif yields NaN for constant features; argpartition would place
    # NaN last and thus pick such features as "best", so score them as 0.
    feature_scores.append(np.nan_to_num(label_scores, nan=0.0))
feature_scores = np.mean(feature_scores, axis=0)

# Never ask for more features than exist.
n_selected = min(nr_features, train_feats.shape[1])
selected_features = np.argpartition(feature_scores, -n_selected)[-n_selected:]
train_feats_selected = train_feats[:, selected_features]

print(f'{train_feats_selected.shape[1]} from {train_feats.shape[1]} features selected')

# Validation features go through the same vectorizer and column selection.
val_tf_idf_feats = tf_idf.transform(val_utterance_list)
val_feats = np.concatenate([val_tf_idf_feats.toarray(), np.stack(val_embedding_list)], axis = 1)
val_feats_selected = val_feats[:, selected_features]
val_labels = mlb.transform(val_act_type)

model.fit(train_feats_selected, train_labels)

(45794, 3575)
[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[0 0 1 0 0 0 0 0]
300 from 3575 features selected


In [33]:
test_tf_idf_feats = tf_idf.transform(test_utterance_list)
test_feats = np.concatenate([test_tf_idf_feats.toarray(), np.stack(test_embedding_list)], axis = 1)
# test_feats = np.stack(test_embedding_list) UNCOMMENT TO ONLY USE EMBEDDINGS
test_feats_selected = test_feats[:, selected_features]
test_labels = mlb.transform(test_act_type)

# model.predict returns hard 0/1 decisions per label, not logits, so they are
# used directly as the binary predictions.
test_act_type_pred = torch.tensor(model.predict(test_feats_selected)).bool()


def _positive_class_scores(estimator, feats):
    """Return a score in [0, 1] per sample for the positive class of one label."""
    if hasattr(estimator, 'predict_proba'):
        # assumes both classes were seen in training so column 1 is the positive class
        return estimator.predict_proba(feats)[:, 1]
    if hasattr(estimator, 'decision_function'):
        # Squash the signed margin; monotone in the margin but not calibrated.
        return torch.sigmoid(torch.tensor(estimator.decision_function(feats))).numpy()
    # No continuous score available: fall back to the hard decisions.
    return estimator.predict(feats).astype(float)


# Per-label scores from the fitted per-label estimators, one column per label.
test_act_type_pred_probs = torch.tensor(
    np.stack([_positive_class_scores(est, test_feats_selected) for est in model.estimators_], axis=1)
)

acc = accuracy_score(test_labels, test_act_type_pred)
precision, recall, f_score, _ = precision_recall_fscore_support(test_labels, test_act_type_pred, average = 'micro')
print(f'acc = {acc}, precision = {precision}, recall = {recall}, f_score = {f_score}')

acc = 0.7551831375259157, precision = 0.9312560856864655, recall = 0.7724115651752544, f_score = 0.8444287480134205


In [34]:
# Per-label precision/recall/F1 on the test split. zero_division=0 states
# explicitly that a label that is never predicted scores 0, which is the value
# the default already reports but without the undefined-metric warnings.
report = classification_report(
    mlb.transform(test_act_type),
    test_act_type_pred,
    target_names=mlb.classes_,
    digits=3,
    zero_division=0,
)
print(report)

                    precision    recall  f1-score   support

      Hotel-Inform      0.933     0.803     0.863      1328
     Hotel-Request      0.795     0.226     0.352       292
 Restaurant-Inform      0.914     0.834     0.872      1322
Restaurant-Request      0.644     0.304     0.413       286
       general-bye      0.962     0.911     0.936       225
     general-greet      0.000     0.000     0.000         6
     general-thank      0.957     0.935     0.946       693
             other      0.958     0.789     0.865      2039

         micro avg      0.931     0.772     0.844      6191
         macro avg      0.771     0.600     0.656      6191
      weighted avg      0.920     0.772     0.832      6191
       samples avg      0.792     0.789     0.787      6191



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# Save the model
# makedirs(exist_ok=True) replaces the isdir-then-mkdir check-and-create pair.
os.makedirs('model', exist_ok=True)
# The fitted classifier is written twice: once through torch.save under a
# descriptive name and once as a plain pickle next to the label binarizer.
torch.save(model, f'model/01_intent_SVM_{"with" if use_history else "no"}_history_top_{nr_features}_features.pt')
# NOTE(review): the fitted tf_idf vectorizer and selected_features indices are
# not persisted here, yet the model expects inputs built from them - confirm
# they are saved or rebuilt wherever this model is loaded.
with open('model/mlb.pkl', 'wb') as mlb_f, open('model/model.pkl', 'wb') as model_f:
    pickle.dump(mlb, mlb_f)
    pickle.dump(model, model_f)

In [36]:
# Emit the LaTeX metrics table for this run; the model name records whether
# dialogue history was used (typo "withou" fixed to "without").
generate_metrics_latex_table(
    model_name=f'SVM (rbf kernel, {"with" if use_history else "without"} history)',
    task_number=1,
    true_labels=mlb.transform(test_act_type),
    binary_predictions=test_act_type_pred,
    prediction_probs=test_act_type_pred_probs,
    target_names=mlb.classes_,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
