# Setup

In [None]:
#Data source https://github.com/NathanDuran/Switchboard-Corpus

!git clone https://github.com/NathanDuran/Switchboard-Corpus
!cp Switchboard-Corpus/swda_data .

# Modelling

In [None]:
import pandas as pd

# Load the tab-separated label table; it must provide a "Labels" column and a
# "Dialogue Act" column.
# NOTE(review): the setup cell only copies swda_data/ into the working
# directory -- confirm where labels.txt is expected to come from.
label_df = pd.read_csv("labels.txt", sep="\t")

# "Labels" value -> "Dialogue Act" value.
labels = {
    code: act for code, act in zip(label_df["Labels"], label_df["Dialogue Act"])
}

# One class per row of the label table.
n_classes = len(label_df["Labels"])

# "Labels" value -> class id (row position in the table), and the reverse map.
label_index = {code: class_id for class_id, code in enumerate(label_df["Labels"])}
label_index_inv = {class_id: code for code, class_id in label_index.items()}

In [None]:
# Sanity check: show the number of rows in the label table, which is the value
# passed to the model as num_labels.
print(n_classes)

In [None]:
# Locations of the three pipe-separated split files inside swda_data/.
TRAIN_DATA, TEST_DATA, VAL_DATA = (
    "swda_data/train_set.txt",
    "swda_data/test_set.txt",
    "swda_data/val_set.txt",
)

In [None]:
import pandas as pd

# The split files have no header row, so the column names are supplied here.
COLUMNS = ["speaker", "text", "label"]

# Read the train / test / validation files with the same parsing options.
df_train, df_test, df_val = (
    pd.read_csv(path, sep="|", names=COLUMNS)
    for path in (TRAIN_DATA, TEST_DATA, VAL_DATA)
)

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

In [None]:
# Tokenizer and classifier are both initialised from the same pretrained
# checkpoint; the classification head is sized to the number of labels.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=n_classes,
)

In [None]:
def tokenize_dataset(df):
    """Tokenize the ``text`` column of ``df`` with the notebook-level tokenizer.

    Args:
        df: DataFrame that has a ``text`` column.

    Returns:
        The tokenizer's output for the full list of texts, produced with
        truncation and padding enabled.
    """
    texts = df["text"].tolist()
    encodings = tokenizer(texts, truncation=True, padding=True)
    return encodings

In [None]:
# Tokenize each split (train, then validation, then test).
train_encodings, val_encodings, test_encodings = (
    tokenize_dataset(split_df) for split_df in (df_train, df_val, df_test)
)

In [None]:
def get_labels(df):
    """Convert the ``label`` column of ``df`` into a list of class ids.

    Each value is looked up in the notebook-level ``label_index`` mapping.

    Args:
        df: DataFrame that has a ``label`` column.

    Returns:
        A list with one ``label_index`` value per row of ``df``.

    Raises:
        KeyError: if a value in the column is not a key of ``label_index``.
    """
    label_column = df["label"]
    class_ids = label_column.map(label_index.__getitem__)
    return class_ids.tolist()

In [None]:
# Class ids for each split (train, then validation, then test).
train_labels, val_labels, test_labels = (
    get_labels(split_df) for split_df in (df_train, df_val, df_test)
)

In [None]:
import torch

class DADataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    Indexing returns a dict holding one tensor per encoding field plus a
    ``labels`` tensor; the length is the number of labels.
    """

    def __init__(self, encodings, labels):
        """Keep references to the inputs without copying them.

        Args:
            encodings: mapping from field name to a sequence indexable by
                example position.
            labels: sequence of labels indexable by example position.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for the example at position ``idx``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # The label is added last, under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a DADataset.
train_dataset, val_dataset, test_dataset = (
    DADataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
import random
# Seed Python's global RNG so the sampled subset is the same on every run.
random.seed(10)

# Draw 100 distinct positions from the training set and expose them as a
# small Subset.
# NOTE(review): the Trainer cell uses train_dataset, not train_dataset_small --
# confirm whether this subset is still needed.
random_samples = random.sample(range(len(train_dataset)), 100)
train_dataset_small = torch.utils.data.Subset(
    train_dataset,
    random_samples,
)

In [None]:
from transformers import TrainingArguments

# Training configuration: outputs go to ./results, batches of 8 per device for
# both training and evaluation, and save_total_limit set to 2. Everything else
# is left at the library defaults.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=2,
)


In [None]:
from transformers import Trainer

# Evaluate on the validation split during development so the test split stays
# held out for the final, unbiased measurement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training with the model, arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Save the trained model and the tokenizer under the same MODEL_NAME path so
# the two artifacts are kept together.
MODEL_NAME = "intent-model_1"
trainer.save_model(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)