In [1]:
import snowflake.connector
import pandas as pd
from datasets import Dataset, ClassLabel
from dotenv import load_dotenv
import os
import wandb

  warn_incompatible_dep(


In [2]:
load_dotenv()

True

In [3]:
conn = snowflake.connector.connect(
    user=os.environ["SF_USER"],
    password=os.environ["SF_PWD"],
    account=os.environ["SF_ACCOUNT"],
    database=os.environ["SF_DB"],
    schema=os.environ["SF_SCHEMA"]
)

In [4]:
# query the data from Snowflake and create a Pandas dataframe
query = 'SELECT INTENT, UTT, PARTITION, SCENARIO FROM ALEXA_MASSIVE_INTENTS_RAW;' #query = 'SELECT * FROM intent_dataset;'
alexa_df = pd.read_sql(query, conn)

  alexa_df = pd.read_sql(query, conn)


In [5]:
alexa_df

Unnamed: 0,INTENT,UTT,PARTITION,SCENARIO
0,alarm_set,wake me up at five am this week,test,alarm
1,alarm_set,wake me up at nine am on friday,train,alarm
2,alarm_set,set an alarm for two hours from now,train,alarm
3,audio_volume_mute,quiet,test,audio
4,audio_volume_mute,olly quiet,train,audio
...,...,...,...,...
16516,email_query,do i have emails,train,email
16517,email_query,what emails are new,train,email
16518,email_query,do i have new emails from john,train,email
16519,email_query,has john sent me an email,test,email


In [6]:
alexa_df.drop(["SCENARIO"], axis=1, inplace=True)

In [7]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from datasets import Features, Value

In [8]:
# get the list of unique labels in the 'category' column
label_list = alexa_df['INTENT'].unique().tolist()

# instantiate a ClassLabel object with the number of classes and the names of the labels
num_classes = len(label_list)

In [9]:
# create Hugging Face dataset
features = Features({"INTENT": ClassLabel(num_classes=num_classes, names=label_list),
                     "UTT": Value("string"), 
                     "PARTITION": Value("string"),})
dataset = Dataset.from_pandas(alexa_df, features=features)


In [10]:
dataset = dataset.rename_column("INTENT", "label")

In [11]:
# split dataset into train and test sets based on the value of "SOURCE"
train_dataset = dataset.filter(lambda example: example["PARTITION"] == "train")
dev_dataset = dataset.filter(lambda example: example["PARTITION"] == "dev")
test_dataset = dataset.filter(lambda example: example["PARTITION"] == "test")

Filter:   0%|          | 0/16521 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16521 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16521 [00:00<?, ? examples/s]

In [12]:
train_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 11514
})

In [14]:
max_train_samples = 500
trunc_train_dataset = train_dataset.shuffle().select(range(max_train_samples))

In [15]:
trunc_train_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 500
})

In [16]:
dev_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 2033
})

In [17]:
max_dev_samples = 90
trunc_dev_dataset = dev_dataset.shuffle().select(range(max_dev_samples))

In [18]:
trunc_dev_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 90
})

In [19]:
test_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 2974
})

In [20]:
max_test_samples = 130
trunc_test_dataset = test_dataset.shuffle().select(range(max_test_samples))
trunc_test_dataset

Dataset({
    features: ['label', 'UTT', 'PARTITION'],
    num_rows: 130
})

In [21]:
model_name = "joaobarroca/distilbert-base-uncased-finetuned-massive-intent-detection-english"

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# load the pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['UTT'], padding="max_length", truncation=True)

trunc_train_dataset = trunc_train_dataset.map(tokenize_function, batched=True)
trunc_dev_dataset = trunc_dev_dataset.map(tokenize_function, batched=True)
trunc_test_dataset = trunc_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

# Train Model

In [24]:
# load the pre-trained small BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

In [25]:
from datasets import load_metric
metric = load_metric("accuracy")
import numpy as np
def compute_metrics(preds):
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [26]:
# set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy='epoch',     # evaluate model after every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=30,              # number of training epochs
    weight_decay=0.01,               # weight decay
    push_to_hub=False,               # whether to upload the model checkpoint to the Hub
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1,
    report_to=None #"wandb"
)

# create the Trainer object
trainer = Trainer(
    model=model,                         # the instantiated model to be trained
    args=training_args,                  # training arguments
    train_dataset=trunc_train_dataset,         # the training dataset
    eval_dataset=trunc_dev_dataset,            # the evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
# start the training
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mshri-adke[0m ([33mloyalhealth[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/70 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 8.429, 'learning_rate': 1.9714285714285718e-05, 'epoch': 0.14}
{'loss': 7.7677, 'learning_rate': 1.942857142857143e-05, 'epoch': 0.29}
{'loss': 7.717, 'learning_rate': 1.9142857142857146e-05, 'epoch': 0.43}
{'loss': 7.1878, 'learning_rate': 1.885714285714286e-05, 'epoch': 0.57}
{'loss': 6.9739, 'learning_rate': 1.8571428571428575e-05, 'epoch': 0.71}
{'loss': 7.2568, 'learning_rate': 1.8285714285714288e-05, 'epoch': 0.86}
{'loss': 8.2166, 'learning_rate': 1.8e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.313501358032227, 'eval_accuracy': 0.04, 'eval_runtime': 6.075, 'eval_samples_per_second': 4.115, 'eval_steps_per_second': 0.165, 'epoch': 1.0}
{'loss': 6.4522, 'learning_rate': 1.7714285714285717e-05, 'epoch': 1.14}
{'loss': 4.8362, 'learning_rate': 1.742857142857143e-05, 'epoch': 1.29}
{'loss': 6.5282, 'learning_rate': 1.7142857142857142e-05, 'epoch': 1.43}
{'loss': 5.8403, 'learning_rate': 1.6857142857142858e-05, 'epoch': 1.57}
{'loss': 4.7584, 'learning_rate': 1.6571428571428574e-05, 'epoch': 1.71}
{'loss': 5.2669, 'learning_rate': 1.6285714285714287e-05, 'epoch': 1.86}
{'loss': 4.6675, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.936366081237793, 'eval_accuracy': 0.12, 'eval_runtime': 6.5941, 'eval_samples_per_second': 3.791, 'eval_steps_per_second': 0.152, 'epoch': 2.0}
{'loss': 5.7732, 'learning_rate': 1.5714285714285715e-05, 'epoch': 2.14}
{'loss': 3.4651, 'learning_rate': 1.542857142857143e-05, 'epoch': 2.29}
{'loss': 3.339, 'learning_rate': 1.5142857142857144e-05, 'epoch': 2.43}
{'loss': 3.8324, 'learning_rate': 1.4857142857142858e-05, 'epoch': 2.57}
{'loss': 4.5683, 'learning_rate': 1.4571428571428573e-05, 'epoch': 2.71}
{'loss': 3.9617, 'learning_rate': 1.4285714285714287e-05, 'epoch': 2.86}
{'loss': 3.1567, 'learning_rate': 1.4e-05, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.127197742462158, 'eval_accuracy': 0.12, 'eval_runtime': 6.4295, 'eval_samples_per_second': 3.888, 'eval_steps_per_second': 0.156, 'epoch': 3.0}
{'loss': 4.2363, 'learning_rate': 1.3714285714285716e-05, 'epoch': 3.14}
{'loss': 3.1839, 'learning_rate': 1.3428571428571429e-05, 'epoch': 3.29}
{'loss': 3.4415, 'learning_rate': 1.3142857142857145e-05, 'epoch': 3.43}
{'loss': 3.8086, 'learning_rate': 1.2857142857142859e-05, 'epoch': 3.57}
{'loss': 3.3967, 'learning_rate': 1.2571428571428572e-05, 'epoch': 3.71}
{'loss': 3.3031, 'learning_rate': 1.2285714285714288e-05, 'epoch': 3.86}
{'loss': 3.6235, 'learning_rate': 1.2e-05, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.8603436946868896, 'eval_accuracy': 0.16, 'eval_runtime': 6.4147, 'eval_samples_per_second': 3.897, 'eval_steps_per_second': 0.156, 'epoch': 4.0}
{'loss': 3.1943, 'learning_rate': 1.1714285714285716e-05, 'epoch': 4.14}
{'loss': 3.5998, 'learning_rate': 1.1428571428571429e-05, 'epoch': 4.29}
{'loss': 2.5043, 'learning_rate': 1.1142857142857143e-05, 'epoch': 4.43}
{'loss': 3.0056, 'learning_rate': 1.0857142857142858e-05, 'epoch': 4.57}
{'loss': 3.181, 'learning_rate': 1.0571428571428572e-05, 'epoch': 4.71}
{'loss': 3.8194, 'learning_rate': 1.0285714285714285e-05, 'epoch': 4.86}
{'loss': 3.0478, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.7538092136383057, 'eval_accuracy': 0.2, 'eval_runtime': 6.9509, 'eval_samples_per_second': 3.597, 'eval_steps_per_second': 0.144, 'epoch': 5.0}
{'loss': 2.8595, 'learning_rate': 9.714285714285715e-06, 'epoch': 5.14}
{'loss': 2.4274, 'learning_rate': 9.42857142857143e-06, 'epoch': 5.29}
{'loss': 2.6136, 'learning_rate': 9.142857142857144e-06, 'epoch': 5.43}
{'loss': 2.5741, 'learning_rate': 8.857142857142858e-06, 'epoch': 5.57}
{'loss': 3.3093, 'learning_rate': 8.571428571428571e-06, 'epoch': 5.71}
{'loss': 3.2258, 'learning_rate': 8.285714285714287e-06, 'epoch': 5.86}
{'loss': 2.9881, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.7341525554656982, 'eval_accuracy': 0.24, 'eval_runtime': 6.5107, 'eval_samples_per_second': 3.84, 'eval_steps_per_second': 0.154, 'epoch': 6.0}
{'loss': 2.3822, 'learning_rate': 7.714285714285716e-06, 'epoch': 6.14}
{'loss': 2.6969, 'learning_rate': 7.428571428571429e-06, 'epoch': 6.29}
{'loss': 2.4871, 'learning_rate': 7.1428571428571436e-06, 'epoch': 6.43}
{'loss': 3.3003, 'learning_rate': 6.857142857142858e-06, 'epoch': 6.57}
{'loss': 2.6097, 'learning_rate': 6.571428571428572e-06, 'epoch': 6.71}
{'loss': 1.7309, 'learning_rate': 6.285714285714286e-06, 'epoch': 6.86}
{'loss': 3.0838, 'learning_rate': 6e-06, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.7030651569366455, 'eval_accuracy': 0.24, 'eval_runtime': 6.6438, 'eval_samples_per_second': 3.763, 'eval_steps_per_second': 0.151, 'epoch': 7.0}
{'loss': 2.3973, 'learning_rate': 5.7142857142857145e-06, 'epoch': 7.14}
{'loss': 1.7912, 'learning_rate': 5.428571428571429e-06, 'epoch': 7.29}
{'loss': 2.1757, 'learning_rate': 5.142857142857142e-06, 'epoch': 7.43}
{'loss': 2.9636, 'learning_rate': 4.857142857142858e-06, 'epoch': 7.57}
{'loss': 2.7354, 'learning_rate': 4.571428571428572e-06, 'epoch': 7.71}
{'loss': 2.1267, 'learning_rate': 4.2857142857142855e-06, 'epoch': 7.86}
{'loss': 2.3166, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.6693906784057617, 'eval_accuracy': 0.24, 'eval_runtime': 7.2241, 'eval_samples_per_second': 3.461, 'eval_steps_per_second': 0.138, 'epoch': 8.0}
{'loss': 1.8839, 'learning_rate': 3.7142857142857146e-06, 'epoch': 8.14}
{'loss': 2.1786, 'learning_rate': 3.428571428571429e-06, 'epoch': 8.29}
{'loss': 2.3221, 'learning_rate': 3.142857142857143e-06, 'epoch': 8.43}
{'loss': 2.4063, 'learning_rate': 2.8571428571428573e-06, 'epoch': 8.57}
{'loss': 2.5236, 'learning_rate': 2.571428571428571e-06, 'epoch': 8.71}
{'loss': 2.3093, 'learning_rate': 2.285714285714286e-06, 'epoch': 8.86}
{'loss': 2.0998, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.6437575817108154, 'eval_accuracy': 0.24, 'eval_runtime': 6.7409, 'eval_samples_per_second': 3.709, 'eval_steps_per_second': 0.148, 'epoch': 9.0}
{'loss': 1.3398, 'learning_rate': 1.7142857142857145e-06, 'epoch': 9.14}
{'loss': 2.5963, 'learning_rate': 1.4285714285714286e-06, 'epoch': 9.29}
{'loss': 2.5325, 'learning_rate': 1.142857142857143e-06, 'epoch': 9.43}
{'loss': 2.1737, 'learning_rate': 8.571428571428572e-07, 'epoch': 9.57}
{'loss': 2.4386, 'learning_rate': 5.714285714285715e-07, 'epoch': 9.71}
{'loss': 2.1589, 'learning_rate': 2.8571428571428575e-07, 'epoch': 9.86}
{'loss': 2.0316, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.6343860626220703, 'eval_accuracy': 0.24, 'eval_runtime': 6.7288, 'eval_samples_per_second': 3.715, 'eval_steps_per_second': 0.149, 'epoch': 10.0}
{'train_runtime': 936.487, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.075, 'train_loss': 3.6161604336329867, 'epoch': 10.0}


TrainOutput(global_step=70, training_loss=3.6161604336329867, metrics={'train_runtime': 936.487, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.075, 'train_loss': 3.6161604336329867, 'epoch': 10.0})

In [32]:
# To visualize metrics on test dataset
trainer2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=trunc_train_dataset,
    eval_dataset=trunc_test_dataset,
    compute_metrics=compute_metrics,
)
trainer2.evaluate()

In [None]:
# Store the model
import tarfile
save_path = "./results/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

with tarfile.open("./results/model.tar.gz", mode='w:gz') as archive:
    archive.add(save_path, arcname='.')