In [1]:
# Mount Google Drive; the training output directory used later in this
# notebook lives under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install transformers[torch]

from datasets import load_dataset

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-

In [3]:
# Fetch the `train` split of the "all" configuration of PolyAI/minds14.
data = load_dataset(path="PolyAI/minds14", name="all", split="train")

data

Downloading data:   0%|          | 0.00/417M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/150M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8168 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 8168
})

In [4]:
# Hold out 20% of the rows for evaluation. A fixed seed keeps the split (and
# therefore every metric computed on it) reproducible across kernel restarts.
df = data.train_test_split(test_size=0.2, seed=42)

df

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 6534
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 1634
    })
})

In [5]:
# Only `audio` and `intent_class` are used downstream. Filtering against the
# columns that are still present makes the cell safe to re-run: the original
# call raised once the columns had already been dropped.
unused_columns = ["path", "transcription", "english_transcription", "lang_id"]
columns_to_drop = [name for name in unused_columns if name in df["train"].column_names]
if columns_to_drop:
    df = df.remove_columns(columns_to_drop)

df

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 6534
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 1634
    })
})

In [6]:
# Class names declared on the `intent_class` feature of the training split.
intent_feature = df["train"].features["intent_class"]
labels = intent_feature.names

labels


['abroad',
 'address',
 'app_error',
 'atm_limit',
 'balance',
 'business_loan',
 'card_issues',
 'cash_deposit',
 'direct_debit',
 'freeze',
 'high_value_payment',
 'joint_account',
 'latest_transactions',
 'pay_bill']

In [7]:
# Two-way lookup between class names and their position in `labels`.
# Ids are stored as strings, so lookups must use e.g. id2label["2"].
label2id = {name: str(position) for position, name in enumerate(labels)}
id2label = {str(position): name for position, name in enumerate(labels)}

In [8]:
# Spot-check the mapping; keys are string ids.
id2label["2"]

'app_error'

In [9]:
import random
# Aliased so the notebook player class is not confused with `datasets.Audio`,
# which a later cell imports under the very same name.
from IPython.display import Audio as AudioPlayer, display

# Listen to five randomly chosen training clips and print their labels.
train_split = df["train"]
for _ in range(5):
    example = train_split[random.randrange(len(train_split))]
    audio = example["audio"]

    print(f'Label: {id2label[str(example["intent_class"])]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    display(AudioPlayer(audio["array"], rate=audio["sampling_rate"]))
    print()

Label: app_error
Shape: (362560,), sampling rate: 8000



Label: abroad
Shape: (108160,), sampling rate: 8000



Label: pay_bill
Shape: (26749,), sampling rate: 8000



Label: latest_transactions
Shape: (22528,), sampling rate: 8000



Label: business_loan
Shape: (167936,), sampling rate: 8000





# Preprocess

In [10]:
from transformers import AutoFeatureExtractor

# Load the preprocessing configuration published with the checkpoint.
checkpoint = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
feature_extractor

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [11]:
from datasets import Audio

# Re-cast the audio column to the sampling rate the feature extractor reports,
# instead of repeating the number as a literal, so the two cannot drift apart.
df = df.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
df["train"][0]

{'audio': {'path': 'response_42.wav',
  'array': array([-2.60974048e-05, -3.83240404e-05,  2.63581169e-05, ...,
          1.20302651e-03,  1.16391666e-03,  6.78404584e-04]),
  'sampling_rate': 16000},
 'intent_class': 7}

In [12]:
max_duration = 1.0  # seconds

In [13]:
import torch

def preprocess_function(examples):
    """Turn a batch of decoded audio rows into model inputs.

    Args:
        examples: Batch in column format. Reads ``examples["audio"]`` (a list
            of dicts with an ``"array"`` entry) and ``examples["intent_class"]``.

    Returns:
        The feature extractor's output with an added ``"intent_class"`` entry
        holding the labels as a ``torch.long`` tensor.
    """
    # The clip length is derived from `max_duration` rather than hard-coded,
    # so changing that single setting is enough to feed longer clips.
    max_samples = int(feature_extractor.sampling_rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    inputs = feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_samples,
        truncation=True,
    )

    inputs["intent_class"] = torch.tensor(examples["intent_class"], dtype=torch.long)
    return inputs

In [14]:
# Sanity-check the preprocessing on the first five training rows.
preprocess_function(df["train"][0:5])

{'input_values': [array([-0.00173472, -0.00494884,  0.01205471, ..., -1.2097611 ,
       -1.0718683 , -0.6381143 ], dtype=float32), array([0.00818899, 0.00566552, 0.02408708, ..., 2.414321  , 2.1192036 ,
       1.3998224 ], dtype=float32), array([9.3741231e-03, 6.9007995e-03, 1.9404785e-03, ..., 2.3660672e+00,
       2.3968172e+00, 2.5528994e+00], dtype=float32), array([ 0.14635505, -0.01547317, -0.15352328, ...,  0.07275707,
       -0.01500838, -0.08233505], dtype=float32), array([-0.07802973, -0.05082509, -0.09534813, ..., -0.85056   ,
       -0.7569442 , -0.49389425], dtype=float32)], 'intent_class': tensor([ 7, 13,  3,  1, 10])}

In [15]:
# Run the preprocessing over every split in batches and drop the raw
# `audio` column from the result.
encoded_dataset = df.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)
encoded_dataset

Map:   0%|          | 0/6534 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['intent_class', 'input_values'],
        num_rows: 6534
    })
    test: Dataset({
        features: ['intent_class', 'input_values'],
        num_rows: 1634
    })
})

In [16]:
# Rename the target column to `label`. The guard keeps the cell re-runnable:
# the unguarded call raised on a second run because `intent_class` was gone.
if "intent_class" in encoded_dataset["train"].column_names:
    encoded_dataset = encoded_dataset.rename_column("intent_class", "label")
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 6534
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 1634
    })
})

In [17]:
!pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [18]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the `accuracy` metric.

    The predicted class of each row is the index of the largest value along
    axis 1 of ``eval_pred.predictions``; it is compared against
    ``eval_pred.label_ids``.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions/references.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)


In [19]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# The lookup tables built earlier store ids as strings; the model config
# documents `label2id` as label -> int (and `id2label` as int -> label),
# so convert at the boundary instead of storing string ids in the config.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id={label: int(idx) for label, idx in label2id.items()},
    id2label={int(idx): label for idx, label in id2label.items()},
)




pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.weight', 'projector.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# import torch

# class MyTrainer(Trainer):
#     def compute_loss(self, model, inputs):
#         labels = inputs.pop("intent_class")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss = torch.nn.CrossEntropyLoss()(logits, labels)
#         return loss

In [35]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Hasil',
    # Evaluate and checkpoint on the same per-epoch schedule, which
    # `load_best_model_at_end` relies on. The former `eval_steps=2` was
    # removed: it only applies to the "steps" strategy and had no effect here.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=4e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 16 x 4 = 64 samples per optimizer step, per device
    per_device_eval_batch_size=16,
    weight_decay=0.01,  # regularization
    num_train_epochs=16,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [36]:
# Wire the model, both splits and the metric callback into a Trainer. The
# feature extractor is handed over through the `tokenizer` argument.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    eval_dataset=encoded_dataset["test"],
    train_dataset=encoded_dataset["train"],
)

In [37]:
# Launch fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss,Accuracy
0,2.4702,2.688329,0.095471
1,2.5862,2.667448,0.089351
2,2.5311,2.661681,0.085067
4,2.5057,2.702521,0.083843
5,2.4544,2.727803,0.079559
6,2.3645,2.750038,0.086903
8,2.2467,2.813452,0.076499
9,2.217,2.831311,0.089351
10,2.0995,2.889117,0.080171
12,2.0148,2.93935,0.076499




TrainOutput(global_step=1632, training_loss=2.2679529793414415, metrics={'train_runtime': 3581.1698, 'train_samples_per_second': 29.193, 'train_steps_per_second': 0.456, 'total_flos': 9.46914105969792e+17, 'train_loss': 2.2679529793414415, 'epoch': 15.96})