## load dataset

In [None]:
from IPython.display import clear_output
import os

In [None]:
!pwd

In [None]:
# Download the dev archive of the CORAA-v1.1 dataset only if it is not already on disk.
if not os.path.exists("dev.zip"):
  !wget "https://huggingface.co/datasets/gabrielrstan/CORAA-v1.1/resolve/main/dev.zip" -P "."

# Extract the archive only when the dev/ directory does not exist yet.
if not os.path.isdir("dev/"):
  !unzip "dev.zip" -d "."

# Confirm that extraction produced dev/; otherwise stop the notebook here.
if os.path.isdir("dev/"):
  clear_output()  # hide the verbose wget/unzip output
  print("data unzipped")
else:
  # !rm /content/dev.zip
  raise Exception("Cannot unzip.")

In [None]:
# Download the dev-split metadata CSV only if it is not already on disk.
if not os.path.exists("metadata_dev_final.csv"):
  !wget "https://huggingface.co/datasets/gabrielrstan/CORAA-v1.1/resolve/main/metadata_dev_final.csv" -P "."

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('metadata_dev_final.csv')
df.head()

In [None]:
df[['up_votes', 'down_votes']].describe()

## manipulate dataset

In [None]:
# Keep only the rows that received at least one up-vote and no down-votes,
# then renumber the index from zero.
temp_df = (
    df.loc[(df['up_votes'] > 0) & (df['down_votes'] == 0)]
    .reset_index(drop=True)
)
temp_df

In [None]:
# Keep the path and hesitation-vote columns, drop rows missing either value,
# and derive a binary label: 1 when at least one vote flagged hesitation, else 0.
df_hesitation = (
    temp_df[['file_path', 'votes_for_hesitation']]
    .dropna()
    .assign(has_hesitation=lambda frame: (frame['votes_for_hesitation'] > 0).astype(int))
)
df_hesitation

In [None]:
import librosa

# keep only the audios that are longer than a minimum duration

MINIMUM_DURATION = 5 # seconds

def audiofile_duration(file_path: str) -> float:
  """
  Return the duration, in seconds, of the audio file at `file_path`.

  file_path: path of an audio file that librosa can load
  """
  # sr=None keeps the file's native sampling rate, so no resampling pass is
  # spent on a waveform that is only used to measure its length.
  waveform, sample_rate = librosa.load(file_path, sr=None)
  return librosa.get_duration(y=waveform, sr=sample_rate)

def get_df_with_minimum_duration(df:pd.DataFrame, minimum_duration:float) -> pd.DataFrame:
  """
  Return the rows of `df` whose audio lasts longer than `minimum_duration`.

  df: dataframe containing 'file_path'
  minimum_duration: minimum audio duration, in seconds (rows with exactly
  this duration are dropped, because the comparison is strict)

  return: a new dataframe with an extra 'audio_duration' column and a fresh
  0..n-1 index; the dataframe passed in is not modified
  """
  durations = df['file_path'].map(audiofile_duration)
  # assign() builds a new frame, so the caller's dataframe is left untouched
  with_duration = df.assign(audio_duration=durations)
  long_enough = with_duration[with_duration['audio_duration'] > minimum_duration]
  return long_enough.reset_index(drop=True)

df_hesitation = get_df_with_minimum_duration(df_hesitation, MINIMUM_DURATION)
df_hesitation

In [None]:
df_hesitation['has_hesitation'].value_counts()

In [None]:
from random import sample

# Balance the two classes by randomly undersampling whichever one is larger.
# Counting with boolean masks yields 0 for a class that is absent, instead of
# the KeyError that value_counts()[label] raises.
has_hesitation_count = int((df_hesitation['has_hesitation'] == 1).sum())
not_has_hesitation_count = int((df_hesitation['has_hesitation'] == 0).sum())
if has_hesitation_count == 0 or not_has_hesitation_count == 0:
  raise ValueError("Cannot balance the dataset: both has_hesitation classes must be present.")

# Pick the majority class explicitly so the sample size can never be negative.
majority_label = 0 if not_has_hesitation_count >= has_hesitation_count else 1
surplus = abs(not_has_hesitation_count - has_hesitation_count)
majority_rows = list(df_hesitation[df_hesitation['has_hesitation'] == majority_label].index)
to_remove = sample(majority_rows, surplus)
df_hesitation = df_hesitation.drop(to_remove).reset_index(drop=True)

In [None]:
df_hesitation['has_hesitation'].value_counts()

In [None]:
df_hesitation['audio_duration'].describe()

## setting model

In [None]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from transformers import AutoProcessor, AutoModelForAudioClassification
import torch
from torch import nn

In [None]:
from transformers import Wav2Vec2FeatureExtractor

In [None]:
# Fall back to the CPU when no GPU is available, the same way the training
# functions select their device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# Feature extractor for raw 16 kHz mono waveforms: normalizes each input,
# pads on the right with zeros and returns an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, padding_side="right", sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
# processor = AutoProcessor.from_pretrained("alefiury/wav2vec2-xls-r-300m-pt-br-spontaneous-speech-emotion-recognition")

def get_model(dropout: float, n_classes: int = 2):
  """
  Load the pretrained audio classifier and replace its head with a new
  trainable one.

  dropout: dropout probability applied right before the new linear layer
  n_classes: number of outputs of the new head (defaults to 2)

  return: the model with every pretrained parameter frozen; only the
  parameters of the new `classifier` head require gradients
  """
  model = AutoModelForAudioClassification.from_pretrained("alefiury/wav2vec2-xls-r-300m-pt-br-spontaneous-speech-emotion-recognition")

  # The new head reuses the input width of the head it replaces.
  # (For the MIT/ast-finetuned-audioset-10-10-0.4593 checkpoint the layer to
  # replace is model.classifier.dense instead of model.classifier.)
  dense_in_features = model.classifier.in_features
  model.classifier = nn.Sequential(
      nn.Dropout(dropout, inplace=True),
      nn.Linear(in_features=dense_in_features, out_features=n_classes),
  )

  # Freeze everything first, then re-enable gradients for the new head only.
  for param in model.parameters():
    param.requires_grad = False

  for param in model.classifier.parameters():
    param.requires_grad = True

  return model

## setting dataset to torch dataset

In [None]:
import librosa
from torch.nn.functional import cross_entropy

def get_features(file_paths:list, max_length:int = 10000):
  """
  Load audio files and convert them into padded inputs for the model.

  file_paths: iterable of audio file paths
  max_length: maximum number of samples kept per waveform; longer waveforms
  are truncated to this length

  return: the output of the global `feature_extractor` as PyTorch tensors
  (it contains 'input_values')
  """
  sampling_rate = 16_000
  # Ask librosa for the target rate directly. Loading at librosa's default
  # rate and resampling afterwards would resample every file twice.
  waveforms = [librosa.load(file_path, sr=sampling_rate)[0] for file_path in file_paths]

  # NOTE(review): the default max_length of 10000 samples is about 0.6 s at
  # 16 kHz, so only the start of each clip reaches the model - confirm this
  # is intended.
  features = feature_extractor(waveforms, sampling_rate=sampling_rate, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

  return features

In [None]:
features = get_features(df_hesitation['file_path'])
features['input_values'].shape

In [None]:
# Hand torch a plain NumPy array rather than a pandas Series, so the
# conversion does not depend on the dataframe's index.
features['labels'] = torch.tensor(df_hesitation['has_hesitation'].to_numpy(), dtype=torch.int64)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features['input_values'], features['labels'], test_size=0.2, random_state=1)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)

## training model

In [None]:
def eval_model(model) -> tuple:
  """
  Evaluate `model` on the global `test_dataloader`, using the global `device`.

  return:
  loss: mean cross-entropy per sample (float)
  accuracy: fraction of samples classified correctly (float)
  """
  model.eval()

  running_loss = 0.0
  running_corrects = 0
  n_samples = 0
  with torch.no_grad():
    for inputs, labels in test_dataloader:
      inputs, labels = inputs.to(device), labels.to(device)
      logits = model(inputs).logits
      batch_size = labels.size(0)

      # cross_entropy returns the batch mean; weighting it by the batch size
      # keeps a smaller final batch from skewing the overall mean.
      running_loss += cross_entropy(logits, labels).item() * batch_size
      running_corrects += (torch.argmax(logits, dim=-1) == labels).sum().item()
      n_samples += batch_size

  # Divide by the samples actually seen, not len(loader) * batch_size, which
  # over-counts when the last batch is partial.
  loss = running_loss / n_samples
  accuracy = running_corrects / n_samples
  return loss, accuracy

In [None]:
import ray

put_train = ray.put(train_dataloader)
put_test = ray.put(test_dataloader)

In [None]:
for i, (inputs, labels) in enumerate(test_dataloader):
    inputs, labels = inputs.to(device), labels.to(device)

In [None]:
labels

In [None]:
def train_func(model, dataloader, optimizer, exp_lr_scheduler, clip_value):
  """
  Train `model` for one pass over `dataloader`.

  model: module whose forward output exposes `.logits` and which has a
  `classifier` submodule (its gradients are the ones that get clipped)
  dataloader: iterable of (data, target) batches
  optimizer: optimizer stepped once per batch
  exp_lr_scheduler: learning-rate scheduler stepped once, after the pass
  clip_value: maximum gradient norm for the classifier parameters

  return: dict with the per-sample "mean_loss" and the "mean_accuracy"
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.train()

  seen = 0
  loss_sum = 0
  hits = 0
  for data, target in dataloader:
    data = data.to(device)
    target = target.to(device)

    optimizer.zero_grad()

    output = model(data).logits
    loss = cross_entropy(output, target)

    # the loss is a batch mean, so scale it back to a per-sample sum
    batch_size = output.size(0)
    seen += batch_size
    loss_sum += loss.item() * batch_size

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.classifier.parameters(), clip_value)
    optimizer.step()

    # accuracy, measured on the logits computed before this step's update
    predicted = output.data.max(dim=1).indices
    hits += (predicted == target).sum().item()

  exp_lr_scheduler.step()

  return {
      "mean_loss": loss_sum / seen,
      "mean_accuracy": hits / seen,
  }

def test_func(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0
    running_loss = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(dataloader):

            data, target = data.to(device), target.to(device)
            outputs = model(data).logits

            # accuracy
            _, predicted = torch.max(outputs.data, 1)
            # _, correct_class = torch.max(target.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

            # loss
            running_loss += cross_entropy(outputs, target).item() * outputs.size(0)

    return {
        "mean_loss": running_loss / total,
        "mean_accuracy": correct / total,
    }

In [None]:
import os
import tempfile

from torch import optim

from ray import train
from ray.train import Checkpoint

def train_hesitation(config, max_epochs=30, tunning=True):
    """
    Train the hesitation classifier with the hyper-parameters in `config`.

    config: dict with the keys 'classifier_dropout', 'lr', 'momentum',
        'weight_decay', 'nesterov', 'lr_scheduler_gamma' and 'clip_value'
    max_epochs: number of train/validation epochs to run
    tunning: when True, report the metrics of every epoch to Ray Tune and
        return nothing; when False, print them and return the trained model

    return: None when `tunning` is True; otherwise a dict with the "model"
        and a "log" holding the last epoch's "train" and "val" metrics
    """
    # The dataloaders are fetched from the Ray object store.
    train_dataloader = ray.get(put_train)
    test_dataloader = ray.get(put_test)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = get_model(config['classifier_dropout'])
    model.to(device)

    # Only the classifier head is handed to the optimizer.
    sgd_options = {
        "lr": config["lr"],
        "momentum": config["momentum"],
        "weight_decay": config['weight_decay'],
        "nesterov": config['nesterov'],
    }
    optimizer = optim.SGD(model.classifier.parameters(), **sgd_options)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=2,
        gamma=config['lr_scheduler_gamma'],
    )

    for i in range(max_epochs):
        train_log = train_func(model, train_dataloader, optimizer, exp_lr_scheduler, config['clip_value'])
        val_log = test_func(model, test_dataloader)

        if not tunning:
            print("-"*10, f"epoch: {i+1}/{max_epochs}","-"*10)
            print(f"train: {train_log}\nval: {val_log}")
            continue

        metrics = {
            "train_mean_loss": train_log["mean_loss"],
            "train_mean_accuracy": train_log["mean_accuracy"],
            "val_mean_loss": val_log["mean_loss"],
            "val_mean_accuracy": val_log["mean_accuracy"],
        }
        is_last_epoch = (i + 1) % max_epochs == 0

        # The report happens inside the `with` so the checkpoint directory
        # still exists while Tune receives it.
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            checkpoint = None
            if is_last_epoch and val_log["mean_loss"] < 0.4:
                # Save the weights only on the final epoch of a good enough run.
                weights_path = os.path.join(temp_checkpoint_dir, "model.pth")
                torch.save(model.state_dict(), weights_path)
                checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

            # Send the current training result back to Tune
            train.report(metrics, checkpoint=checkpoint)

    if tunning:
        return None

    return {
        "model": model,
        "log": {
            "train": train_log,
            "val": val_log,
        },
    }

In [None]:
max_cpus = os.cpu_count()
max_gpus = torch.cuda.device_count()
max_cpus, max_gpus

In [None]:
from ray import tune

from ray.tune.search.optuna import OptunaSearch

from ray.tune.schedulers import ASHAScheduler


# Hyper-parameter search space; the keys are the ones train_hesitation reads
# from its `config` argument.
config = {
    "lr": tune.loguniform(1e-5, 1e-0),
    "momentum": tune.uniform(0.1, 0.9),
    "classifier_dropout": tune.uniform(0.3, 0.7),
    "weight_decay": tune.loguniform(1e-6, 1e-2),
    # upper bound written as 5+1, presumably because randint excludes it - confirm
    "clip_value": tune.randint(1, 5+1),
    "lr_scheduler_gamma": tune.uniform(0.5, 1.0),
    #"image_net": tune.choice(["IMAGENET1K_V1", "IMAGENET1K_V2"]),
    "nesterov": tune.choice([True, False]),
    #"transfer_model": tune.choice(["resnet152", "mobiletenet_v3_large"])
}

# Objective shared by the search algorithm and the scheduler: minimize the
# validation loss reported by train_hesitation as "val_mean_loss".
metric = "val_mean_loss"
mode = "min"

optuna_search = OptunaSearch(
    metric=metric,
    mode=mode,
    # points_to_evaluate = curr_best_params,
)

# NOTE(review): max_t=5 presumably stops every trial after 5 reported epochs,
# while train_hesitation defaults to max_epochs=30 and only saves a checkpoint
# on its final epoch - confirm the checkpoint branch can ever be reached.
asas_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    metric=metric,
    mode=mode,
    max_t=5,
    grace_period=1,
    reduction_factor=3,
    brackets=2
)

# Each trial requests max_cpus CPUs and max_gpus GPUs, so presumably trials
# run one at a time - confirm.
trainable_with_resources = tune.with_resources(train_hesitation, {"cpu": max_cpus, "gpu": max_gpus})

tuner = tune.Tuner(
    trainable_with_resources,
    tune_config=tune.TuneConfig(
        num_samples=100,
        search_alg=optuna_search,
        scheduler=asas_scheduler
    ),
    param_space=config,
)
# Launches the search; `results` holds the outcome of every trial.
results = tuner.fit()

In [None]:
results.get_dataframe().to_csv("raytune_results.csv")

In [None]:
df_results = results.get_dataframe()
df_results.head()

In [None]:
from sklearn.metrics import f1_score

# Histogram of the validation accuracy reported by the trials.
df_results['val_mean_accuracy'].plot(kind='hist', title='val_mean_accuracy hist')

In [None]:
best_result = results.get_best_result("val_mean_accuracy", mode="max")
best_result.metrics

In [None]:
best_result = results.get_best_result("val_mean_loss", mode="min")
best_result.metrics

In [None]:
import json

with open("best_result.json", 'w') as f:
    json.dump(best_result.config, f, default=str)

In [None]:
best_config_train_model = train_hesitation(best_result.config, max_epochs=8, tunning=False)

## evaluating model

In [None]:
model = best_config_train_model['model']

In [None]:
# Run the trained model over the whole test set, collecting the true labels
# and the predicted class of every sample.
model.eval()
all_labels, all_preds = [], []
with torch.no_grad():
  for inputs, labels in test_dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    logits = model(inputs).logits
    all_labels.append(labels)
    all_preds.append(logits.argmax(dim=-1))

# Merge the per-batch tensors and move them to the CPU for scikit-learn.
all_labels = torch.cat(all_labels).cpu()
all_preds = torch.cat(all_preds).cpu()

all_labels.shape, all_preds.shape

In [None]:
classes_names = ['ausent', 'hesitation']

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score

In [None]:
cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes_names)
disp.plot()
plt.show()
print("\n")
print(classification_report(all_labels, all_preds, target_names=classes_names))
print("\naccuracy:", accuracy_score(all_labels, all_preds))