## 安装工具包

In [None]:
%%capture
# Install the notebook's dependencies into the runtime; %%capture hides pip's output.
# NOTE(review): nothing is version-pinned (datasets is installed from the git
# repository and several packages use -U), so a fresh run may pull newer,
# incompatible releases.
!pip install git+https://github.com/huggingface/datasets.git
!pip install jiwer
!pip install torchaudio
#!pip install librosa
!pip install  pandas
!pip install evaluate
#!pip install torch
!pip install -U torch-summary
!pip install -U accelerate
!pip install -U transformers
!pip install scikit-learn

In [None]:
# Mount Google Drive at /content/drive; later cells read datasets from and save
# models to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

## 从 Google Drive 加载数据集

In [None]:
# Show the GPU attached to this runtime.
!nvidia-smi

In [None]:
# Option A: copy the pre-processed "iemocap4_longest_nopadding" splits from Drive
# to local disk and load them.
# NOTE(review): the following cell copies a different dataset (iemocap4_5s) to the
# same /content/sample_data/*.hf locations and rebinds the same variables, so run
# only one of the two cells.
!cp -r /content/drive/MyDrive/iemocap4_longest_nopadding/test.hf /content/sample_data
!cp -r /content/drive/MyDrive/iemocap4_longest_nopadding/train.hf /content/sample_data
data_path = "/content/drive/MyDrive/iemocap4_longest_nopadding"
from datasets import load_from_disk
tokenized_test_dataset = load_from_disk("/content/sample_data/test.hf")
tokenized_train_dataset = load_from_disk("/content/sample_data/train.hf")

In [None]:
# Option B: copy the pre-processed "iemocap4_5s" splits from Drive to local disk
# and load them.
# NOTE(review): this writes to the same /content/sample_data/*.hf locations and
# rebinds the same variables as the "iemocap4_longest_nopadding" cell before it;
# run only one of the two.
!cp -r /content/drive/MyDrive/iemocap4_5s/test.hf /content/sample_data
!cp -r /content/drive/MyDrive/iemocap4_5s/train.hf /content/sample_data
data_path = "/content/drive/MyDrive/iemocap4_5s"
from datasets import load_from_disk
tokenized_test_dataset = load_from_disk("/content/sample_data/test.hf")
tokenized_train_dataset = load_from_disk("/content/sample_data/train.hf")

In [None]:
# Reload the splits that are already present under /content/sample_data
# (no copy from Drive is performed here).
from datasets import load_from_disk
tokenized_test_dataset = load_from_disk("/content/sample_data/test.hf")
tokenized_train_dataset = load_from_disk("/content/sample_data/train.hf")

In [None]:
# Show the tensor type of the first training example's input_values.
tokenized_train_dataset[0]["input_values"].type()

In [None]:
# Spot-check the input_values size of a few examples (hard-coded indices), then
# list the columns of the test split.
print(tokenized_train_dataset[0]["input_values"].size())
print(tokenized_train_dataset[4289]["input_values"].size())
print(tokenized_test_dataset[160]["input_values"].size())
print(tokenized_test_dataset[1240]["input_values"].size())
tokenized_test_dataset.column_names

### 处理数据

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Copy the raw (not yet tokenized) splits from Drive to local disk.
# NOTE(review): these land on the same /content/sample_data/*.hf locations that the
# loading cells above use for the pre-processed splits.
!cp -r /content/drive/MyDrive/test.hf /content/sample_data
!cp -r /content/drive/MyDrive/train.hf /content/sample_data

from datasets import load_from_disk
test_dataset = load_from_disk("/content/sample_data/test.hf")
train_dataset = load_from_disk("/content/sample_data/train.hf")

In [None]:
import torch
import librosa
from datasets import load_dataset
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("wofeishenling/autotrain-iemocap_text_4-39809103601")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")


def _extract_audio_features(example):
    """Run the audio feature extractor on one example's "speech" field.

    The waveform is padded/truncated to 5 * feature_extractor.sampling_rate
    samples and an attention mask is requested alongside the input values.
    """
    return feature_extractor(
        example["speech"],
        sampling_rate=16000,
        max_length=int(feature_extractor.sampling_rate * 5),
        truncation='max_length',
        padding='max_length',
        return_tensors="pt",
        return_attention_mask=True,
    )


def _tokenize_transcription(example):
    """Tokenize one example's "transcription", padded/truncated to 128 tokens."""
    return tokenizer(example["transcription"], truncation=True, padding='max_length', max_length=128)


def _prepare_split(split):
    """Apply the full audio + text preprocessing to one dataset split.

    The audio attention mask is renamed to "attention_mask_audio" before the
    text tokenizer runs, so the tokenizer's own "attention_mask" column does
    not collide with it. Raw columns are dropped and "label" becomes "labels".
    """
    split = split.map(_extract_audio_features)
    split = split.rename_column("attention_mask", "attention_mask_audio")
    split = split.map(_tokenize_transcription)
    split = split.remove_columns(["file","audio","transcription","speech"])
    return split.rename_column("label", "labels")


tokenized_train_dataset = _prepare_split(train_dataset)
tokenized_test_dataset = _prepare_split(test_dataset)

In [None]:
# List the columns of the processed test split.
tokenized_test_dataset.column_names

In [None]:
# Switch both splits to the "torch" output format.
tokenized_test_dataset.set_format("torch")
tokenized_train_dataset.set_format("torch")

In [None]:
def reshape_tensor(example):
    """Remove the leading axis from an example's audio tensors.

    Both "input_values" and "attention_mask_audio" are replaced in place by
    their ``squeeze(0)`` result, and the same ``example`` mapping is returned.
    """
    for field in ("input_values", "attention_mask_audio"):
        example[field] = example[field].squeeze(0)
    return example
# Apply reshape_tensor to every example of both splits.
tokenized_train_dataset = tokenized_train_dataset.map(reshape_tensor)
tokenized_test_dataset = tokenized_test_dataset.map(reshape_tensor)

In [None]:
# tokenized_test_dataset.column_names
# Spot-check the input_values size of a few examples after reshaping.
print(tokenized_train_dataset[0]["input_values"].size())
print(tokenized_train_dataset[1]["input_values"].size())
print(tokenized_test_dataset[160]["input_values"].size())
print(tokenized_test_dataset[161]["input_values"].size())

In [None]:
# Persist the processed splits to Drive; the "iemocap4_5s" loading cell near the
# top of the notebook reads them back from this location.
tokenized_test_dataset.save_to_disk("/content/drive/MyDrive/iemocap4_5s/test.hf")
tokenized_train_dataset.save_to_disk("/content/drive/MyDrive/iemocap4_5s/train.hf")

In [None]:
# Length of every training clip, obtained by dividing its sample count by 16000.
len_list = [
    int(values.size()[0]) / 16000
    for values in tokenized_train_dataset['input_values']
]
# Number of clips whose computed length exceeds 15.
cnt = sum(1 for seconds in len_list if seconds > 15)
print(cnt)

## TEXT-ONLY

In [None]:
from transformers import AutoProcessor, AutoModelForAudioClassification, AutoModelForPreTraining, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch.nn as nn
from transformers import AutoTokenizer
import torch
import librosa
from transformers import AutoModel
from transformers import BertModel
import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch.nn.init as init
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class text_only_model(nn.Module):
    """Text-only 4-class classifier: a BERT encoder followed by a linear head.

    The head reads the first-token vector of ``hidden_states[12]`` returned by
    the encoder and maps its 768 features to 4 logits.

    NOTE: ``self.dropout`` is created but is not applied anywhere in
    ``forward``.
    """

    def __init__(self):
        super(text_only_model, self).__init__()
        # Select and load the base model.
        self.bert_base = BertModel.from_pretrained("bert-base-uncased", output_hidden_states = True)
        #self.bert_base = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/bert_12layers", output_hidden_states=True)
        #self.bert_base = AutoModelForSequenceClassification.from_pretrained("JerryM/distilbert-base-uncased-finetuned-emotion",output_hidden_states=True)
        #self.bert_base = AutoModelForSequenceClassification.from_pretrained("wofeishenling/autotrain-iemocap_text_4-39809103601", output_hidden_states = True)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 4)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        #token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        """Classify a batch of tokenized text.

        Args:
            input_ids, attention_mask, head_mask, inputs_embeds,
            output_attentions: forwarded to the encoder unchanged.
            position_ids: accepted for signature compatibility but not
                forwarded to the encoder.
            labels: optional class indices; when given, a cross-entropy loss
                is computed over the 4 logits.
            output_hidden_states, return_dict: accepted for signature
                compatibility. The encoder is always asked for hidden states
                in dict form, because the classifier input is read from
                ``outputs.hidden_states``.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise
            ``(logits,)``.
        """
        outputs = self.bert_base(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            # Forced on: the head below cannot work without hidden states.
            output_hidden_states=True,
            return_dict=True,
        )
        hidden_states = outputs.hidden_states

        # Use the first-token vector of hidden state 12 as the classifier input.
        logits = self.classifier(hidden_states[12][:, 0, :])

        # Without labels (pure inference) there is nothing to compute a loss on.
        if labels is None:
            return (logits,)

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 4), labels.view(-1))
        return (loss, logits)


In [None]:
# Instantiate the model.
model = text_only_model()

In [None]:
# Freeze the BERT encoder's parameters so that they are not updated during training.
for param in model.bert_base.parameters():
    param.requires_grad = False

In [None]:
# Print a summary of the model via torchsummary.
import torchsummary
torchsummary.summary(model)

### Trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Configure training.
class CustomTrainer(Trainer):
    """Trainer that builds its own AdamW optimizer with per-group learning rates."""

    def create_optimizer(self):
        """Create, store and return the AdamW optimizer.

        The classifier head always gets its own parameter group (lr 1e-3).
        The ``w1`` / ``w2`` attributes are added as a second group (lr 0.005)
        only when the wrapped model defines them; ``text_only_model`` does
        not, and reading them unconditionally raised AttributeError.

        Returns:
            The optimizer, which is also stored on ``self.optimizer``.
        """
        weight_decay = self.args.weight_decay
        optimizer_grouped_parameters = [
            {
                "params": self.model.classifier.parameters(),
                "lr": 1e-3,
                "weight_decay": weight_decay,
            },
        ]

        # Optional weights: include whichever of them exist on the model.
        extra_weights = [
            getattr(self.model, name) for name in ("w1", "w2") if hasattr(self.model, name)
        ]
        if extra_weights:
            optimizer_grouped_parameters.append(
                {
                    "params": extra_weights,
                    "lr": 0.005,
                    "weight_decay": weight_decay,
                }
            )

        self.optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate)
        return self.optimizer

# Training configuration for the text-only run: evaluate once per epoch,
# batch size 16 for both training and evaluation, 10 epochs.
# CustomTrainer.create_optimizer reads weight_decay and learning_rate from here;
# each of its parameter groups sets its own "lr".
# NOTE(review): newer transformers releases use `eval_strategy` in place of
# `evaluation_strategy`; the install cell upgrades transformers, so confirm this
# keyword is still accepted by the installed version.
training_args = TrainingArguments(
    output_dir="test_trainer", evaluation_strategy="epoch",
    learning_rate = 5e-5,
    weight_decay = 0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    )

In [None]:
import numpy as np
import evaluate
# Evaluation metric: accuracy.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        logits row against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# Build the trainer for the text-only model; the test split is used as the
# evaluation set.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer built in the previous cell.
trainer.train()

### 自定义train





In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler

# Number of passes over the training set in the hand-written loop.
num_epochs = 3

# Batches of 8; only the training loader shuffles.
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear schedule without warm-up, spanning every optimizer step of the run.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from tqdm.notebook import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Pass every tensor by keyword: the third positional parameter of
        # text_only_model.forward is position_ids, not labels, so passing the
        # labels positionally left labels=None and made the loss computation fail.
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs[0]
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Loss of the last batch of the epoch that just finished.
    print(loss)

In [None]:
import inspect

import evaluate

metric = evaluate.load("accuracy")

# The dataset carries columns for both modalities (e.g. input_values,
# attention_mask_audio, token_type_ids). Forward only the ones that
# model.forward() declares; passing the rest raises
# "unexpected keyword argument".
accepted_keys = set(inspect.signature(model.forward).parameters)

model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items() if k in accepted_keys}
    with torch.no_grad():
        outputs = model(**batch)

    # With labels supplied the model returns (loss, logits).
    logits = outputs[1]
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

## AUDIO-ONLY

In [None]:
from transformers import AutoProcessor, AutoModelForAudioClassification, AutoModelForPreTraining, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch.nn as nn
import torch
import librosa
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import torch.nn.init as init
from transformers import AutoModel
import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
class audio_only_model(nn.Module):
    """Audio-only 4-class classifier: a speech encoder followed by a linear head.

    The head reads the first time-step vector of ``hidden_states[1]`` returned
    by the encoder and maps its 768 features to 4 logits.

    NOTE: ``self.dropout`` is created but is not applied anywhere in
    ``forward``.
    """

    def __init__(self):
        super(audio_only_model, self).__init__()
        # Select and load the base model.
        #--------------local_model---------------
        #self.model = HubertForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/hubert", output_hidden_states = True)
        #self.model = Wav2Vec2ForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/wav2vec2_superb", output_hidden_states = True)
        #self.model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-er", output_hidden_states = True)
        #self.model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base-960h", output_hidden_states = True)
        self.model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base", output_hidden_states = True)
        #self.model = Wav2Vec2ForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/wav2vec2_superb_12layers", output_hidden_states = True)
        #self.model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er", output_hidden_states = True)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(768, 4)

    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask_audio: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        """Classify a batch of audio inputs.

        Args:
            input_values: audio inputs, forwarded to the encoder.
            attention_mask_audio: forwarded to the encoder as
                ``attention_mask``.
            labels: optional class indices; when given, a cross-entropy loss
                is computed over the 4 logits.
            token_type_ids, position_ids, head_mask, inputs_embeds,
            output_attentions, output_hidden_states, return_dict: accepted
                for signature compatibility and not used.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise
            ``(logits,)``.
        """
        outputs = self.model(input_values, attention_mask=attention_mask_audio)
        hidden_states = outputs.hidden_states

        # Use the first time-step vector of hidden state 1 as the classifier input.
        logits = self.classifier(hidden_states[1][:, 0, :])

        # Without labels (pure inference) there is nothing to compute a loss on.
        if labels is None:
            return (logits,)

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 4), labels.view(-1))
        return (loss, logits)


In [None]:
# Instantiate the audio-only model.
m = audio_only_model()

In [None]:
# Freeze the wrapped base model's parameters (m.model); m.classifier is not touched here.
for param in m.model.parameters():
   param.requires_grad = False

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        logits row against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the audio-only run: evaluate once per epoch,
# learning rate 1e-3, batch size 16, 10 epochs, fp16 enabled.
# NOTE(review): newer transformers releases use `eval_strategy` in place of
# `evaluation_strategy`; the install cell upgrades transformers, so confirm this
# keyword is still accepted by the installed version.
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    learning_rate = 1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    fp16 = True,
    output_dir="wofeishenling/wofei",
    #push_to_hub=True,
    )

In [None]:
# Build a stock Trainer for the audio-only model `m`; the test split is used as
# the evaluation set. No data collator or callbacks are passed.
trainer = Trainer(
    model=m,
    #model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base-960h",
    #                        output_hidden_states = True,
    #                        num_labels = 4),
    args=training_args,
    # data_collator = data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3), SaveBestModelCallback()]
)

In [None]:
# Run training with the trainer built in the previous cell.
trainer.train()

In [None]:
# Save only the wrapped base model (m.model) to Drive; the classifier head
# (m.classifier) is not part of this save.
m.model.save_pretrained('/content/drive/MyDrive/myModel/wav2vec2_superb_12layers')

## FUSION MODEL

In [None]:
import torch
import librosa
from datasets import load_dataset
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

In [None]:
import torch
import torch.nn as nn
# Simple fusion.
class Concatenation(nn.Module):
    """Fuse audio and text features by concatenation plus a linear projection.

    Args:
        feature_size: width of each input feature vector. The projection maps
            ``2 * feature_size`` inputs back to ``feature_size`` outputs.
            Defaults to 768.
    """

    def __init__(self, feature_size=768):
        super(Concatenation, self).__init__()
        # Sized from feature_size rather than a hard-coded 768, so the
        # constructor argument actually takes effect.
        self.classifier = nn.Linear(feature_size * 2, feature_size)

    def forward(self, audio_features, text_features):
        """Concatenate the two inputs along dim 1 and project the result.

        Returns:
            The output of the linear projection applied to the concatenation.
        """
        cat_features = torch.cat((audio_features, text_features), dim=1)
        return self.classifier(cat_features)

In [None]:
import torch
import torch.nn as nn
d_model = 768
nhead = 8
dropout = 0.1
layer_norm_eps = 1e-5
dim_feedforward = 3072
# Fusion module based on an attention mechanism.
class CoAttention(nn.Module):
    """Fuse audio and text features with one shared cross-attention + FFN block.

    The same attention, layer-norm and feed-forward weights are used for both
    directions (audio attending to text and text attending to audio); the two
    results are averaged.

    Args:
        feature_size: width of the feature vectors. Defaults to 768, the value
            of the module-level ``d_model``. It must be divisible by ``nhead``.

    NOTE(review): FusionModel in this notebook passes 2-D ``[:, 0, :]`` slices.
    ``nn.MultiheadAttention`` treats a 2-D tensor as one unbatched sequence, so
    attention then runs across the rows of the batch — confirm this is intended.
    """

    def __init__(self, feature_size=768):
        super(CoAttention, self).__init__()
        # Layers are sized from feature_size so the constructor argument takes
        # effect. Creation order is kept so seeded initialisation is unchanged.
        self.self_attn = nn.MultiheadAttention(feature_size, nhead)
        self.norm1 = nn.LayerNorm(feature_size, eps=layer_norm_eps)
        self.norm2 = nn.LayerNorm(feature_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(dim_feedforward, feature_size)
        self.linear1 = nn.Linear(feature_size, dim_feedforward)

    def forward(self, af, tf):
        """Return the average of the two cross-attended feature sets.

        Args:
            af: audio features.
            tf: text features.

        Returns:
            ``(x + y) / 2`` where ``x`` is ``af`` attended over ``tf`` and
            ``y`` is ``tf`` attended over ``af``, each followed by the
            feed-forward block.

        NOTE(review): two further rounds of cross-attention were previously
        computed here and their results discarded; only the first round ever
        reached the return value. They are removed so the returned value is
        unchanged at a third of the cost. If a deeper stack was intended, the
        return value must be changed deliberately.
        """
        x = self._cross_block(af, tf)
        y = self._cross_block(tf, af)
        return (x + y) / 2

    def _cross_block(self, query, context):
        """Post-norm residual attention of query over context, then residual FFN."""
        attended = self.norm1(query + self._sa_block(query, context, context))
        return self.norm2(attended + self._ff_block(attended))

    def _sa_block(self, q, k, v):
        """Attention of q over (k, v), followed by dropout."""
        x = self.self_attn(q, k, v, need_weights=False)[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x):
        """Two-layer feed-forward network with ReLU, followed by dropout."""
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)



In [None]:
from transformers import AutoProcessor, AutoModelForAudioClassification, AutoModelForPreTraining, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch.nn as nn
from transformers import AutoModel
from transformers import BertModel
import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
class FusionModel(nn.Module):
    """Audio + text 4-class classifier built on two encoders and a fusion block.

    For each (audio layer, text layer) pair in ``LAYER_PAIRS`` the first
    position of both hidden states is fused by ``self.fusion_model``; the
    fused vectors are averaged and classified by ``self.linear``.

    NOTE: ``self.dropout`` and ``self.linear1`` are created but are not used
    in ``forward``.
    """

    # (audio hidden-state index, text hidden-state index) pairs that are fused.
    LAYER_PAIRS = ((8, 1), (9, 2), (10, 3), (11, 4), (12, 5))

    def __init__(self):
        super(FusionModel, self).__init__()

        # Select the text base model and the audio base model.
        #self.text_model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states = True)
        #self.audio_model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base", output_hidden_states = True)
        self.audio_model = Wav2Vec2ForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/wav2vec2_superb", output_hidden_states = True)
        #self.audio_model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", output_hidden_states = True)
        self.text_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/myModel/bert_12layers", output_hidden_states=True)

        # Select the fusion mechanism.
        self.fusion_model = CoAttention()
        #self.fusion_model = Concatenation()
        self.dropout = nn.Dropout(0.1)
        self.linear1 = nn.Linear(768*2, 4)
        # Classification head: 768 fused features in, 4 class logits out.
        self.linear = nn.Linear(768, 4)

    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask_audio: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        """Classify a batch from its audio and text inputs.

        Args:
            input_values, attention_mask_audio: forwarded to the audio model.
            input_ids, attention_mask, token_type_ids: forwarded to the text
                model.
            labels: optional class indices; when given, a cross-entropy loss
                is computed over the 4 logits.
            position_ids, return_dict: accepted for signature compatibility
                and not used.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise
            ``(logits,)``.
        """
        outputs_audio = self.audio_model(input_values, attention_mask=attention_mask_audio)
        outputs_text  = self.text_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        audio_states = outputs_audio.hidden_states
        text_states = outputs_text.hidden_states

        # Layer-matching strategy: fuse each configured pair, then average.
        # The fused vectors are summed left to right in LAYER_PAIRS order.
        outputs_fusion = None
        for audio_idx, text_idx in self.LAYER_PAIRS:
            fused = self.fusion_model(
                audio_states[audio_idx][:, 0, :],
                text_states[text_idx][:, 0, :],
            )
            outputs_fusion = fused if outputs_fusion is None else outputs_fusion + fused
        outputs_fusion = outputs_fusion / float(len(self.LAYER_PAIRS))

        # Classify the fused features.
        logits = self.linear(outputs_fusion)

        # Without labels (pure inference) there is nothing to compute a loss on.
        if labels is None:
            return (logits,)

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 4), labels.view(-1))
        return (loss, logits)

In [None]:
# Instantiate the fusion model (this rebinds `model`, which earlier cells bound to the text-only model).
model = FusionModel()

In [None]:
# Freeze both encoders; the loops below leave fusion_model and the linear
# layers of FusionModel untouched.
for param in model.audio_model.parameters():
    param.requires_grad = False

for param in model.text_model.parameters():
    param.requires_grad = False

In [None]:
from transformers import TrainingArguments, Trainer, TrainerCallback

# Training configuration for the fusion run: evaluate once per epoch,
# batch size 16 for both training and evaluation, 10 epochs.
# NOTE(review): newer transformers releases use `eval_strategy` in place of
# `evaluation_strategy`; the install cell upgrades transformers, so confirm this
# keyword is still accepted by the installed version.
training_args = TrainingArguments(
    output_dir="test_trainer", evaluation_strategy="epoch",
    learning_rate = 5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10
    )

In [None]:
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        logits row against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
from transformers import Trainer, TrainingArguments
import torch

class CustomTrainer(Trainer):
    """Trainer whose AdamW optimizer uses one parameter group per trainable part."""

    def create_optimizer(self):
        """Create, store and return the AdamW optimizer.

        Two parameter groups are built: the ``linear`` head at lr 1e-3 and the
        ``fusion_model`` at the configured learning rate. Both use the
        configured weight decay.
        """
        decay = self.args.weight_decay
        base_lr = self.args.learning_rate

        head_group = {
            "params": self.model.linear.parameters(),
            "lr": 1e-3,
            "weight_decay": decay,
        }
        fusion_group = {
            "params": self.model.fusion_model.parameters(),
            "lr": base_lr,
            "weight_decay": decay,
        }

        self.optimizer = torch.optim.AdamW([head_group, fusion_group], lr=base_lr)
        return self.optimizer

In [None]:
class MyCallback(TrainerCallback):
    """Callback that saves the audio and text sub-models at the start of each epoch."""

    def on_epoch_begin(self, args, state, control, model, **kwargs):
        """Save both sub-models under a path tagged with ``state.epoch``, then print a marker."""
        epoch_tag = state.epoch
        model.audio_model.save_pretrained(f'/content/drive/MyDrive/myModel/skew4/wav2vec2_{epoch_tag}_epoch')
        model.text_model.save_pretrained(f'/content/drive/MyDrive/myModel/skew4/bert_{epoch_tag}_epoch')
        print(f"save{epoch_tag}")

In [None]:
# Build the trainer for the fusion model; the test split is used as the
# evaluation set.
# NOTE(review): no `callbacks` argument is passed, so MyCallback defined above is
# not registered with this trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    # data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    #tokenizer = feature_extractor,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer built in the previous cell.
trainer.train()

In [None]:
# Release cached GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()