# ASR to Bert Sentiment Analysis

In [1]:
# Installation des bibliothèques nécessaires
!pip install torch torchaudio transformers huggingface_hub
!pip install speechbrain
!pip install tqdm
!pip install wandb

Collecting speechbrain
  Downloading speechbrain-0.5.15-py3-none-any.whl (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.8/553.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Installing collected packages: hyperpyyaml, speechbrain
Successfully installed hyperpyyaml-1.2.2 speechbrain-0.5.15


In [2]:
# Connexion sur huggingface hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Première Partie
### Mise en place du modèle ASR

In [9]:
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Chargement du modèle
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-french"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Pretraitement de l'audio
audiopath = "/kaggle/input/voices/common_voice_fr_33153455.wav"
speech_array, sampling_rate = librosa.load(audiopath, sr=16_000)
inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)

# inference
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(f"La transcription de l'audio est {transcription}")


La transcription de l'audio est ['blessé pendant la guerre il est soigné à saumur puis à paris']


## Deuxieme partie
### Ici, il s'agit de faire le finetuning d'un modèle Bert avec les données français de allocine movies review afin de construire un modèle permettant d'analyser le sentiment.
### L'inférence (ASR et Bert) s'est faite avec un demo sur fastAPI

In [5]:
# Import les bibliothèques et modules nécessaires pour finetuner le modèle de Bert
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, AutoTokenizer, BertConfig
import huggingface_hub
from huggingface_hub import PyTorchModelHubMixin
from torch.optim import AdamW
import torch.nn as nn
from sklearn.model_selection import train_test_split
import pandas as pd
import json
from tqdm import tqdm
import wandb   # pour monitorer l'entrainement
import huggingface_hub



In [6]:
# Definir le fichier de configuration utilisé pour le modèle
config = {
    "model_name": "nlptown/bert-base-multilingual-uncased-sentiment",
    "max_length": 80,
    "trainfile": "/kaggle/input/allocine-movies-review/train.csv",
    "testfile": "/kaggle/input/allocine-movies-review/test.csv",
    "valfile": "/kaggle/input/allocine-movies-review/valid.csv",
    "batch_size": 10,
    "learning_rate": 2e-5,
    "n_epochs": 4,
    "n_classes": 1,
    "device": torch.device("cuda" if torch.cuda.is_available else "cpu")

}

# Définir la classe pour le charger et l'indexation des données
class MyDataset(Dataset):
    def __init__(self, csvfile, tokenizer_name, max_length):
        self.df = pd.read_csv(csvfile)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df['review'][index]
        label = self.df['polarity'][index]

        inputs = self.tokenizer(text=text, max_length = self.max_length, padding = 'max_length', truncation =True, return_tensors='pt')

        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'label': torch.tensor(label)

        }

# Method permettant de charger les données par batch
def dataloader(dataset, batch_size, shuffle):

    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

# Definir la classe du modèle
class SentimentAnalysisBertModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self):
        super(SentimentAnalysisBertModel, self).__init__()
        self.pretrained_model = BertModel.from_pretrained(config['model_name'])   # bert base 768 hidden state
        self.classifier = nn.Linear(768, config['n_classes'])  # MLP

    def forward(self, input_ids, attention_mask):

        output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)    # batch de 768
        output = self.classifier(output.last_hidden_state)

        return output

# Method pour l'entrainement des données
def train_step(model, train_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for data in tqdm(train_loader, total = len(train_loader)):
        input_ids = data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].to(device)
        label = data['label'].to(device)

        optimizer.zero_grad()

        output = model(input_ids, attention_mask)

        loss = loss_fn(output, label.unsqueeze(1))

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    return total_loss/len(train_loader)

# Method pour la validation les données
def validation_step(model, validation_loader, loss_fn, device):

    total_loss = 0
    correct_prediction = 0

    with torch.no_grad():
        for data in tqdm(validation_loader, total=len(validation_loader)):
            input_ids = data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].to(device)
            label = data['label'].to(device)

            output = model(input_ids, attention_mask)

            loss = loss_fn(output, label.unsqueeze(1))

            pred = torch.max(torch.softmax(output, dim=1), dim=1)

            total_loss += loss.item()

            correct_prediction += torch.sum(pred.indices==label)

    return total_loss/len(validation_loader), correct_prediction/len(validation_loader)

# Method pour faire le test du mod_le avec les données de test
def test_step(model, test_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_prediction = 0

    with torch.no_grad():
        for data in tqdm(test_loader, total=len(test_loader)):
            input_ids = data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].to(device)
            label = data['label'].to(device)

            output = model(input_ids, attention_mask)

            loss = loss_fn(output, label.unsqueeze(1))

            pred = torch.max(torch.softmax(output, dim=1), dim=1)

            total_loss += loss.item()

            correct_prediction += torch.sum(pred.indices == label)

    return total_loss / len(test_loader), correct_prediction / len(test_loader)



In [7]:
def main():

    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])

    wandb.init(project="asr-bert-sentiment-analysis")

    train_dataset = MyDataset(config['trainfile'], config['model_name'], config['max_length'])
    
    test_dataset = MyDataset(config['testfile'], config['model_name'], config['max_length'])
    
    val_dataset = MyDataset(config['valfile'], config['model_name'], config['max_length'])

    train_loader = dataloader(train_dataset, config['batch_size'], shuffle = True)

    validation_loader = dataloader(val_dataset, config['batch_size'], shuffle = False)

    test_loader = dataloader(test_dataset, config['batch_size'], shuffle=False)
    
    train_data = next(iter(train_loader))

    model = SentimentAnalysisBertModel()

    model.to(config['device'])

    #output = model(data['input_ids'].squeeze(1), data['attention_mask'])

    optimizer = AdamW(model.parameters(), lr = config['learning_rate'])

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config['n_epochs']):
        loss_train = train_step(model, train_loader, optimizer, loss_fn, config['device'])
        loss_validation, accuracy_validation = validation_step(model, validation_loader, loss_fn, config['device'])
        loss_test, accuracy_test = test_step(model, test_loader, loss_fn, config['device'])
        
        wandb.log({
            "loss_train": loss_train,
            "loss_validation": loss_validation,
            "accuracy_validation": accuracy_validation,
            "loss_test": loss_test,
            "accuracy_test": accuracy_test
        })
        
    # Push model to the Hub
    model.push_to_hub("Fatou/asr2bert-sentimentanalysis")
    tokenizer.push_to_hub("Fatou/asr2bert-sentimentanalysis")
    model.pretrained_model.config.push_to_hub("Fatou/asr2bert-sentimentanalysis")

    # Affichage des variables enregistrées dans wandb.log à la fin de l'exécution
    wandb.run.finish()  # Termine l'exécution WandB
    history = wandb.run.history()
        
        
    



In [8]:
if __name__ == '__main__':
    main()

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

100%|██████████| 16000/16000 [32:27<00:00,  8.22it/s]
100%|██████████| 2000/2000 [01:13<00:00, 27.03it/s]
100%|██████████| 2000/2000 [01:12<00:00, 27.46it/s]
 82%|████████▏ | 13108/16000 [26:47<05:52,  8.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 16000/16000 [32:42<00:00,  8.15it/s]
100%|██████████| 2000/2000 [01:14<00:00, 26.87it/s]
100%|██████████| 2000/2000 [01:13<00:00, 27.35it/s]
100%|██████████| 16000/16000 [32:55<00:00,  8.10it/s]
100%|██████████| 2000/2000 [01:17<00:00, 25.88it/s]
100%|██████████| 2000/2000 [01:16<00:00, 26.31it/s]
100%|██████████| 16000/16000 [32:30<00:00,  8.20it/s]
100%|██████████| 2000/2000 [01:14<00:00, 26.94it/s]
100%|██████████| 2000/2000 [01:12<00:00, 2

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

0,1
accuracy_test,▄█▁▄
accuracy_validation,█▇▁█
loss_test,▂▁█▆
loss_train,█▅▃▁
loss_validation,█▁█▅

0,1
accuracy_test,54.355
accuracy_validation,54.415
loss_test,0.22209
loss_train,0.11426
loss_validation,0.21872


AttributeError: 'NoneType' object has no attribute 'history'