In [2]:
%load_ext autoreload
%autoreload 2
import datasets
import utils
from transformers import BertTokenizer
import transformers
import accelerate
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kadem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kadem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
!pip install accelerate>=0.21.0
!pip install transformers

Collecting transformers
  Using cached transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.41.0-py3-none-any.whl (9.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.41.0


### Charger le dataset

In [3]:
wikiqa_data = datasets.load_from_disk("wikiqa")
test_data_set = wikiqa_data["test"]
train_data_set = wikiqa_data["train"]
validation_data_set = wikiqa_data["validation"]


In [4]:
wikiqa_data

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 6165
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2733
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 20360
    })
})

### Prétraitement des données

In [5]:
def preprocess_examples(examples):
    examples['question'] = [utils.preprocess(q) for q in examples['question']]
    examples['answer'] = [utils.preprocess(a) for a in examples['answer']]
    return examples

train_data_set = train_data_set.map(preprocess_examples, batched=True)
validation_data_set = validation_data_set.map(preprocess_examples, batched=True)
test_data_set = test_data_set.map(preprocess_examples, batched=True)

Map: 100%|██████████| 20360/20360 [00:00<00:00, 53328.79 examples/s]
Map: 100%|██████████| 2733/2733 [00:00<00:00, 44762.78 examples/s]
Map: 100%|██████████| 6165/6165 [00:00<00:00, 54940.21 examples/s]


### Tokenisation et préparation des tenseurs

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['question'], examples['answer'], padding='max_length', truncation=True)

# Appliquer la fonction de tokenisation aux datasets
tokenized_train = train_data_set.map(tokenize_function, batched=True)
tokenized_validation = validation_data_set.map(tokenize_function, batched=True)
tokenized_test = test_data_set.map(tokenize_function, batched=True)

# Préparation des tenseurs pour l'entraînement (Transformation en tenseurs PyTorch)
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_validation.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 20360/20360 [00:10<00:00, 1882.61 examples/s]
Map: 100%|██████████| 2733/2733 [00:01<00:00, 1889.00 examples/s]
Map: 100%|██████████| 6165/6165 [00:03<00:00, 1968.82 examples/s]


### Définir et entraîner le modèle

In [8]:
# Initialiser le modèle
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import torch
print(torch.cuda.is_available())  # Cela devrait retourner True si un GPU est disponible


False


In [12]:
!pip uninstall torch
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118


In [None]:
!nvcc --version


In [9]:
# Définir les arguments d'entraînement
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Utilisez eval_strategy au lieu de evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
)

# Entraîner le modèle
trainer.train()

  0%|          | 18/3819 [15:01<67:52:10, 64.28s/it]

KeyboardInterrupt: 

In [55]:
!pip uninstall accelerate transformers

^C


In [None]:
!pip install accelerate>=0.21.0 transformers

In [None]:
!pip cache purge


### Évaluation et sauvegarde

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

test_results = trainer.predict(tokenized_test)
print(test_results)

model.save_pretrained('./models/wikiqa-bert-model')
tokenizer.save_pretrained('./models/wikiqa-bert-tokenizer')

In [42]:
import transformers
import accelerate

print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)


Transformers version: 4.41.0
Accelerate version: 0.30.1


In [44]:
!pip uninstall transformers accelerate


^C


In [None]:
# Créer un nouvel environnement virtuel
python -m venv new_env
# Activer l'environnement virtuel (Windows)
new_env\Scripts\activate
# Activer l'environnement virtuel (macOS/Linux)
source new_env/bin/activate

# Installer les bibliothèques dans l'environnement isolé
pip install transformers[torch]
pip install accelerate


In [48]:
print(transformers.__version__)
print(accelerate.__version__)

4.41.0
0.30.1
