In [28]:
from datasets import Dataset
import json
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer

data_file = 'dataset\\fr_climate-fever-dataset-r1_period_maj_opus-mt-tc-big-en-fr_v2-unicode.jsonl'  

claims = []
labels = []

label_mapping = {
    "SUPPORTS": "SUPPORTS",
    "REFUTES": "REFUTES",
    "NOT_ENOUGH_INFO": "UNDECIDED",
    "DISPUTED": "UNDECIDED"
}

with open(data_file, 'r', encoding='utf-8') as file:
    for line in file:
        example = json.loads(line)
        
        claims.append(example['claim'])
        labels.append(label_mapping[example['claim_label']])
        

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


dataset = Dataset.from_dict({
    'claim': claims,
    'label': encoded_labels
})

In [29]:
saved_model_path = "./climate_model" 

model = AutoModelForSequenceClassification.from_pretrained(saved_model_path, num_labels=len(label_encoder.classes_))
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

print("Model and tokenizer loaded successfully.")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./climate_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully.


In [30]:

def tokenize_function(examples):
    return tokenizer(examples['claim'], truncation=True, padding='longest')

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1535/1535 [00:00<00:00, 20675.86 examples/s]


In [31]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)  # 90% train, 10% validation
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

print('train_dataset :', train_dataset)

train_dataset : Dataset({
    features: ['claim', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1381
})


In [32]:
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=100, 
    eval_strategy="epoch",
    logging_dir="./logs",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_strategy="epoch",
    learning_rate=1e-4,
)

from torch import cuda
print(f"Using device: {cuda.get_device_name(0) if cuda.is_available() else 'cpu'}")

early_stopping = EarlyStoppingCallback(early_stopping_patience=6)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)


Using device: NVIDIA GeForce RTX 4060 Laptop GPU


  trainer = Trainer(


In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0683,1.018292
2,1.0403,1.022303
3,1.0491,1.017779
4,1.0476,1.020188
5,1.0443,1.014204
6,1.0395,1.030427
7,1.0481,1.014514
8,1.0371,1.044181
9,1.047,1.017454
10,1.0425,1.025799


TrainOutput(global_step=1903, training_loss=1.0457086287481936, metrics={'train_runtime': 424.4997, 'train_samples_per_second': 325.324, 'train_steps_per_second': 40.754, 'total_flos': 983625873080004.0, 'train_loss': 1.0457086287481936, 'epoch': 11.0})

In [34]:
model_save_path = "./climate_fever_model" 
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")


Model saved to ./climate_fever_model


In [35]:
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')


3
