In [1]:
import os

def load_text_files(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Entries are visited in sorted filename order so the returned list has the
    same order on every run and platform (``os.listdir`` alone gives no
    ordering guarantee). Sub-directories are skipped, not recursed into.

    Args:
        folder_path: Path of the directory that holds the text files.

    Returns:
        A ``(texts, total_length)`` tuple where ``texts`` is the list of file
        contents and ``total_length`` is the sum of ``len(content)`` over all
        files (i.e. a character count, not a byte count).

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    # Sorting makes downstream steps that depend on row order reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, "r", encoding="utf-8") as file:
                texts.append(file.read())
    total_length = sum(len(content) for content in texts)
    return texts, total_length

# Build the corpus path with os.path.join so it resolves on any OS; a literal
# backslash is only a path separator on Windows.
data_path = os.path.join("dataset", "fr")
texts, total_length = load_text_files(data_path)
print(f"total_length: {total_length}")
print(f"Loaded {len(texts)} files from {data_path}")


total_length: 643863
Loaded 579 files from dataset\fr


In [2]:
from datasets import Dataset

# Wrap the loaded strings in a Dataset with a single "text" column.
columns = {"text": texts}
dataset = Dataset.from_dict(columns)

# Show the dataset summary, then one record as a quick sanity check.
print(dataset)
print(dataset[1])


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text'],
    num_rows: 579
})
{'text': "S'incrivant dans la tendance au réchauffement à long terme enregistrée depuis l'époque préindustrielle, la température moyenne à la surface du globe observée pour la décennie 2006-2015 a été supérieure de 0,87 °C (avec une fourchette probable comprise entre 0,75 °C et 0,99 °C) à la température moyenne pour la période 1850-1900 (degré de confiance très élevé). Le réchauffement planétaire anthropique estimé correspond au niveau de réchauffement observé à ± 20 % près (fourchette probable) et augmente actuellement de 0,2 °C (fourchette probable comprise entre 0,1 °C et 0,3 °C) par décennie sous l'effet des émissions passées et présentes (degré de confiance élevé)."}


In [3]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_function(examples):
    """Tokenize a batch of examples for masked-language-model training.

    Padding is intentionally NOT applied here. With ``batched=True`` the map
    call hands this function many rows at once, so ``padding="longest"`` would
    stretch every row to the longest sequence in that map batch. The
    ``DataCollatorForLanguageModeling`` used for training pads each training
    batch dynamically instead, which avoids computing over needless pad tokens.

    Args:
        examples: A batch from ``Dataset.map`` with a ``"text"`` list.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), truncated
        to the tokenizer's maximum length since no ``max_length`` is given.
    """
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

print(tokenized_dataset)

Map: 100%|██████████| 579/579 [00:00<00:00, 4135.94 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 579
})





In [4]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked language modelling with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


In [5]:
# 90% train / 10% validation.
# - A fixed seed makes the split (and therefore the reported validation loss)
#   reproducible across runs.
# - The result is bound to a new name instead of overwriting
#   `tokenized_dataset`, so this cell can be re-run without failing on an
#   already-split object.
SPLIT_SEED = 42
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [6]:
from transformers import AutoModelForMaskedLM, EarlyStoppingCallback, TrainingArguments, Trainer
from torch import cuda

# Start from the pretrained checkpoint; the masked-LM head is what gets fine-tuned.
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

training_args = TrainingArguments(
    output_dir="./results",                          # Output directory for model checkpoints and logs
    per_device_train_batch_size=8,                   # Batch size per device during training
    num_train_epochs=100,                            # Upper bound; early stopping below normally ends training sooner
    eval_strategy="epoch",                           # Evaluate at the end of every epoch
    logging_dir="./logs",                            # Directory for logging
    save_strategy="epoch",                           # Checkpoint every epoch (must match eval_strategy for best-model loading)
    save_total_limit=2,                              # Cap checkpoints kept on disk; the best one is still retained
    load_best_model_at_end=True,                     # Load the best model when training finishes
    metric_for_best_model="eval_loss",               # Metric to use for selecting the best model
    greater_is_better=False,                         # Lower eval_loss is better (made explicit)
    logging_strategy="epoch",                        # Log training information at each epoch
)

print(f"Using device: {cuda.get_device_name(0) if cuda.is_available() else 'cpu'}")

# Stop when eval_loss has not improved for 6 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=6)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,                   # Replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
    callbacks=[early_stopping],
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Using device: NVIDIA GeForce RTX 4060 Laptop GPU


In [7]:
# Run fine-tuning and keep the returned summary so it can be inspected later;
# leaving it as the last expression still displays it as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.8708,0.589105
2,0.7726,0.607409
3,0.714,0.672737
4,0.678,0.648244
5,0.6431,0.628716
6,0.5962,0.598188
7,0.5866,0.677046


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


TrainOutput(global_step=462, training_loss=0.6944627472848603, metrics={'train_runtime': 3888.2085, 'train_samples_per_second': 13.399, 'train_steps_per_second': 1.697, 'total_flos': 962366914897920.0, 'train_loss': 0.6944627472848603, 'epoch': 7.0})

In [8]:
# Persist the model and its tokenizer side by side in one directory.
output_dir = "./climate_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./climate_model


In [9]:
from transformers import AutoModelForMaskedLM, AutoTokenizer  

# Reload the tokenizer and the masked-LM model from `output_dir`.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForMaskedLM.from_pretrained(output_dir)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [10]:
import torch  # imported here because this cell needs the no-grad context manager

text = "Climate change is a significant global issue."

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Inference only: force eval mode (disables dropout) and skip autograd
# bookkeeping so the forward pass does not build a gradient graph.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)


MaskedLMOutput(loss=None, logits=tensor([[[ 6.1843e+01, -4.4388e-02,  3.7883e+01,  ...,  2.4577e+01,
           1.3972e+01,  1.9859e+01],
         [ 1.2511e+01, -9.6129e-01,  4.2237e+01,  ...,  1.9753e+01,
           1.0815e+01,  2.1118e+01],
         [ 2.0193e+01, -1.5205e+00,  6.0594e+01,  ...,  4.9096e+01,
           1.7803e+01,  3.3202e+01],
         ...,
         [ 2.0529e+01, -1.1429e+00,  4.6200e+01,  ...,  3.5253e+01,
           1.2640e+01,  2.5840e+01],
         [ 3.1179e+01, -1.2390e+00,  6.4606e+01,  ...,  4.2863e+01,
           1.8936e+01,  3.5799e+01],
         [ 3.5556e+01, -3.1919e-01,  5.0570e+01,  ...,  3.1470e+01,
           1.6240e+01,  2.4642e+01]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)
