In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
df = pd.read_csv('../../data_clean.csv')

In [53]:
# Eviter les commentaires trop longs
mask = df['commentaire'].apply(len) >= 500
df = df[~mask]

## Create Dataset

In [3]:
def train_test_split(data, ratio):
  # Shuffle
  size = data.shape[0]
  test_length = int(size * ratio)
  shuffled_data = data.sample(size)

  # Split
  test = shuffled_data['commentaire'][0:test_length]
  train = shuffled_data['commentaire'][test_length:]
  return train, test

def build_text_files(data_list, dest_path):
  processed_list = list(map(lambda x: x.strip() + "<|endoftext|>", data_list ))
  with open(dest_path, 'w') as f:
    for comment in processed_list:
      f.write(comment + '\n')

train, test = train_test_split(df, ratio=0) 

build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 6370
Test dataset length: 1123


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("antoiloui/belgpt2")

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [81]:
tokenizer('<|endoftext|>')

{'input_ids': [50257], 'attention_mask': [1]}

In [82]:
tokenizer.eos_token

'<|endoftext|>'

In [5]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [6]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("antoiloui/belgpt2")


training_args = TrainingArguments(
    output_dir="./gpt2-comments", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [7]:
trainer.train()

***** Running training *****
  Num examples = 1414
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 135
  0%|          | 0/135 [00:00<?, ?it/s]

IndexError: index out of range in self

In [23]:
trainer.save_model()

Saving model checkpoint to ./gpt2-comments
Configuration saved in ./gpt2-comments/config.json
Model weights saved in ./gpt2-comments/pytorch_model.bin


In [25]:
from transformers import pipeline

elcommentator = pipeline('text-generation', model='./gpt2-comments', tokenizer='antoiloui/belgpt2')

loading configuration file ./gpt2-comments/config.json
Model config GPT2Config {
  "_name_or_path": "antoiloui/belgpt2",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "use_cache": true,
  "vocab_size": 50257
}

loading configuration file ./gpt2-comments/config.json
Model config GPT2Config {
  "_name_or_path": "antoiloui/belg

In [55]:
elcommentator('Je serai toujours ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Je serai toujours  <name> . Un très bon trimestre. Un très bon trimestre.'}]