In [1]:
import pandas as pd
import tensorflow as tf

In [4]:
#df = pd.read_csv('../../data_clean.csv')
df = pd.read_csv('../ranking/custom_star.csv')
df = df.loc[df['custom_star'] == 3]

## Create Dataset

In [6]:
def train_test_split(dataframe, ratio):
  # test_length = int(df.shape[0] * ratio)
  # test = dataframe['commentaire'][0:test_length]
  # train = dataframe['commentaire'][test_length:]
  # return train, test

  # Shuffle
  size = dataframe.shape[0]
  test_length = int(size * ratio)
  shuffled_data = dataframe.sample(size)

  # Split
  test = shuffled_data['commentaire'][0:test_length]
  train = shuffled_data['commentaire'][test_length:]
  return train, test


def build_text_files(data_list, dest_path):
  # f = open(dest_path, 'w')
  # f.write(' '.join(data_list))
  processed_list = list(map(lambda x: x.strip(), data_list ))
  with open(dest_path, 'w') as f:
    for comment in processed_list:
      f.write(comment + '\n')

train, test = train_test_split(df, ratio=0.15) 


build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 2553
Test dataset length: 450


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("antoiloui/belgpt2")

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [9]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("antoiloui/belgpt2")


training_args = TrainingArguments(
    output_dir="./gpt2-custom-score-3", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [20]:
trainer.train()

***** Running training *****
  Num examples = 994
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 96
100%|██████████| 96/96 [57:43<00:00, 26.93s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 96/96 [57:43<00:00, 36.08s/it]

{'train_runtime': 3463.844, 'train_samples_per_second': 0.861, 'train_steps_per_second': 0.028, 'train_loss': 2.4611204465230307, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=2.4611204465230307, metrics={'train_runtime': 3463.844, 'train_samples_per_second': 0.861, 'train_steps_per_second': 0.028, 'train_loss': 2.4611204465230307, 'epoch': 3.0})

In [21]:
trainer.save_model()

Saving model checkpoint to ./gpt2-custom-score-3
Configuration saved in ./gpt2-custom-score-3/config.json
Model weights saved in ./gpt2-custom-score-3/pytorch_model.bin


In [7]:
from transformers import pipeline

elcommentator = pipeline('text-generation', model='./gpt2-comments', tokenizer='antoiloui/belgpt2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
elcommentator('Je ne suis')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Je ne suis pas une élève sérieuse et attentive. Un bon trimestre. Un bon trimestre. Un bon'}]