In [29]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [113]:
# Read the cleaned CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../data_clean.csv')

In [114]:
# Per-column check for missing values (none are expected).
df.isnull().any(axis=0)

eleve          False
classe         False
min            False
max            False
commentaire    False
dtype: bool

In [115]:
# Keep only the rows whose 'eleve' value contains a number; this drops
# textual markers such as "Abs" or "Disp".
# The pattern is a raw string so that `\d` and `\.` reach the regex engine
# unchanged instead of being parsed as (invalid) Python string escapes.
df = df[df['eleve'].str.contains(r'\d+[\.,]*\d*', regex=True)]

In [116]:
# Cast the 'eleve' column to float64, then print the frame summary.
df = df.astype(dtype={"eleve": "float64"})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7390 entries, 0 to 7492
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   eleve        7390 non-null   float64
 1   classe       7390 non-null   float64
 2   min          7390 non-null   float64
 3   max          7390 non-null   float64
 4   commentaire  7390 non-null   object 
dtypes: float64(4), object(1)
memory usage: 346.4+ KB


## Prepare Dataset

In [117]:
# Shuffle the data frame.
# A fixed seed makes the row order -- and therefore the train/test split
# taken from it further down -- reproducible across kernel restarts.
size = df.shape[0]
df = df.sample(n=size, random_state=42)

In [118]:
# Input column is "eleve"; target column is "commentaire".
df_features, df_labels = df["eleve"], df["commentaire"]

In [119]:
# Convert both pandas Series into NumPy arrays.
df_features, df_labels = map(np.array, (df_features, df_labels))

In [96]:
# Hold out the first TEST_SIZE rows for evaluation and train on the rest.
# `from_tensor_slices` yields one (feature, label) pair per row, whereas
# `from_tensors` wraps each whole array into a single dataset element, which
# leaves later `.shuffle()` / `.batch()` calls with only one element to handle.
# NOTE(review): the labels are still raw strings at this point; they
# presumably need to be tokenized into integer ids before training -- confirm.
TEST_SIZE = 1000
train_dataset = tf.data.Dataset.from_tensor_slices((
  df_features[TEST_SIZE:],
  df_labels[TEST_SIZE:]
))
test_dataset = tf.data.Dataset.from_tensor_slices((
  df_features[:TEST_SIZE],
  df_labels[:TEST_SIZE]
))

In [106]:
# Stream the same CSV file as batches of 32 rows, with the "commentaire"
# column split off as the label.
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern="../../data_clean.csv",
    label_name="commentaire",
    batch_size=32,
)


In [110]:
# Shuffle the training set through a 1024-element buffer, then batch both
# splits by 32. The names are reassigned, so re-running this cell batches
# the already-batched datasets again.
train_dataset = train_dataset.shuffle(buffer_size=1024)
train_dataset = train_dataset.batch(batch_size=32)
test_dataset = test_dataset.batch(batch_size=32)

In [122]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the checkpoint below and reuse its
# end-of-sequence token as the padding token.
tokenizer_checkpoint = "antoiloui/belgpt2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [134]:
from transformers import TFGPT2LMHeadModel

# Load the pretrained GPT-2 language-model weights from the checkpoint below.
# Re-running this cell replaces `model` with a fresh, uncompiled instance.
model_checkpoint = "antoiloui/belgpt2"
model = TFGPT2LMHeadModel.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at antoiloui/belgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [93]:
from transformers import TFTrainer, TFTrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TFTrainingArguments(
    # Output directory; its existing content is overwritten.
    output_dir="./gpt2-comments",
    overwrite_output_dir=True,
    # Schedule: 3 training epochs, 500 warm-up steps for the LR scheduler.
    num_train_epochs=3,
    warmup_steps=500,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Update steps between two evaluations / between two model saves.
    # NOTE(review): eval_steps may only take effect when an evaluation
    # strategy is also configured -- confirm for the installed version.
    eval_steps=400,
    save_steps=800,
    prediction_loss_only=True,
)

# Bind the model, the arguments and both dataset splits together.
# NOTE(review): this uses whatever `model`, `train_dataset` and
# `test_dataset` currently hold in the kernel.
trainer = TFTrainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at antoiloui/belgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [129]:
# Compile the language model for Keras training.
# Next-token prediction is a classification over the vocabulary: targets are
# integer token ids and the model emits raw logits, so the loss is sparse
# categorical cross-entropy on logits (mean squared error does not apply).
# The learning rate is lowered from Adam's default to a value commonly used
# for fine-tuning pretrained transformers.
# Run this cell after every (re)load of `model`: a freshly loaded model is
# not compiled and `fit` then raises "You must compile your model".
# NOTE(review): depending on the installed transformers version the Keras
# model may expose more outputs than the logits; confirm the loss is applied
# to the logits only.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [135]:
# Fine-tune on one "<eleve value> <commentaire>" text per row.
# The model consumes integer token ids, so the float values in `df_features`
# and the strings in `df_labels` cannot be handed to `fit` directly: each row
# is rendered as text, terminated with the end-of-sequence token, and
# tokenized to a padded id matrix.
# NOTE(review): assumes `model` was compiled with a loss that takes integer
# token ids as targets and logits as predictions -- confirm.
MAX_TOKENS = 128  # sequences longer than this are truncated
texts = [
    f"{grade} {comment}{tokenizer.eos_token}"
    for grade, comment in zip(df_features, df_labels)
]
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=MAX_TOKENS,
    return_tensors="np",
)
token_ids = encoded["input_ids"]
token_mask = encoded["attention_mask"].astype("float32")

# Next-token objective: the ids at positions [0, T-1) predict the ids at
# positions [1, T). The shifted attention mask is used as a per-token sample
# weight so that padding positions do not contribute to the loss.
model.fit(
  token_ids[:, :-1],
  token_ids[:, 1:],
  sample_weight=token_mask[:, 1:],
  batch_size=64,
  validation_split=0.2,
  epochs=3)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [94]:
# Start fine-tuning through the `trainer` object.
# NOTE(review): the recorded run below fails with
# "TypeError: Expected string, got -100 of type 'int'": the trainer compares
# the batch labels against -100 and the labels it received are strings.
# The datasets presumably need integer (tokenized) labels -- confirm.
trainer.train()

TypeError: in user code:

    /usr/local/lib/python3.9/site-packages/transformers/trainer_tf.py:703 distributed_training_steps  *
        nb_instances_in_batch = self._compute_nb_instances(batch)
    /usr/local/lib/python3.9/site-packages/transformers/trainer_tf.py:715 _compute_nb_instances  *
        nb_instances = tf.reduce_sum(tf.cast(labels != -100, dtype=tf.int32))
    /usr/local/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1837 tensor_not_equals
        self, other = maybe_promote_tensors(self, other)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1202 maybe_promote_tensors
        ops.convert_to_tensor(tensor, dtype, name="x"))
    /usr/local/lib/python3.9/site-packages/tensorflow/python/profiler/trace.py:163 wrapped
        return func(*args, **kwargs)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:1566 convert_to_tensor
        ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/tensor_conversion_registry.py:52 _default_conversion_function
        return constant_op.constant(value, dtype, name=name)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/constant_op.py:264 constant
        return _constant_impl(value, dtype, shape, name, verify_shape=False,
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/constant_op.py:281 _constant_impl
        tensor_util.make_tensor_proto(
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/tensor_util.py:457 make_tensor_proto
        _AssertCompatible(values, dtype)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/framework/tensor_util.py:336 _AssertCompatible
        raise TypeError("Expected %s, got %s of type '%s' instead." %

    TypeError: Expected string, got -100 of type 'int' instead.
