In [46]:
import tensorflow as tf
from tensorflow.keras import utils
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn import svm
from tensorflow.keras import layers
from tensorflow.keras import losses
import os

In [13]:
#chargement des donnees d'entrainement, de test et de validation
batch_size = 32
seed = 42
raw_train_ds = preprocessing.text_dataset_from_directory(
    "./datas/train",
    batch_size=100,
    validation_split=0.2,
    subset='training',
    seed=seed)
raw_val_ds = preprocessing.text_dataset_from_directory(
    "./datas/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)
raw_test_ds = preprocessing.text_dataset_from_directory(
    "./datas/test", batch_size=batch_size)

Found 4000 files belonging to 2 classes.
Using 3200 files for training.
Found 4000 files belonging to 2 classes.
Using 800 files for validation.
Found 1574 files belonging to 2 classes.


In [37]:
#creation de la couche de vectorisaton des donnees
VOCAB_SIZE = 10000
int_vectorize_layer = TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int')
#recuperation des donne sans label
train_text = raw_train_ds.map(lambda text, labels: text)
#creation des index de chaine vers les entiers 
int_vectorize_layer.adapt(train_text)
def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label

In [15]:
#application de la couche de vectorisation aux donnees de train, validation et de test
int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)


In [18]:
# optimisation des performances 
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
      return dataset.cache().prefetch(buffer_size=AUTOTUNE)
int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)

In [32]:
x_train = list()
y_train = list()
x_val = list()
y_val = list()
x_test = list()
y_test = list()
for i, j in int_train_ds:
    x_train.append(i.numpy())
    y_train.append(j.numpy())
for i, j in int_val_ds:
    x_val.append(i.numpy())
    y_val.append(j.numpy())
for i, j in int_test_ds:
    x_test.append(i.numpy())
    y_test.append(j.numpy())

In [42]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
          layers.Embedding(vocab_size, 64, mask_zero=True),
          layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
          layers.GlobalMaxPooling1D(),
          layers.Dense(num_labels)
      ])
    return model

In [48]:
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=4)
int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
history = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
print(int_model.summary())
int_loss, int_accuracy = int_model.evaluate(int_test_ds)


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          640064    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 64)          20544     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
Total params: 660,868
Trainable params: 660,868
Non-trainable params: 0
_________________________________________________________________
None
