# Amazon Fashion Binary

Il progetto è volto allo sviluppo di un modello ML per svolgere sentiment analysis su recensioni di utenti amazon al fine di classificarle come positive o negative.

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers

## Dataset
Il dataset è composto da 20000 recensioni positive e 20000 recensioni negative. Verranno provate varie configurazioni del dataset: la prima consiste nel dataset completo senza apportare nessuna modifica alle recensioni, nella seconda le recensioni verranno modificate in base alla loro lunghezza e alla dimensione del vocabolario.

In [2]:
#path della directory deel dataset
directory_path = "./reviews"

Il dataset viene dapprima diviso in training set (70%), validation set (15%) e test set (15%)

In [3]:
train_dataset, test_validation_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    directory_path,
    labels='inferred',
    label_mode='int',
    class_names=None,
    batch_size=32,
    max_length=None,
    shuffle=True,
    seed=42,
    validation_split=0.3,
    subset="both",
    follow_links=False,
    verbose=True
)
test_dataset, validation_dataset = tf.keras.utils.split_dataset(
    test_validation_dataset, left_size=0.5, shuffle=True, seed=42
)
X_train = train_dataset.map(lambda x, y: x)
X_test = test_dataset.map(lambda x, y: x)
X_validation = validation_dataset.map(lambda x, y: x)

Found 40000 files belonging to 2 classes.
Using 28000 files for training.
Using 12000 files for validation.


Vengono calcolate la media e la varianza della lunghezza delle recensioni: questi dati verranno utilizzati per scegliere le configurazioni del dataset su cui il modello opererà.

In [4]:
#lunghezza media delle recensioni
DATASET_SIZE = 40000
len_strings = []

for batch in X_train:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

for batch in X_test:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

for batch in X_validation:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

len_mean = sum(len_strings) / DATASET_SIZE
len_variance = sum((len_text - len_mean) ** 2 for len_text in len_strings) / DATASET_SIZE

print("Media: " + str(len_mean) + "\nVarianza: " + str(len_variance))

Media: 33.1579
Varianza: 1501.74106759


# Prima configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 30 e una lunghezza massima delle recensioni di 20 (sotto la media).

In [5]:
MAX_SEQUENCE_LENGTH = 20
VOCAB_SIZE = 30
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [6]:
model = models.Sequential()

In [7]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [8]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [9]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

Epoch 1/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.5307 - loss: 0.8173 - val_accuracy: 0.5470 - val_loss: 0.6885
Epoch 2/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.5447 - loss: 0.6921 - val_accuracy: 0.5511 - val_loss: 0.6853
Epoch 3/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.5538 - loss: 0.6870 - val_accuracy: 0.5531 - val_loss: 0.6829
Epoch 4/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.5693 - loss: 0.6820 - val_accuracy: 0.5617 - val_loss: 0.6802
Epoch 5/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.5705 - loss: 0.6782 - val_accuracy: 0.5652 - val_loss: 0.6787
Epoch 6/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.5721 - loss: 0.6780 - val_accuracy: 0.5683 - val_loss: 0.6771
Epoch 7/20
[1m875/8

In [10]:
loss, accuracy = model.evaluate(train_dataset)
print(f"Training set:\nLoss: {loss}, Accuracy: {accuracy}")
loss, accuracy = model.evaluate(test_dataset)
print(f"Test set:\nLoss: {loss}, Accuracy: {accuracy}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5939 - loss: 0.6671
Loss: 0.6672678589820862, Accuracy: 0.5885970592498779


## Seconda configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 50 e una lunghezza massima delle recensioni di 35 (circa la media).


In [11]:
MAX_SEQUENCE_LENGTH = 35
VOCAB_SIZE = 50
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [12]:
model = models.Sequential()

In [13]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [14]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [15]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

Epoch 1/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.5267 - loss: 0.8764 - val_accuracy: 0.5485 - val_loss: 0.6811
Epoch 2/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5434 - loss: 0.6953 - val_accuracy: 0.5516 - val_loss: 0.6795
Epoch 3/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5638 - loss: 0.6840 - val_accuracy: 0.5585 - val_loss: 0.6769
Epoch 4/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.5638 - loss: 0.6815 - val_accuracy: 0.5720 - val_loss: 0.6768
Epoch 5/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.5719 - loss: 0.6774 - val_accuracy: 0.5765 - val_loss: 0.6741
Epoch 6/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.5797 - loss: 0.6754 - val_accuracy: 0.5724 - val_loss: 0.6738
Epoch 7/20
[1m875/875[0

In [16]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6059 - loss: 0.6631
Loss: 0.6644010543823242, Accuracy: 0.6060505509376526


## Terza configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 100 e una lunghezza massima delle recensioni di 70 (sopra la media).


In [17]:
MAX_SEQUENCE_LENGTH = 70
VOCAB_SIZE = 100
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [18]:
model = models.Sequential()

In [19]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [20]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [21]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

Epoch 1/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.5244 - loss: 1.1398 - val_accuracy: 0.5421 - val_loss: 0.6854
Epoch 2/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5468 - loss: 0.7051 - val_accuracy: 0.5653 - val_loss: 0.6762
Epoch 3/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5481 - loss: 0.6872 - val_accuracy: 0.5662 - val_loss: 0.6763
Epoch 4/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5687 - loss: 0.6795 - val_accuracy: 0.5697 - val_loss: 0.6761
Epoch 5/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.5718 - loss: 0.6779 - val_accuracy: 0.5752 - val_loss: 0.6742
Epoch 6/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5778 - loss: 0.6735 - val_accuracy: 0.5725 - val_loss: 0.6744
Epoch 7/20
[1m875/875[0m 

In [22]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5926 - loss: 0.6694
Loss: 0.6711515188217163, Accuracy: 0.5866023898124695
