# Amazon Fashion Binary

Il progetto è volto allo sviluppo di un modello ML per svolgere sentiment analysis su recensioni di utenti amazon al fine di classificarle come positive o negative.

In [43]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers

## Dataset
Il dataset è composto da 20000 recensioni positive e 20000 recensioni negative. Verranno provate varie configurazioni del dataset: la prima consiste nel dataset completo senza apportare nessuna modifica alle recensioni, nella seconda le recensioni verranno modificate in base alla loro lunghezza e alla dimensione del vocabolario.

In [44]:
#path della directory deel dataset
directory_path = "./reviews"

Il dataset viene dapprima diviso in training set (70%), validation set (15%) e test set (15%)

In [45]:
train_dataset, test_validation_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    directory_path,
    labels='inferred',
    label_mode='int',
    class_names=None,
    batch_size=32,
    max_length=None,
    shuffle=True,
    seed=42,
    validation_split=0.3,
    subset="both",
    follow_links=False,
    verbose=True
)
test_dataset, validation_dataset = tf.keras.utils.split_dataset(
    test_validation_dataset, left_size=0.5, shuffle=True, seed=42
)
X_train = train_dataset.map(lambda x, y: x)
X_test = test_dataset.map(lambda x, y: x)
X_validation = validation_dataset.map(lambda x, y: x)

Found 40000 files belonging to 2 classes.
Using 28000 files for training.
Using 12000 files for validation.


Vengono calcolate la media e la varianza della lunghezza delle recensioni: questi dati verranno utilizzati per scegliere le configurazioni del dataset su cui il modello opererà.

In [56]:
#lunghezza media delle recensioni
DATASET_SIZE = 40000
len_strings = []

for batch in X_train:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

for batch in X_test:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

for batch in X_validation:
    n = batch.numpy()
    for data in n:
        split = data.split()
        len_strings.append(len(split))

len_mean = sum(len_strings) / DATASET_SIZE
len_variance = sum((len_text - len_mean) ** 2 for len_text in len_strings) / DATASET_SIZE

print("Media: " + str(len_mean) + "\nVarianza: " + str(len_variance))

Media: 33.1579
Varianza: 1501.74106759


# Prima configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 30 e una lunghezza massima delle recensioni di 20 (sotto la media).

In [47]:
MAX_SEQUENCE_LENGTH = 20
VOCAB_SIZE = 30
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [48]:
model = models.Sequential()

In [49]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [50]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [51]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

Epoch 1/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.5111 - loss: 1.3715 - val_accuracy: 0.5446 - val_loss: 0.6904
Epoch 2/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.5450 - loss: 0.7135 - val_accuracy: 0.5369 - val_loss: 0.6854
Epoch 3/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.5482 - loss: 0.6902 - val_accuracy: 0.5237 - val_loss: 0.6826
Epoch 4/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.5575 - loss: 0.6829 - val_accuracy: 0.5460 - val_loss: 0.6792
Epoch 5/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.5576 - loss: 0.6825 - val_accuracy: 0.5613 - val_loss: 0.6764
Epoch 6/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.5620 - loss: 0.6782 - val_accuracy: 0.5625 - val_loss: 0.6759
Epoch 7/20
[1m875/875[0m 

In [52]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5811 - loss: 0.6697
Loss: 0.6683364510536194, Accuracy: 0.5822805762290955


## Seconda configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 50 e una lunghezza massima delle recensioni di 35 (circa la media).


In [None]:
MAX_SEQUENCE_LENGTH = 35
VOCAB_SIZE = 50
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [None]:
model = models.Sequential()

In [None]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [None]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [None]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

In [None]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

## Terza configurazione

Nella tokenizzazione delle recensione vengono scelte una size per il vocabolario pari a 100 e una lunghezza massima delle recensioni di 70 (sopra la media).


In [None]:
MAX_SEQUENCE_LENGTH = 70
VOCAB_SIZE = 100
vectorizationLayer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizationLayer.adapt(X_train)

In [None]:
model = models.Sequential()

In [None]:
# Input - Layer
model.add(vectorizationLayer)
# Hidden - Layers
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()

In [None]:
model.compile(
optimizer = "adam",
loss = "binary_crossentropy",
metrics = ["accuracy"]
)

In [None]:
results = model.fit(
train_dataset,
epochs= 20,
batch_size = 32,
validation_data = validation_dataset
)

In [None]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")