# Workshop 3: Sentiment analysis
In this workshop we will learn how to train a neural network with text
as input to classify IMDB reviews as positive or negative
([info of the data](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)). The main blocks of the workshop are:

1. Get the data from Keras repository and visualize it.
2. Pre-process the data.
3. Design the network.
4. Train the network.
5. Evaluate the model.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import pad_sequences

# 1. Get the data from Keras repository and visualize it.

In [None]:
# Load the full dataset.
# The reserved-index settings are the defaults of
# keras.datasets.imdb.load_data; they are named explicitly so that later
# cells can reuse them when decoding reviews back into words.
start_char = 1   # index that marks the beginning of a sequence
oov_char = 2     # index substituted for out-of-vocabulary words
index_from = 3   # actual words are indexed from this value upwards
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from,
)

In [None]:
# Display the first training review in its raw form: a sequence of word indices
X_train[0]

In [None]:
# Retrieve the dictionary that maps each word to its integer index
word_index = imdb.get_word_index()

# Invert the mapping (index -> word), shifting every index by `index_from`
# so that it lines up with the values stored in `X_train`
inverted_word_index = {
    i + index_from: word for word, i in word_index.items()
}

# Add the reserved `start_char` and `oov_char` markers to the mapping
inverted_word_index[start_char] = "[START]"
inverted_word_index[oov_char] = "[OOV]"

In [None]:
# Look up the word stored under index 20 (indices are shifted by `index_from`)
inverted_word_index[20]

In [None]:
# Translate the first training review from word indices back into text
first_review_words = [inverted_word_index[idx] for idx in X_train[0]]
decoded_sequence = " ".join(first_review_words)
decoded_sequence

In [None]:
# Count the entries of the index-to-word mapping: every word of the vocabulary
# plus the "[START]" and "[OOV]" markers that were added to it
len(inverted_word_index)

In [None]:
# Report the shapes of the training inputs and labels, one per line
print("Training data: ", X_train.shape, y_train.shape, sep="\n")

In [None]:
# Report the shapes of the test inputs and labels, one per line
print("Test data: ", X_test.shape, y_test.shape, sep="\n")

In [None]:
# List the distinct label values found in each split (train first, then test)
print("Classes: ")
for labels in (y_train, y_test):
    print(np.unique(labels))

In [None]:
# Evaluate the class distribution: number of reviews per label, printed in
# the order train/0, train/1, test/0, test/1
print('Class distribution')
for labels in (y_train, y_test):
    for label in (0, 1):
        print(np.sum(labels == label))

In [None]:
# Summarize the review lengths of the training set
print("Review length: ")
train_lengths = list(map(len, X_train))
# The value in parentheses is the standard deviation of the lengths
print(f"Mean {np.mean(train_lengths):.2f} words ({np.std(train_lengths):f})")
# Box plot of the number of words per review
plt.boxplot(train_lengths)
plt.show()

In [None]:
# Summarize the review lengths of the test set
print("Review length: ")
test_lengths = list(map(len, X_test))
# The value in parentheses is the standard deviation of the lengths
print(f"Mean {np.mean(test_lengths):.2f} words ({np.std(test_lengths):f})")
# Box plot of the number of words per review
plt.boxplot(test_lengths)
plt.show()

# 2. Pre-process the data.

In [None]:
# Reload the dataset, this time restricted to the `top_words` most common words
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=top_words,
)

In [None]:
# Set the length of the sentences to a fixed size of `max_words`;
# the same transformation is applied to both splits
max_words = 500
X_train, X_test = (
    pad_sequences(split, maxlen=max_words) for split in (X_train, X_test)
)

In [None]:
# Summarize the review lengths of the training set again, now after padding
print("Review length: ")
padded_lengths = list(map(len, X_train))
# The value in parentheses is the standard deviation of the lengths
print(f"Mean {np.mean(padded_lengths):.2f} words ({np.std(padded_lengths):f})")
# Box plot of the number of words per review
plt.boxplot(padded_lengths)
plt.show()

# 3. Design the network.

In [None]:
# Create the model
model = Sequential()
# Declare the input shape explicitly: the `input_length` argument of
# `Embedding` is deprecated in recent Keras releases, and an explicit
# `Input` lets `model.summary()` report shapes before training.
model.add(Input(shape=(max_words,)))
# Map each of the `top_words` word indices to a 32-dimensional vector
model.add(Embedding(top_words, 32))
# Flatten the per-word embeddings of a review into one feature vector
model.add(Flatten())
model.add(Dense(256, activation='relu'))
# Single sigmoid unit: one value in (0, 1) for the binary label
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Show a summary of the model (its layers and their parameter counts)
model.summary()

In [None]:
# Compile the model for binary classification, tracking accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 4. Train the network.

In [None]:
# Fit the model; the test split is passed as validation data, so its
# metrics are recorded in `history` alongside the training ones
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
    verbose=2,
)

In [None]:
# Plot training and validation accuracy, one curve per recorded metric
for metric, label in (('accuracy', 'Train'), ('val_accuracy', 'Val')):
    plt.plot(history.history[metric], label=label)
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
# plt.ylim(0, 1)
plt.show()

# 5. Evaluate the model.

In [None]:
# Final evaluation of the model on the test split
scores = model.evaluate(X_test, y_test, verbose=0)
# Index 1 of `scores` is read as the accuracy and printed as a percentage
print(f"Accuracy: {scores[1] * 100:.2f}%")

# Exercise 1: Use what you learned in the lesson to reduce the overfitting

In [None]:
# Re-load the dataset just with the 5000 most common words


In [None]:
# Set the length of the sentences to a fixed size


In [None]:
# Create the model


In [None]:
# Show a summary of the model


In [None]:
# Compile the model


In [None]:
# Fit the model


In [None]:
# Plot training and validation accuracy
