In [None]:
import warnings
# Suppress every warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)
import matplotlib.pyplot as plt # to plot
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D, AveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping
!pip install keras-tuner # Install keras-tuner
import keras_tuner as kt # Now import keras_tuner

In [None]:
from sklearn.metrics import classification_report
from google.colab import drive
# Mount Google Drive at the relative path 'gdrive'; the dataset is read from
# underneath this directory later in the notebook.
drive.mount('gdrive')

In [None]:
def leeDatos(path="gdrive/MyDrive/Colab Notebooks/archive1.zip"):
    """Load the zipped CSV dataset into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the zip archive that holds the CSV file. Defaults to the
        copy stored under the mounted Google Drive directory, so existing
        calls without arguments keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first row of the CSV is used as the header.
    """
    # compression is stated explicitly so the archive is always treated as a zip,
    # whatever its file extension.
    return pd.read_csv(path, header=0, compression='zip')

In [None]:
def encodeData(dataSet=0):
    """Build the binary-labelled frame used for training.

    Rows containing any missing value are discarded, and a ``sent_analysis``
    label is derived from the ``sentiment`` column: 1 for "positive" and 0 for
    every other value. "neutral" rows are therefore kept and counted as
    negative, not removed.

    The input frame is not modified, so the cell calling this function can be
    re-run and the raw data stays available for inspection.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Must contain the columns 'text' and 'sentiment'. The default of 0 is
        only a placeholder; calling without a DataFrame fails.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'text' and 'sent_analysis', indexed by
        the original index of the rows that survived the missing-value filter.
    """
    # dropna() without inplace returns a new frame and leaves the caller's
    # data untouched.
    completos = dataSet.dropna()

    # Only "positive" maps to 1; "neutral" and "negative" both map to 0.
    etiquetas = np.where(completos['sentiment'] == "positive", 1, 0)

    misDatos = completos[['text']].assign(sent_analysis=etiquetas)

    return misDatos

In [None]:
# Load the raw dataset and show it.
datos = leeDatos()
print(datos)
print("\n")

# Reduce the data to the text column plus a 0/1 sentiment label.
encodeDatos = encodeData(datos)
print(encodeDatos)
print("\n")
# Mean of the 0/1 label, i.e. the share of rows labelled 1.
print(encodeDatos['sent_analysis'].mean())
print("\n")

In [None]:
# Fraction of the rows held out as the test set.
ts_size = .25
# random_state is fixed so the split is the same on every run.
trainSet, testSet = train_test_split(encodeDatos, test_size=ts_size, random_state=0)
print(trainSet)
print('trainSet shape: ', trainSet.shape)
print(testSet)
print('testSet shape: ', testSet.shape)
print("\n")

In [None]:
# Text preprocessing settings.
# presumably the vocabulary cap for the tokenizer; model_builder hard-codes
# the same value for its embedding layer, so keep the two in sync.
vocab_size = 15000
# Every sequence is padded or truncated to this many tokens.
max_length = 120
# 'post' = truncate and pad at the end of the sequence.
trunc_type='post'
padding_type='post'
# presumably the placeholder token for out-of-vocabulary words (TODO confirm
# once the tokenizer is defined).
oov_tok = "<OOV>"

# Separate the text inputs from the 0/1 labels for each split.
training_sentences= trainSet['text']
training_labels = trainSet['sent_analysis']
testing_sentences = testSet['text']
testing_labels = testSet['sent_analysis']

In [None]:
# Build the vocabulary from the training sentences only, so nothing from the
# test split influences preprocessing. Words outside the vocabulary are mapped
# to the oov_tok placeholder.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

# Turn each sentence into a list of word indices, then pad/truncate to max_length.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Convert inputs and labels to NumPy arrays before handing them to Keras
# (the labels are pandas Series up to this point).
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [None]:
def model_builder(hp):
    """Build and compile one candidate model for the hyperparameter search.

    The architecture is Embedding -> Conv1D -> AveragePooling1D -> Flatten ->
    Dense(2) -> Dense(1, sigmoid). The embedding size, input length, number of
    filters, kernel size, pool size, both activations and the optimizer are
    all drawn from ``hp``.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Object used to declare and sample the search space.

    Returns
    -------
    tensorflow.keras.Model
        A compiled binary classifier that reports the 'accuracy' metric, so
        the tuner can rank trials by 'val_accuracy'.
    """
    # Size of the embedding table; must cover every index the tokenizer emits.
    vocab_size = 15000
    hp_embedding_dim = hp.Int('embedding_dim', min_value=4, max_value=48 , step=4)
    # NOTE(review): the sequences fed to the tuner are padded to a fixed length
    # elsewhere in the notebook; confirm the installed Keras version tolerates
    # an input_length that differs from that length.
    hp_max_length = hp.Int('input_length', min_value=60, max_value=140, step=5)
    hp_filters = hp.Int('filters', min_value=2, max_value=12, step=2)
    hp_kernel_size = hp.Int('kernel_size', min_value=2, max_value=6, step=1)
    hp_pool_size = hp.Int('pool_size', min_value=2, max_value=6, step=1)
    hp_activation_conv = hp.Choice('activation_conv', values=['linear', 'softplus', 'relu', 'sigmoid', 'tanh'])
    hp_activation_dense = hp.Choice('activation_dense', values=['linear', 'softplus', 'relu', 'sigmoid', 'tanh'])
    hp_optimizer = hp.Choice('optimizer', values=['adam', 'rmsprop'])

    model = Sequential([
                      Embedding(vocab_size, hp_embedding_dim, input_length=hp_max_length),
                      Conv1D(filters=hp_filters, kernel_size=hp_kernel_size, activation=hp_activation_conv),
                      AveragePooling1D(pool_size=hp_pool_size),
                      Flatten(),
                      Dense(2, activation=hp_activation_dense),
                      Dense(1, activation='sigmoid', name="Outpul_layer")
                      ])

    opt = hp_optimizer

    # A single sigmoid output with 0/1 labels calls for binary cross-entropy.
    # The accuracy metric is required because the tuner objective and the
    # learning-curve plots both read 'accuracy' / 'val_accuracy'.
    model.compile(optimizer=opt,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
# Hyperband search over the space declared in model_builder; candidates are
# ranked by validation accuracy.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=18,
                     factor=3,
                     )

In [None]:
# Stop a training run once val_loss has gone 5 epochs without improving.
stop_early = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Share of the training data passed as validation_split to the search.
validation_split_size = 0.35
# Epoch budget passed to the search.
num_epochs = 45

# Run the hyperparameter search with the early-stopping callback attached.
tuner.search(training_padded,
             training_labels,
             validation_split=validation_split_size,
             epochs=num_epochs,
             callbacks=[stop_early],
             verbose=2)

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""The hyperparameter search is complete.
The best 'input_length' for the embedding layer is {best_hps.get('input_length')}
""")

In [None]:
# Build a fresh model from the best hyperparameters found by the search.
model = tuner.hypermodel.build(best_hps)

# Train it with the same validation split and early stopping used during the
# search. The returned History object feeds the learning-curve plots.
history = model.fit(training_padded,
                    training_labels,
                    validation_split=validation_split_size,
                    epochs=num_epochs,
                    callbacks=[stop_early],
                    verbose=2)

In [None]:
# Learning curves: loss on the left panel, accuracy on the right.
fig = plt.figure()

loss_ax = fig.add_subplot(121)
loss_ax.plot(history.history['loss'])
loss_ax.plot(history.history['val_loss'])
loss_ax.set_title("Loss vs Epochs")
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Training', 'Validation'], loc='upper right')

acc_ax = fig.add_subplot(122)
acc_ax.plot(history.history['accuracy'])
acc_ax.plot(history.history['val_accuracy'])
acc_ax.set_title("Accuracy vs Epochs")
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Training', 'Validation'], loc='upper right')

plt.show()

In [None]:
# --- Training split: loss/metrics first, then rounded per-sample predictions ---
print(model.evaluate(training_padded, training_labels, verbose=2))
print("\n")

# predict() yields one column per sample; flatten it to a 1-D vector.
df_train = pd.DataFrame({
    'real': training_labels,
    'pred': model.predict(training_padded).ravel(),
})
print(df_train.round())

# --- Test split: same procedure ---
print(model.evaluate(testing_padded, testing_labels, verbose=2))
print("\n")

df_test = pd.DataFrame({
    'real': testing_labels,
    'pred': model.predict(testing_padded).ravel(),
})
print(df_test.round())

# Per-class precision/recall/F1 on the test split, using the rounded
# probabilities as predicted labels.
report = classification_report(testing_labels, df_test['pred'].round(), output_dict=True)
df_report = pd.DataFrame(report).transpose()
print(df_report)
print("\n")