In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%tensorflow_version 2.x
import tensorflow as tf
import tempfile
import seaborn as sns
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
# alternatively you could do: import sklearn.metrics as metrics
from google.colab import widgets
# For facets
from IPython.core.display import display, HTML
import base64

from keras.datasets import imdb
import numpy as np

print('Modules are imported.')

# 3.4.1 The IMDB Dataset

# Vocabulary size handed to imdb.load_data as num_words (also reused later as
# the width of the vectorised samples).
database_dim = 10000

# Dedicated, seeded random generator: the shuffles below give the same order on
# every Restart & Run All, without touching NumPy's global random state.
SEED = 42
rng = np.random.RandomState(SEED)

# Importing the dataset: each sample is a list of integer word indices and
# each label is 0 or 1.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=database_dim)

# Re-shuffling the data. The same permutation indexes both the samples and the
# labels, so every review stays aligned with its label.
idx = rng.permutation(len(train_data))
train_data = train_data[idx]
train_labels = train_labels[idx]

idx = rng.permutation(len(test_data))
test_data = test_data[idx]
test_labels = test_labels[idx]

display(train_data)
display(train_labels)
display(train_data.shape)

# Getting the max value for each row and then the max of all.
# Only the last expression of a cell is echoed automatically, so the value has
# to be displayed explicitly or it is silently thrown away.
display(max(max(sequence) for sequence in train_data))

# Converting the first review [0] into text:
word_index = imdb.get_word_index()   # maps word -> integer index
rev_word_idx = {value: key for key, value in word_index.items()}   # integer index -> word
# The indices stored in the reviews are shifted by 3 relative to word_index
# (imdb.load_data defaults to index_from=3, keeping the lowest indices for
# special tokens); indices with no matching word are replaced by '?'.
decoded_review = ' '.join(rev_word_idx.get(i - 3, '?') for i in train_data[0])
print(decoded_review)

TensorFlow 2.x selected.


Using TensorFlow backend.


Modules are imported.


array([list([1, 449, 3214, 137, 8757, 1313, 5, 4, 1366, 33, 4490, 270, 162, 2, 19, 4, 5405, 123, 2718, 14, 996, 3605, 33, 175, 651, 2994, 8757, 9, 1201, 11, 14, 33, 32, 5, 191, 30, 7195, 18, 14, 2152, 5174, 12, 615, 9, 55, 55, 78, 137, 981, 8, 30, 6, 993, 931, 23, 2997, 33, 4, 58, 7, 45, 766, 6086, 12, 131, 1287, 1082, 163, 42, 2, 8499, 4, 7352, 26, 32, 96, 99, 196, 5, 101, 5940, 1488, 36, 203, 28, 69, 9, 416, 17, 507, 32, 1309, 46, 8, 4, 213, 7, 601, 3262, 14, 9, 5331, 17, 2, 20, 2016, 5, 146, 3521, 8, 135, 15, 285, 29, 122, 103, 14, 2, 12, 798, 60, 48, 1997]),
       list([1, 323, 675, 2324, 314, 2553, 314, 2553, 1972, 2710, 314, 8582, 1972, 10, 10, 592, 2, 7918, 7577, 9, 3208, 46, 23, 27, 7033, 11, 4749, 103, 3718, 8, 1839, 6, 2839, 4270, 95, 240, 6752, 34, 4, 5747, 174, 8, 2106, 8, 1316, 8, 28, 160, 140, 27, 2401, 47, 77, 2, 50, 5, 9, 467, 1185, 568, 7525, 21, 36, 92, 181, 90, 1533, 8, 1839, 68, 132, 36, 181, 90, 623, 46, 32, 271, 73, 21, 95, 4, 1973, 214, 2, 5, 54, 6, 6492, 568, 2

array([0, 0, 0, ..., 0, 0, 1])

(25000,)

? oh dear while chevy chase and the gang at snl set new ? with the sketch show format this fails miserably at every level fortunately chevy is barely in this at all and can't be blamed for this utter tripe it seriously is very very bad while meant to be a political comment on usa at the time of it's release 1974 it still remains neither funny or ? observed the sketches are all way too long and any satirical impact they may have had is lost as they're all drawn out to the point of complete boredom this is credited as ? movie debut and i'm pleased to say that everything he did after this ? it avoid even if curious


In [4]:
# 3.4.2 Preparing the data

# 3.2 Vectorising the data:

# The imported data are lists of integers; they need to be converted into fixed-size tensors so TF can read them.
# We'll convert each sample (a list of integers) into a multi-hot row: all 0s, with 1s only at the positions given by its integers.
# Done manually:
def vectorise_sequences(sequences, dimension=10000, dtype=np.float64):
  """Multi-hot encode a collection of integer-index sequences.

  Row ``i`` of the result is all zeros except for a 1 at every column whose
  index appears in ``sequences[i]``; an index that appears several times in a
  sequence still yields a single 1.

  Args:
    sequences: any sized iterable of integer sequences, e.g. a list of lists
      or a 1-D object array of lists.
    dimension: number of columns of the result; every index in every sequence
      must be smaller than this.
    dtype: dtype of the returned array. Defaults to float64 (the ``np.zeros``
      default); a smaller dtype such as ``np.float32`` reduces memory use.

  Returns:
    A ``(len(sequences), dimension)`` array containing only 0s and 1s.

  Raises:
    IndexError: if a sequence contains an index >= ``dimension``.
  """
  # len() instead of .shape[0] so plain Python lists work as well as arrays.
  results = np.zeros((len(sequences), dimension), dtype=dtype)
  for i, seq in enumerate(sequences):
    # Integer-array indexing: sets every column listed in seq for row i at once.
    results[i, seq] = 1
  return results

# Encode both splits with the same vocabulary width so their columns line up.
x_train, x_test = (
  vectorise_sequences(split, dimension=database_dim)
  for split in (train_data, test_data)
)

display(x_train, x_test)

# Vectorising the labels: cast them to float32 arrays.
y_train, y_test = (
  labels.astype('float32')
  for labels in (train_labels, test_labels)
)

display(y_train, y_test)

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 0.]])

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

array([0., 0., 0., ..., 0., 0., 1.], dtype=float32)

array([1., 1., 1., ..., 1., 0., 1.], dtype=float32)