# ML basics with keras

## Basic classification: Classify images of clothing

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
fashion_mnist = keras.datasets.fashion_mnist
(train_img, train_label), (test_img, test_label) = fashion_mnist.load_data()

* img = 28*28. pixel from 0 to 255
* label is an array of integers, ranging from 0 to 9
* train_img and train_label are arrays
* test_img and test_label are arrays

* Label	Class
* 0	T-shirt/top
* 1	Trouser
* 2	Pullover
* 3	Dress
* 4	Coat
* 5	Sandal
* 6	Shirt
* 7	Sneaker
* 8	Bag
* 9	Ankle boot

In [None]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 
               'Dress', 'Coat', 'Sandal', 
               'Shirt', 'Sneaker', 'Bag', 
               'Ankle boot']

In [None]:
print(train_img.shape)
print(test_label.shape)
print(test_img.shape)
print(test_label.shape)

In [None]:
print(len(train_img))
print(len(test_label))
print(len(test_img))
print(len(test_label))

In [None]:
print(train_label)
print(test_label)

In [None]:
print(set(train_label))
print(set(test_label))

In [None]:
plt.figure()
plt.imshow(train_img[0])
plt.colorbar()
plt.show()

In [None]:
train_img[0]

In [None]:
train_img = train_img / 255
test_img = test_img / 255

In [None]:
train_img[0]

In [None]:
plt.figure(figsize=(10, 10))
for i in range(30):
    plt.subplot(5, 6, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_img[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_label[i]])
plt.show()

In [None]:
model = keras.Sequential([
                          keras.layers.Flatten(input_shape=(28, 28)),
                          keras.layers.Dense(128, activation='relu'),
                          keras.layers.Dense(10)
])

* flatten layer transforms from a 2 dim array 28*28 to 1 dim array 28*28 = 784 pixel
* flatten layer unstacks rows of pixels in the img and liné them up
* flatten layer has no parameter to learn, only reformats the data
*  dense layer caculates output = activation(dot(input, kernel) + bias))
* 1st dense layer has 128 nodes (neurons)
* 2nd has a logit array with length of 10
* each node contains a score that indicates the current image belongs to one of the 10 classes

* Before train, compile model by
* loss function
* optimizer
* metric. 'accuracy' is used to measure the fraction of the image that are correctly classified

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

* Training model requires the following steps:
* feed training data to the model

In [None]:
model.fit(train_img, train_label, epochs=5)

In [None]:
test_loss, test_acc = model.evaluate(test_img, test_label, verbose=2)
print('\nTest acc', test_acc)

In [None]:
probability_model = tf.keras.Sequential([
                                         model,
                                         tf.keras.layers.Softmax()
])

In [None]:
prediction = probability_model.predict(test_img)

In [None]:
prediction[0]

In [None]:
max(prediction[0])

In [None]:
np.argmax(prediction[0])  # return the indice of maximum value

In [None]:
test_label[0]

In [None]:
def plot_image(i, predictions_array, true_label, img):
  true_label, img = true_label[i], img[i]
  plt.grid(False)
  plt.xticks([])
  plt.yticks([])

  plt.imshow(img, cmap=plt.cm.binary)

  predicted_label = np.argmax(predictions_array)
  if predicted_label == true_label:
    color = 'blue'
  else:
    color = 'red'

  plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                100*np.max(predictions_array),
                                class_names[true_label]),
                                color=color)

def plot_value_array(i, predictions_array, true_label):
  true_label = true_label[i]
  plt.grid(False)
  plt.xticks(range(10))
  plt.yticks([])
  thisplot = plt.bar(range(10), predictions_array, color="#777777")
  plt.ylim([0, 1])
  predicted_label = np.argmax(predictions_array)

  thisplot[predicted_label].set_color('red')
  thisplot[true_label].set_color('blue')

In [None]:
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, prediction[i], test_label, test_img)
plt.subplot(1,2,2)
plot_value_array(i, prediction[i],  test_label)
plt.show()

In [None]:
i = 12
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, prediction[i], test_label, test_img)
plt.subplot(1,2,2)
plot_value_array(i, prediction[i],  test_label)
plt.show()

In [None]:
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_image(i, prediction[i], test_label, test_img)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_value_array(i, prediction[i], test_label)
plt.tight_layout()
plt.show()

In [None]:
img = test_img[1]
img.shape

In [None]:
img = (np.expand_dims(img, 0))
img.shape

In [None]:
prediction_single = probability_model(img)
prediction_single

In [None]:
plot_value_array(1, prediction_single[0], test_label)
_ = plt.xticks(range(10), class_names, rotation=45)

In [None]:
np.argmax(prediction_single[0])

## Basic text classification

In [12]:
import tensorflow as tf
import os
import shutil

In [10]:
tf.__version__

'2.4.0-dev20200913'

* use the Large Movie Review Dataset that contains the text of 50,000 movie reviews from the Internet Movie Database. 
* These are split into 25,000 reviews for training and 25,000 reviews for testing. 
* The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

### Download and explore the IMDB dataset

In [2]:
# url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
#                                   untar=True, 
#                                   cache_dir='.', cache_subdir='',)

In [3]:
dataset_dir = os.path.join(os.path.dirname('/media/veec20/Data/duongdq/TF_tutorial/'), 'aclImdb')

In [4]:
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

* The aclImdb/train/pos and aclImdb/train/neg directories contain many text files
* each of which is a single movie review.

In [6]:
txt_dir = os.path.join(train_dir, 'pos')
os.listdir(txt_dir)

['0_9.txt',
 '10000_8.txt',
 '10001_10.txt',
 '10002_7.txt',
 '10003_8.txt',
 '10004_8.txt',
 '10005_7.txt',
 '10006_7.txt',
 '10007_7.txt',
 '10008_7.txt',
 '10009_9.txt',
 '1000_8.txt',
 '10010_7.txt',
 '10011_9.txt',
 '10012_8.txt',
 '10013_7.txt',
 '10014_8.txt',
 '10015_8.txt',
 '10016_8.txt',
 '10017_9.txt',
 '10018_8.txt',
 '10019_8.txt',
 '1001_8.txt',
 '10020_8.txt',
 '11_9.txt',
 '12000_8.txt',
 '12001_8.txt',
 '12002_10.txt',
 '12003_10.txt',
 '12004_10.txt',
 '12005_10.txt',
 '12006_10.txt',
 '12007_10.txt',
 '12008_8.txt',
 '12009_10.txt',
 '1200_10.txt',
 '12010_10.txt',
 '12011_10.txt',
 '12012_10.txt',
 '12013_10.txt',
 '12014_10.txt',
 '12015_10.txt',
 '12016_7.txt',
 '12017_7.txt',
 '12018_7.txt',
 '12019_8.txt',
 '1201_8.txt',
 '12020_7.txt',
 '12021_9.txt',
 '12022_10.txt',
 '12023_9.txt',
 '12024_7.txt',
 '12025_7.txt',
 '12026_8.txt',
 '12027_10.txt',
 '12028_10.txt',
 '12029_10.txt',
 '8200_8.txt',
 '8201_8.txt',
 '8202_10.txt',
 '8203_7.txt',
 '8204_8.txt',
 '82

In [7]:
sample_file = os.path.join(train_dir, 'pos/12346_10.txt')
with open(sample_file) as f:
  print(f.read())

This is a very fine and poetic story. Beautiful scenery. Magnificent music score. I've been twice in Japan last year and the movie gave me this typical Japanese feeling. The movement of the camera is superb, as well as the actors. It goes deep into your feelings without becoming melodramatic. Japanese people are very sensitive and kind and it's all very well brought onto the screen here. The director is playing superb with light an colors and shows the audience that it is also possible to let them enjoy a movie with subtle and fine details. Once you've seen this movie you will want to see more from the same director. It's a real feel good movie and I can only recommend it to everybody.


### Load the dataset

* remove unsup folder

In [13]:
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

* use validation_split argument to create a validation set 

In [14]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '/media/veec20/Data/duongdq/TF_tutorial/aclImdb/train',
    batch_size = batch_size,
    validation_split = 0.2,
    subset='training',
    seed = seed
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


* iterate over the dataset and print out a few samples as follows

In [21]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(3):
        print('review', text_batch.numpy()[i])
        print('label', label_batch.numpy()[i])

review b'Recipe for one of the worst movies of all time: a she-male villain who looks like it escaped from the WWF, has terrible aim with a gun that has inconsistent effects (the first guy she shoots catches on fire but when she shoots anyone else they just disappear) and takes time out to pet a deer. Then you got the unlikable characters, 30 year old college students, a lame attempt at a surprise ending and lots, lots more. Avoid at all costs.'
label 0
review b"Icy and lethal ace hit-man Tony Arzenta (a divinely smooth and commanding performance by Alain Delon) wants to quit the assassination business, but the dangerous mobsters he works for won't let him. After his wife and child are killed, Arzenta declares open season on everyone responsible for their deaths. Director Duccio Tessari relates the absorbing story at a constant snappy pace, maintains a properly serious and no-nonsense tone throughout, stages the stirring shoot-outs and exciting car chases with considerable rip-snorting

* check which label 0 and 1 correspond to positive and negative movie reviews

print(raw_train_ds.class_names[0])  # neg
print(raw_train_ds.class_names[1])  # pos

* create validation and test set

In [24]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '/media/veec20/Data/duongdq/TF_tutorial/aclImdb/train',
    batch_size = batch_size,
    validation_split = 0.2,
    subset='validation',
    seed = seed
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [25]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '/media/veec20/Data/duongdq/TF_tutorial/aclImdb/test',
    batch_size = batch_size
)

Found 25000 files belonging to 2 classes.


### Prepare the dataset for training

### Configure the dataset for performance

### Create the model

### Loss function and optimizer

### Train the model

### Evaluate the model

### Create a plot of accuracy and loss over time

### Export the model

## Text classification with TF Hub

## Regression

## Overfit and Underfit

## Save and Load

## Tune hyperparameters with the Keras Tuner

# Load and prepared data

# Estimator