# Train a deep learning model
In this notebook you will train a deep learning model to classify descriptions of car components as compliant or non-compliant. 

Each document in the supplied training data set is a short text description of the component as documented by an authorized technician. 
The contents include:
- Manufacture year of the component (e.g. 1985, 2010)
- Condition of the component (poor, fair, good, new)
- Materials used in the component (plastic, carbon fiber, steel, iron)

The compliance regulations dictate:
*Any component manufactured before 1995 or in fair or poor condition or made with plastic or iron is out of compliance.*

For example:
* Manufactured in 1985 made of steel in fair condition -> **Non-compliant**
* Good condition carbon fiber component manufactured in 2010 -> **Compliant**
* Steel component manufactured in 1995 in fair condition -> **Non-Compliant**

The labels present in this data are 0 for compliant, 1 for non-compliant.

The challenge with classifying text data is that deep learning models only understand vectors (e.g., arrays of numbers) and not text. To encode the car component descriptions as vectors, we use an algorithm from Stanford called [GloVe (Global Vectors for Word Representation)](https://nlp.stanford.edu/projects/glove/). GloVe provides pre-trained vectors that we can use to convert a string of text into a vector.

In [None]:
import logging
import os
import random
import re

from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import numpy as np
import pandas as pd

import keras
from keras import models 
from keras import layers
from keras import optimizers

In [None]:
# Folder names used by this notebook; only the project folder is created here.
project_folder = './dl'
deployment_folder = './deploy'

# Create the project folder. exist_ok=True keeps the cell safe to re-run and
# avoids the gap between a separate os.path.exists() check and the creation.
os.makedirs(project_folder, exist_ok=True)

In [None]:
# URL of the text file containing the pre-trained GloVe vectors
glove_url = ('https://davewdemoblobs.blob.core.windows.net/mlops/glove.6B.100d.txt')
# URL of the CSV file containing the car component descriptions
data_url = ('https://davewdemoblobs.blob.core.windows.net/mlops/components.csv')

def download_glove(url=glove_url, destination='glove.6B.100d.txt', force=False):
    """Download the GloVe vectors file unless it is already on disk.

    Args:
        url: Location to download from. Defaults to ``glove_url``.
        destination: Local path the file is stored at.
        force: When True, download even if ``destination`` already exists.

    Returns:
        None. The file is written to ``destination``.
    """
    import urllib.request

    # Skip the transfer on re-runs; an empty file is treated as missing.
    if not force and os.path.exists(destination) and os.path.getsize(destination) > 0:
        print("GloVe embeddings already downloaded.")
        return

    print("Downloading GloVe embeddings...")
    # Download to a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file under the final name.
    partial_path = destination + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, destination)
    print("Download complete.")

download_glove()


# Read the labeled component descriptions and pull the two columns out as
# plain Python lists: the description text and its 0/1 label.
print("Loading components data...")
components_df = pd.read_csv(data_url)
components, labels = (components_df[column].tolist() for column in ("text", "label"))
print("Loading components data completed.")

In [None]:
# Shuffle the rows and cut them 60% / 20% / 20% into train / validate / test frames.
# NOTE: these three frames are only used for the shape report below. The arrays
# the model is trained on are built further down from `training_samples` and
# `validation_samples`.
print("Splitting data...")
shuffled_df = components_df.sample(frac=1)
train_end = int(.6 * len(components_df))
validate_end = int(.8 * len(components_df))
# Positional slicing yields the same three pieces as np.split(...) without
# passing a DataFrame through NumPy's array-splitting helpers.
train = shuffled_df.iloc[:train_end]
validate = shuffled_df.iloc[train_end:validate_end]
test = shuffled_df.iloc[validate_end:]
print(train.shape)
print(test.shape)
print(validate.shape)

# use the Tokenizer from Keras to "learn" a vocabulary from the entire car components text
print("Tokenizing data...")
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                 # length every sequence is padded to by pad_sequences
training_samples = 90000     # rows [0, 90000) of the shuffled data are used for training
validation_samples = 5000    # the following 5000 rows are used for validation
max_words = 10000            # num_words limit passed to the Tokenizer
shuffle_seed = 42            # fixed seed so the partition below is reproducible

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(components)
sequences = tokenizer.texts_to_sequences(components)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle data and labels with one shared permutation. A dedicated, seeded
# generator makes the train/validation/test partition identical on every run
# and leaves NumPy's global random state untouched. Any other cell that rebuilds
# the hold-out rows must shuffle with the same seed to get the same rows.
indices = np.arange(data.shape[0])
np.random.RandomState(shuffle_seed).shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

# Everything after the training and validation rows is held out for testing.
x_test = data[training_samples + validation_samples:]
y_test = labels[training_samples + validation_samples:]
print("Tokenizing data complete.")

In [None]:
# apply the vectors provided by GloVe to create a word embedding matrix
print("Applying GloVe vectors...")
glove_dir =  './'

# Map each word in the GloVe file to its coefficient vector. Each line is
# parsed as: the word first, then its coefficients, separated by whitespace.
embeddings_index = {}
# `with` closes the file even if parsing fails; the explicit encoding keeps the
# result independent of the platform's default text encoding.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100

# Row i holds the GloVe vector of the word with tokenizer index i. Rows stay
# all-zero for indices with no GloVe entry; indices >= max_words are skipped.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    
print("Applying GloVe vectors compelted.")

# Define the network with Keras: an embedding layer, a flatten step, one hidden
# dense layer and a single sigmoid output unit.
print("Creating model structure...")
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

# Load the GloVe embedding matrix into the first layer and freeze it so that
# training does not update those weights.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
print("Creating model structure completed.")

# NOTE(review): 0.1 looks high for RMSprop - confirm this learning rate is intentional.
opt = optimizers.RMSprop(lr=0.1)

print("Training model...")
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])
# One pass over the training arrays, scored against the validation arrays.
fit_options = dict(epochs=1, batch_size=32, validation_data=(x_val, y_val))
history = model.fit(x_train, y_train, **fit_options)
print("Training model completed.")

print("Saving model files...")
# Make sure ./outputs/model exists, then write the trained model there.
# The original note says files under ./outputs are uploaded to run history
# automatically; that depends on the execution environment - TODO confirm.
os.makedirs('./outputs/model', exist_ok=True)
model.save('./outputs/model/model.h5')
print("model saved in ./outputs/model folder")
print("Saving model files completed.")

## Restore the model from model.h5 file

In [None]:
from keras.models import load_model

# The training cell saves the model to ./outputs/model/model.h5, so load it
# from there. Fall back to ./model/model.h5 when that file does not exist
# (for example, when the model file was copied there separately).
model_path = './outputs/model/model.h5'
if not os.path.exists(model_path):
    model_path = './model/model.h5'

model = load_model(model_path)
print("Model loaded from disk.")
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

You can also evaluate how accurately the model performs against data it has not seen. Run the following cell to load the test data that was not used in either training or validating the model. 

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the components labeled data
components_df = pd.read_csv(data_url)
components = components_df["text"].tolist()
labels = components_df["label"].tolist()

maxlen = 100                 # length every sequence is padded to by pad_sequences
training_samples = 90000     # rows [0, 90000) of the shuffled data were used for training
validation_samples = 5000    # the following 5000 rows were used for validation
max_words = 10000            # num_words limit passed to the Tokenizer
shuffle_seed = 42            # fixed seed so the shuffle below is deterministic

# Rebuild the padded integer sequences from the description text.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(components)
sequences = tokenizer.texts_to_sequences(components)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle with a dedicated, seeded generator so the permutation is the same on
# every run. An unseeded shuffle would pick different hold-out rows each time,
# and those rows could include samples the model was trained on.
# The hold-out rows match the training-time ones only when the training shuffle
# used the same seed and the CSV rows arrive in the same order.
indices = np.arange(data.shape[0])
np.random.RandomState(shuffle_seed).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Everything after the training and validation rows is the test set.
x_test = data[training_samples + validation_samples:]
y_test = labels[training_samples + validation_samples:]

Run the following cell to see the accuracy on the test set (it is the second number in the array displayed, on a scale from 0 to 1).

In [None]:
# Print the metric names first so the numbers printed below can be matched to them.
print('Model evaluation will print the following metrics: ', model.metrics_names)
# Score the model on the x_test / y_test arrays and keep the returned values.
evaluation_metrics = model.evaluate(x_test, y_test)
print(evaluation_metrics)