In [24]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

# Fetch up to 10,000 births recorded after the year 2000 from the public
# natality table, selecting only the five columns used in this notebook.
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
client = bigquery.Client()
query_job = client.query(query)
df = query_job.to_dataframe()
df.head()

# Summary statistics (mean, standard deviation, minimum, ...) for the numeric columns.
df.describe()

# Count how many rows fall under each value of the boolean is_male column.
df['is_male'].value_counts()

True     5158
False    4842
Name: is_male, dtype: int64

In [25]:
# Discard rows containing nulls, then shuffle with a fixed seed so the row order is repeatable.
df = shuffle(df.dropna(), random_state=2)

# weight_pounds is the label; the remaining columns become the feature DataFrame.
# is_male is a boolean, so it is cast to an integer to keep every model input numeric.
labels = df['weight_pounds']
data = (
    df.drop(columns=['weight_pounds'])
      .assign(is_male=lambda features: features['is_male'].astype(int))
)
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
757,7.374463,True,23,1,39.0
6544,8.375361,True,19,1,38.0
3190,6.239082,True,28,1,40.0
2064,8.437091,False,32,1,40.0
4066,7.374463,False,17,1,40.0


In [26]:
# Split the data into train and test sets.
# random_state is pinned (to the same seed the shuffle uses) so that the same
# rows land in the test set on every run; without it the split is re-drawn
# each time the cell executes.
x, y = data, labels
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2)

# Build the model with the tf.keras Sequential API, which defines a model
# as a plain stack of layers.
# The input width is the number of feature columns, read from the frame's
# shape so no individual row has to be pulled out just to count its fields.
num_features = x_train.shape[1]
model = Sequential([
    Dense(64, activation='relu', input_shape=(num_features,)),
    Dense(32, activation='relu'),
    Dense(1),  # one output unit with no activation: the regression prediction
])

# Compile the model so it can be trained: choose the optimizer, the loss
# function, and the metrics to log during training.
# This is a regression model (it predicts a numerical value), so the loss is
# mean squared error, and both mean absolute error and mean squared error
# are logged as metrics instead of accuracy.
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mae', 'mse'])

# Show the output shape and number of trainable parameters of each layer.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 64)                320       
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 2,433
Trainable params: 2,433
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Train for 10 epochs. validation_split=0.1 sets aside 10% of the training
# data, which is used to evaluate the model at the end of every epoch.
model.fit(x_train, y_train, validation_split=0.1, epochs=10)

# Run the trained model on the first few test rows to see how it performs.
num_examples = 10
predictions = model.predict(x_test.iloc[:num_examples])

Train on 6665 samples, validate on 741 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Print each prediction next to the true label of the same test row.
for row in range(num_examples):
    predicted_val = predictions[row][0]
    actual_val = y_test.iloc[row]
    print('Predicted val: ', predicted_val)
    print('Actual val: ', actual_val)
    print()

Predicted val:  7.1887546
Actual val:  7.50012615324

Predicted val:  7.6715326
Actual val:  7.3744626639

Predicted val:  7.501384
Actual val:  8.12623897732

Predicted val:  6.2491465
Actual val:  6.98424446016

Predicted val:  6.997168
Actual val:  6.1883756943399995

Predicted val:  6.9370313
Actual val:  6.3735639944199995

Predicted val:  6.0059104
Actual val:  8.50102482272

Predicted val:  6.141087
Actual val:  5.93704871566

Predicted val:  6.3343625
Actual val:  6.87621795178

Predicted val:  6.6541934
Actual val:  8.437090766739999



In [29]:
# The What-If Tool is most useful when every example carries its ground
# truth, so attach the test labels (y_test) to the test features as an
# extra column of a new DataFrame.
wit_data = pd.concat([x_test, y_test], axis='columns')

# Prediction hook that connects the What-If Tool to the model trained above.
def custom_predict(examples_to_infer):
    """Return the output of ``model.predict`` for the given examples, unchanged."""
    return model.predict(examples_to_infer)

# Give the What-If Tool the first 500 rows of the labelled test data.
# The column names are the feature columns followed by the label column.
wit_examples = wit_data.iloc[:500].values.tolist()
wit_columns = data.columns.tolist() + ['weight_pounds']
config_builder = (
    WitConfigBuilder(wit_examples, wit_columns)
    .set_custom_predict_fn(custom_predict)
    .set_target_feature('weight_pounds')
    .set_model_type('regression')
)
WitWidget(config_builder, height=800)

WitWidget(config={'model_type': 'regression', 'label_vocab': [], 'feature_names': ['is_male', 'mother_age', 'p…