In [1]:
import tensorflow as tf
print("tensorflow version: %s" % tf.__version__)
import edward as ed
print("edward version: %s" % ed.__version__)
import edward.models as edm
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import gamma

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


tensorflow version: 1.6.0
edward version: 1.3.5


In [2]:
import edward.inferences as edi
from scipy.stats import norm
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [4]:
tf.set_random_seed(10)

## Set constants

In [5]:
PHASE_SHIFT_DAYS = [21., 18., 11., 5., 3., 0.1]
PHASES = 6
N = 1
PHASE_COLUMNS = ['green', 'colour_break_1', 'colour_break_2', 'pink', 'cherry', 'blue']
SAMPLES_WEIGHTS = 10

In [6]:
class BlueberryModel():
    
    def __init__(self):
        self.inputs = None
        self.shifts = None
        self.grow = None
        self.outputs = None
        self.latent_vars = None
        self.q_grow = None
        self.latent_var_dict = None
        self.input_ph = None
        self.output_ph = None
        self.sess = None
        self.gamma_params = None
        
    def build_model(self):
        self.inputs = [0] * PHASES
        self.shifts = [0] * PHASES
        self.grow = [0] * PHASES
        self.outputs = [0] * PHASES
        for i in range(PHASES):
            self.inputs[i] = edm.Normal(20., 7., sample_shape=[N])
            shiftmean = self.get_shift_dist(self.inputs[i], PHASE_SHIFT_DAYS[i], PHASE_SHIFT_DAYS[i]/2.)
            self.shifts[i] = edm.Normal(shiftmean, PHASE_SHIFT_DAYS[i]/2.)
            self.grow[i] = self.shifts[i - 1] if i > 0 else edm.Normal(20.,7., sample_shape=[N])
            self.outputs[i] = edm.Normal(self.inputs[i] - self.shifts[i] + self.grow[i], self.inputs[i].scale + self.shifts[i].scale + self.grow[i].scale)
    
    @staticmethod
    def get_shift_dist(n, phase_mean, phase_stddev, days=7.):
        p = norm.cdf(days,phase_mean, phase_stddev)
        return n * p.astype('float32')
    
    def build_latent_vars(self):
        self.latent_vars = [0] * PHASES
        self.q_grow = edm.Normal(loc=tf.Variable(tf.random_uniform([])),
                scale=tf.Variable(tf.random_uniform([])), sample_shape=[N])
        for i in range(PHASES):
            self.latent_vars[i] = edm.Normal(loc=tf.Variable(tf.random_uniform([])),
                scale=tf.Variable(tf.random_uniform([])), sample_shape=[N])
            
    def create_latent_var_dict(self):
        self.latent_var_dict = {self.grow[0]: self.q_grow}
        self.latent_var_dict.update({key: value for key, value in zip(self.shifts, self.latent_vars)})
        
    def create_placeholder(self):
        placeholder = [0] * PHASES
        for i in range(0, PHASES):
            placeholder[i] = tf.placeholder(tf.float32, shape=[N])
        return placeholder    
    
    def create_data_dict(self):
        self.data_dict = {}
        self.data_dict.update({key: value for key, value in zip(self.inputs, self.input_ph)})
        self.data_dict.update({key: value for key, value in zip(self.outputs, self.output_ph)})
    
    def initialize(self):
        tf.reset_default_graph()
        with tf.name_scope("model"):
            self.build_model()
        self.sess = ed.get_session()
        with tf.name_scope("posterior"):
            self.build_latent_vars()
            
        # Create placeholders for the observed data
        self.input_ph = self.create_placeholder()
        self.output_ph = self.create_placeholder()
        
        # Create dictionaries for the latent variables and the observed data (with placeholders for now)
        self.create_latent_var_dict()
        self.create_data_dict()
    
    def do_inference(self, samples):
        n_batches = samples.shape[0] // N

        inf = edi.KLqp(self.latent_var_dict, data=self.data_dict)
        inf.initialize(n_iter=100 * n_batches, n_samples=10)

        self.sess.run(tf.global_variables_initializer())

        for i in range(inf.n_iter):
            data_index = i * N % samples.shape[0]

            # Populate the placeholders
            feed_dict = {}
            # Add inputs, one for each phase
            feed_dict.update({key: [value] for key, value in zip(self.input_ph, samples[data_index, 0:PHASES])})
            # Add outpust, one for each phase
            feed_dict.update({key: [value] for key, value in zip(self.output_ph, samples[data_index, PHASES:2*PHASES])})

            info_dict = inf.update(feed_dict=feed_dict)
            inf.print_progress(info_dict)
            
    def predict(self, samples):
        # copy posterior
        x_post = ed.copy(self.outputs[-1], self.latent_var_dict)
        sess=self.sess
        predictions = np.zeros((samples.shape[0], 3))
        for i in range(0, samples.shape[0]):
            feed_dict = {}
            feed_dict.update({key: [value] for key, value in zip(self.input_ph, samples[i, 0: PHASES])})
            quantile_1, quantile_2, mean = sess.run([x_post.quantile(0.025), x_post.quantile(0.975), x_post.mean()], 
                                                    feed_dict=feed_dict)
            predictions[i, :] = [quantile_1, mean, quantile_2]

        return predictions
            
    def evaluate_model(self, samples):
        predictions = self.predict(samples[:,0: PHASES])
        accuracy = self.calc_accuracy(samples[:, -1], predictions)
        print('Model accuracy:', accuracy, '%')
        
    def fit_weight_prediction(self, weight_data):
        self.gamma_params=stats.gamma.fit(weight_data.avg_blueberry_weight)
                              
        
    def predict_weight(self, samples):
        
        predictions = np.zeros((samples.shape[0], SAMPLES_WEIGHTS))
        for i in range(0, SAMPLES_WEIGHTS):
            print('Sampling from the blueberry number distribution, iteration:', i)
            predictions[:, i] = self.predict(samples)[:,1]
        weight_samples = gamma(self.gamma_params[0], self.gamma_params[1], self.gamma_params[2]).rvs((samples.shape[0], SAMPLES_WEIGHTS))
        weight_predictions = predictions*weight_samples
        mean_weights = np.mean(weight_predictions, axis=1)
        stddev_weights = np.std(weight_predictions, axis=1)

        return mean_weights, stddev_weights
    
    def evaluate_weight_prediction(self, weights):
        self.fit_weight_prediction(weights)
        mean_weights, stddev_weights = self.predict_weight(samples)
        _, p_val = stats.kstest(weights.avg_blueberry_weight, 'gamma', args=self.gamma_params)
        mse = ((mean_weights - weights.weight.values[:-1])**2).mean()
        if p_val < 0.05:
            print('This is a bad approximation')
        else:
            print('The gamma distribution seems to be a good fit.')
        print('Mean squared error for blueberry weight prediction:', mse)
        
    @staticmethod
    def calc_accuracy(samples, predictions):
        return ((samples > predictions[:,0]) & (samples < predictions[:,2])).sum()/len(samples)*100.
    
    @staticmethod
    def save_model(filename):
        saver = tf.train.Saver()
        sess = ed.get_session()
        save_path = saver.save(sess, filename)
        print("Model saved to file: %s" % save_path)
            
            

In [7]:
df_weight = pd.read_csv('weight_data.csv')
df_count = pd.read_csv('count_data.csv')
df_merge = df_count.merge(df_weight, how='left', on='week')
df_merge.loc[:, 'avg_blueberry_weight'] = df_merge.weight/df_merge.blue
df_merge = df_merge.replace(np.inf, None).dropna(subset=['avg_blueberry_weight'])
# samples used to predict blueberry numbers in last phase
df = df_merge[PHASE_COLUMNS]
samples = pd.concat([df,df.shift(-1)], axis=1).values[:-1,:] 
# weight data
weights = df_merge[['blue', 'avg_blueberry_weight', 'weight']]

In [8]:
model = BlueberryModel()
model.initialize()
model.do_inference(samples)
model.fit_weight_prediction(weights)

1700/1700 [100%] ██████████████████████████████ Elapsed: 18s | Loss: 2796.712


In [9]:
model.save_model('./model.cpkt')

Model saved to file: ./model.cpkt


In [10]:
model.evaluate_model(samples)

Model accuracy: 64.70588235294117 %


In [11]:
model.evaluate_weight_prediction(weights)

Sampling from the blueberry number distribution, iteration: 0
Sampling from the blueberry number distribution, iteration: 1
Sampling from the blueberry number distribution, iteration: 2
Sampling from the blueberry number distribution, iteration: 3
Sampling from the blueberry number distribution, iteration: 4
Sampling from the blueberry number distribution, iteration: 5
Sampling from the blueberry number distribution, iteration: 6
Sampling from the blueberry number distribution, iteration: 7
Sampling from the blueberry number distribution, iteration: 8
Sampling from the blueberry number distribution, iteration: 9
The gamma distribution seems to be a good fit.
Mean squared error for blueberry weight prediction: 0.336432439508296
