# Training and Testing an MDRNN on Synthetic Data

This script trains and tests a mixture density recurrent neural network (MDRNN) on synthetic performances.

Ideas to investigate:

- give the data a variety of periods instead of 15000 samples that all share the same period
- save all the waves together (done)

In [None]:
%matplotlib inline
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time

## Statistics of the time gaps between events (found from studying human sourced data).
## The generators below read only the mean and standard deviation; the min and
## max are not referenced anywhere in this section.
time_mean = 0.044860
time_std = 0.183995
time_min = 0.000454
time_max = 8.463944


def gen_function_data(function_generator, freq, num_samples):
    """Build a synthetic performance whose values follow a periodic function.

    Time gaps are drawn from a normal distribution parameterised by
    ``time_mean`` and ``time_std``; their absolute values are used so that the
    accumulated time never decreases.

    Args:
        function_generator: Callable applied to the phase
            ``freq * 2 * pi * seconds`` (passed as a pandas Series).
        freq: Frequency multiplier applied to the elapsed time.
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with columns ``dt`` (gap since the previous row),
        ``seconds`` (cumulative sum of ``dt``) and ``value`` (generator output
        scaled by 0.5 and shifted by 0.5, so a generator ranging over
        [-1, 1] yields values in [0, 1]).
    """
    gaps = np.random.normal(loc=time_mean, scale=time_std, size=num_samples)
    frame = pd.DataFrame({'dt': np.abs(gaps)})
    frame['seconds'] = frame['dt'].cumsum()
    phase = freq * 2 * np.pi * frame['seconds']
    frame['value'] = (function_generator(phase) * 0.5) + 0.5
    return frame


def gen_noise(num_samples):
    """Build a synthetic performance whose values are uniform noise.

    Time gaps are produced exactly as in ``gen_function_data``; the values are
    drawn independently from a uniform distribution on [0, 1).

    Args:
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with columns ``dt``, ``value`` and ``seconds`` (in that order).
    """
    # Draw the gaps before the values so the random stream is consumed in a fixed order.
    gaps = np.random.normal(loc=time_mean, scale=time_std, size=num_samples)
    levels = np.random.uniform(low=0.0, high=1.0, size=num_samples)
    frame = pd.DataFrame({'dt': np.abs(gaps), 'value': levels})
    frame['seconds'] = frame['dt'].cumsum()
    return frame

# Build the synthetic corpus: one performance per (waveform, frequency) pair,
# with the NSAMPLE budget split evenly between the pairs.
NSAMPLE = 150000
functions = [
    np.sin,
    scipy.signal.square,
    (lambda t: scipy.signal.sawtooth(t, width=0.5)),  # width=0.5 gives a triangle wave
]
frequencies = [0.1, 0.35, 0.6, 0.85, 1.1]
num_cases = len(frequencies) * len(functions)
samples_per_case = NSAMPLE // num_cases

# Waveform-major order: every frequency of one waveform before moving to the next.
output_dfs = [
    gen_function_data(wave_fn, wave_freq, samples_per_case)
    for wave_fn in functions
    for wave_freq in frequencies
]

# Generating Synth Sourced Data

Generating some data sourced from regular functions.

In [None]:
# Save synthetic Data in a compressed numpy file.
dataset_location = '../datasets/'
dataset_filename = 'empi-synthetic-dataset.npz'
# Keep only the columns that are stored: the time gap and the value.
log_arrays = [np.array(log_df[['dt', 'value']]) for log_df in output_dfs]

## Generate some stats and convert to compressed formats.
raw_perfs = []
acc = 0  # total count of scalar values across all logs
# Named total_time (not `time`) so the `time` module imported at the top of
# the file is not overwritten by this cell.
total_time = 0
interactions = 0  # total count of rows across all logs
for log in log_arrays:
    acc += log.shape[0] * log.shape[1]
    interactions += log.shape[0]
    total_time += log.T[0].sum()  # column 0 holds the dt values
    raw = log.astype('float32')  # dt, x_1, ... , x_n
    raw_perfs.append(raw)

print("total number of values:", acc)
print("total number of interactions:", interactions)
print("total time represented:", total_time, "seconds")
print("total number of perfs in raw array:", len(raw_perfs))
raw_perfs = np.array(raw_perfs)
np.savez_compressed(dataset_location + dataset_filename, perfs=raw_perfs)
print("done saving:", dataset_location + dataset_filename)

# Generating Noise Sourced Data

Generating some data sourced from uniform noise.

In [None]:
# Save noise Data in a compressed numpy file.
NSAMPLE = 45000  # previously 150000
dataset_location = '../datasets/'
dataset_filename = 'empi-noise-dataset.npz'
# A single noise performance, keeping only the time gap and the value columns.
log_arrays = [np.array(gen_noise(NSAMPLE)[['dt', 'value']])]

## Generate some stats and convert to compressed formats.
raw_perfs = []
acc = 0  # total count of scalar values across all logs
# Named total_time (not `time`) so the `time` module imported at the top of
# the file is not overwritten by this cell.
total_time = 0
interactions = 0  # total count of rows across all logs
for log in log_arrays:
    acc += log.shape[0] * log.shape[1]
    interactions += log.shape[0]
    total_time += log.T[0].sum()  # column 0 holds the dt values
    raw = log.astype('float32')  # dt, x_1, ... , x_n
    raw_perfs.append(raw)

print("total number of values:", acc)
print("total number of interactions:", interactions)
print("total time represented:", total_time, "seconds")
print("total number of perfs in raw array:", len(raw_perfs))
raw_perfs = np.array(raw_perfs)
np.savez_compressed(dataset_location + dataset_filename, perfs=raw_perfs)
print("done saving:", dataset_location + dataset_filename)

## Generating Synthetic data:

The idea is to generate some data that "could" have been human, but has a regular pattern. The time steps between samples are drawn from a normal distribution with the same mean and standard deviation as the human corpus; their absolute values are taken so that time always moves forward, and they are accumulated to form the time axis. The value axis is generated by applying regular signal functions (e.g., sine, square, triangle) to this time axis.


In [None]:
small_data_length = 1000  # number of samples in each small demonstration dataset


def plot_gen_data(data_df, name="unknown", colour='b', num_points=300):
    """Plot the start of a generated dataset and save it as PDF and PNG.

    Args:
        data_df: DataFrame with ``seconds`` (x axis) and ``value`` (y axis)
            columns.
        name: Prefix of the output files, which are written to
            ``../images/synth_data_output/<name>_data_output.{pdf,png}``.
        colour: Line colour passed through to seaborn.
        num_points: Number of leading rows of ``data_df`` to plot.
    """
    import os  # local import so this cell does not depend on a file-level import

    output_dir = '../images/synth_data_output/'
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    sns.set_style('whitegrid')
    sns.set_context('paper')
    f, ax = plt.subplots(figsize=(3, 2))
    sns.lineplot(x='seconds', y='value', data=data_df[:num_points], ax=ax, legend=False, color=colour)
    ax.set_xlabel("seconds")
    ax.set_ylabel("position")
    # Values are expected in [0, 1]; leave a small margin above and below.
    ax.set(ylim=(-0.1, 1.1))
    sns.despine(f, left=True, bottom=True)
    ax.get_figure().savefig(output_dir + name + '_data_output.pdf', dpi=300, bbox_inches="tight")
    ax.get_figure().savefig(output_dir + name + '_data_output.png', dpi=300, bbox_inches="tight")

# Use the first colour of seaborn's 'deep' palette for every example plot.
colour = sns.color_palette('deep')[0]

# Generate fake Sine movement
sine_df = gen_function_data(np.sin, 0.1, small_data_length)
display(sine_df.describe())
plot_gen_data(sine_df, name='sine', colour=colour)

# Generate fake Squarey Movement
square_df = gen_function_data(scipy.signal.square, 0.1, small_data_length)
display(square_df.describe())
plot_gen_data(square_df, name='square', colour=colour)

# Generate fake triangle-y Movement
# scipy.signal.sawtooth defaults to width=1 (a rising ramp); width=0.5 gives the
# symmetric triangle wave, matching the triangle generator used for the saved dataset.
tri_df = gen_function_data(lambda t: scipy.signal.sawtooth(t, width=0.5), 0.1, small_data_length)
display(tri_df.describe())
plot_gen_data(tri_df, name='tri', colour=colour)

# Generate noise
noise_df = gen_noise(small_data_length)
display(noise_df.describe())
plot_gen_data(noise_df, name='noise', colour=colour)