In [None]:
#install the required packages

%%capture
!pip install gretel_client pandas matplotlib numpy scipy torch

In [None]:
#import necessary packages to use the DGAN API

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

import yaml

from getpass import getpass
from gretel_client import configure_session, ClientConfig
from gretel_client.helpers import poll
from gretel_client.projects.projects import get_project

In [None]:
# Configure the Gretel session through the prompt method, so no API key is
# hard-coded in the notebook.
# NOTE(review): the endpoint below is the "api-dev" host - confirm this is
# intended rather than the production API endpoint.
configure_session(
    api_key="prompt",
    validate=True,
    cache="no",
    endpoint="https://api-dev.gretel.cloud",
)

In [None]:
def get_oil():
    """Download and load the oil datasets that we will generate synthetic data for.

    Reads the daily WTI and Brent price CSV files, renames the columns of each
    to ``Date`` plus a price column, and combines the two frames with
    ``DataFrame.merge`` using its default arguments.

    Returns:
        The merged DataFrame.
    """
    wti_url = 'https://datahub.io/core/oil-prices/r/wti-daily.csv'
    brent_url = 'https://datahub.io/core/oil-prices/r/brent-daily.csv'

    wti_prices = pd.read_csv(wti_url)
    brent_prices = pd.read_csv(brent_url)

    # Give both frames the same 'Date' column name; it is the only name they share.
    wti_prices.columns = ['Date', 'WTI Price']
    brent_prices.columns = ['Date', 'Brent Price']

    return wti_prices.merge(brent_prices)

In [None]:
# Load the oil data and preview its first five rows.
df = get_oil()
df.head(5)

In [None]:
def generate_dataframe_with_batches(df, batch_size):
    """Add the 'attributes' example-id column needed for long-style DGAN input.

    The frame is truncated to the largest multiple of ``batch_size`` rows, and
    every consecutive run of ``batch_size`` rows receives the same id (the
    positional index of the first row of that run: 0, batch_size, 2*batch_size, ...).

    Args:
        df: Input DataFrame; it is not modified.
        batch_size: Number of consecutive rows that form one example.

    Returns:
        A new DataFrame holding the truncated rows plus an 'attributes' column.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Integer division avoids depending on the `math` module, which this
    # notebook never imports.
    n_rows = (len(df) // batch_size) * batch_size

    # Work on an explicit copy: assigning a column on a slice of `df` triggers
    # pandas' SettingWithCopyWarning.
    df_ = df.iloc[:n_rows].copy()
    df_['attributes'] = [
        start
        for start in range(0, n_rows, batch_size)
        for _ in range(batch_size)
    ]
    return df_

In [None]:
# Length of each example (number of consecutive rows per time series). The same
# value is used for the DGAN config and the autocorrelation comparison.
seq_len = 6
# Group rows into examples of seq_len rows each (the original passed an
# undefined name here).
df_input = generate_dataframe_with_batches(df, seq_len)

In [None]:
# Setup config and train model

# Local CSV that the training frame is written to before it is uploaded.
TMP_FILE = "tmp_train.csv"

# NOTE: learning rates are written in plain decimal form. PyYAML only resolves
# exponent notation as a float when the mantissa contains a decimal point, so
# a value such as `1e-4` would be loaded as the string '1e-4'.
CONFIG_STRING = f"""
schema_version: 1.0

name: "oildata"

models:
  - timeseries_dgan:
        data_source: "_"

        time_column: "Date"
        example_id_column: "attributes"
        df_style: "long"

        params:
            epochs: 10
            max_sequence_len: {seq_len}
            sample_len: {seq_len}  # Must evenly divide max_sequence_len, length of time series
            batch_size: 5000
            generator_learning_rate: 0.0001
            discriminator_learning_rate: 0.0001
            attribute_discriminator_learning_rate: 0.0001
            apply_feature_scaling: True
            apply_example_scaling: True
            feature_num_layers: 3
            feature_num_units: 100
            feature_noise_dim: 10

        generate:
            num_records: 50000

"""
config = yaml.safe_load(CONFIG_STRING)

# Fetch the "DGAN" project, creating it if it does not exist yet.
project = get_project(display_name="DGAN", create=True)

print(f"Follow model training at: {project.get_console_url()}")

model = project.create_model_obj(model_config=config)

# Write the training data to disk and point the model at it; this replaces the
# "_" placeholder given as data_source in the config above.
df_input.to_csv(TMP_FILE, index=False)
model.data_source = TMP_FILE

model.submit(upload_data_source=True)

poll(model)

In [None]:
# Grab the synthetic data preview artifact and drop the example-id column so
# the frame has the same columns as the real data.
synthetic_df = pd.read_csv(
    model.get_artifact_link("data_preview"),
    compression="gzip",
).drop(columns='attributes')
synthetic_df.head()

In [None]:
# Compare the correlations between the variables in the real data with the
# correlations between the same variables in the synthetic data.
# Ideally every cell of the difference is as close to 0 as possible.
print("Difference in real correlations and synethic data correlations:")
# Column 0 is skipped in both frames; only the remaining columns are correlated.
real_corr = df.iloc[:, 1:].corr()
synthetic_corr = synthetic_df.iloc[:, 1:].corr()
print(real_corr - synthetic_corr)

In [None]:
# Now, let's visualize the probability distribution of each feature next to
# that of its synthetic counterpart.
for column in df.columns[1:]:
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    # Real and synthetic values share one histogram so the bins line up.
    samples = [np.array(df[column]), np.array(synthetic_df[column])]
    plt.hist(samples, label=["real", "synthetic"], bins=50, density=True)
    plt.legend()
    plt.xlabel(column)
    plt.ylabel("Density")
    plt.show()

In [None]:
def autocorr(X, Y):
    """Correlation coefficient between X and Y, computed along dimension 1.

    Both tensors are centred on their mean over dimension 1, and the sum of
    their product is divided by the square root of the product of their summed
    squares. Numerators and denominators that are exactly zero are replaced by
    a small epsilon before dividing, and any result outside [-1, 1] is set to 0.

    Args:
        X: Tensor with at least two dimensions.
        Y: Tensor that can be combined elementwise with X.

    Returns:
        Tensor of coefficients with dimension 1 reduced away.
    """
    EPS = 1e-8
    x_centered = X - X.mean(dim=1, keepdim=True)
    y_centered = Y - Y.mean(dim=1, keepdim=True)

    numerator = (x_centered * y_centered).sum(dim=1)
    denominator = (
        (x_centered ** 2).sum(dim=1) * (y_centered ** 2).sum(dim=1)
    ).sqrt()

    # Exact zeros become EPS so the division below never hits 0 / 0.
    numerator = torch.where(
        numerator == 0, torch.full_like(numerator, EPS), numerator
    )
    denominator = torch.where(
        denominator == 0, torch.full_like(denominator, EPS), denominator
    )

    ratio = numerator / denominator
    # Values outside the valid correlation range are zeroed out.
    out_of_range = (ratio > 1) | (ratio < -1)
    return torch.where(out_of_range, torch.zeros_like(ratio), ratio)
    
def get_autocorr(feature):
    """Mean autocorrelation of ``feature`` for each lag from 1 to length - 2.

    Args:
        feature: NumPy array whose dimension 1 is the one the lag is applied to.

    Returns:
        One-dimensional NumPy array with ``feature.shape[1] - 2`` entries;
        entry ``k`` is the mean of ``autocorr`` between the array and itself
        shifted by ``k + 1`` positions along dimension 1.
    """
    series = torch.from_numpy(feature)
    n_steps = series.shape[1]
    lag_means = torch.Tensor(n_steps - 2)

    for slot, lag in enumerate(range(1, n_steps - 1)):
        # Pair every position with the one `lag` steps later.
        leading = series[:, :-lag]
        trailing = series[:, lag:]
        lag_means[slot] = autocorr(leading, trailing).mean()

    return lag_means.detach().cpu().numpy()

In [None]:
def generate_numpy_for_autocorr(df, batch_size):
    """Build the array used to compare autocorrelation of real and synthetic data.

    The first column of ``df`` is dropped, trailing rows that do not fill a
    complete group of ``batch_size`` rows are discarded, and the remaining
    values are reshaped to (# examples, batch_size, # remaining columns).

    Args:
        df: DataFrame whose columns after the first hold the feature values.
        batch_size: Number of consecutive rows per example.

    Returns:
        Three-dimensional NumPy array as described above.
    """
    values = df.iloc[:, 1:].to_numpy()
    n_features = values.shape[1]
    usable_rows = (values.shape[0] // batch_size) * batch_size

    # Shape becomes (# examples, # time points, # features)
    return values[:usable_rows, :].reshape(-1, batch_size, n_features)

In [None]:
# Generate autocorrelation features from real and synthetic data and plot!
# `acf` comes from the real frame and `synthetic_acf` from the generated one,
# so each curve matches its legend label below.
acf = get_autocorr(generate_numpy_for_autocorr(df, seq_len))
synthetic_acf = get_autocorr(generate_numpy_for_autocorr(synthetic_df, seq_len))
# Figure 1, autocorrelation
plt.plot(acf, label="real")
plt.plot(synthetic_acf, label="generated")
plt.xlabel("Time lag (days)")
plt.ylabel("Autocorrelation")
plt.title("Autocorrelation of WTI Price and Brent Price")
plt.legend()
plt.show()