In [1]:
from gan.doppelganger import DoppelGANger
from gan.util import add_gen_flag, normalize_per_sample, renormalize_per_sample
from gan.load_data import load_data
from gan.network import DoppelGANgerGenerator, Discriminator, AttrDiscriminator
from gan.output import Output, OutputType, Normalization

import matplotlib.pyplot as plt

import time
import os
import tensorflow as tf
import pandas as pd
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
wwt_df = pd.read_csv("wwt_train.csv", index_col=0)
wwt_df.head()

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2016-12-28,2016-12-29,2016-12-30,2016-12-31,domain,access,agent,domain_name,access_name,agent_name
0,-0.333795,-0.319665,-0.314942,-0.308651,-0.291297,-0.312655,-0.309529,-0.306918,-0.310413,-0.317752,...,-0.235855,-0.241672,-0.254042,-0.266611,3,0,0,es.wikipedia.org,all-access,all-agents
1,-0.692347,-0.699509,-0.7445,-0.76926,-0.7445,-0.846174,-0.784077,-0.724269,-0.699509,-0.76926,...,-0.699509,-0.707164,-0.76926,-0.707164,6,1,0,ru.wikipedia.org,desktop,all-agents
2,-0.291111,-0.298633,-0.34625,-0.431433,-0.404062,-0.324597,-0.317989,-0.342335,-0.344124,-0.390935,...,-0.528577,-0.531794,-0.563716,-0.596365,3,0,0,es.wikipedia.org,all-access,all-agents
3,-0.565913,-0.565913,-0.570443,-0.570443,-0.596365,-0.575165,-0.60549,-0.570443,-0.518001,-0.580098,...,-0.590673,-0.559447,-0.582649,-0.561561,2,0,1,en.wikipedia.org,all-access,spider
4,-0.43012,-0.547547,-0.608707,-0.538521,-0.505735,-0.449311,-0.408269,-0.39514,-0.390022,-0.418952,...,-0.518001,-0.503184,-0.519452,-0.540268,2,0,1,en.wikipedia.org,all-access,spider


In [3]:
SAMPLE_LEN = 10

NUM_SAMPLES = 50000

In [4]:
raw_attributes = wwt_df[["domain", "access", "agent"]].to_numpy()

attributes = []
for i in range(raw_attributes.shape[1]):
    a = np.zeros((raw_attributes.shape[0], np.max(raw_attributes[:,i]) + 1), dtype="uint8")
    a[np.arange(raw_attributes.shape[0]), raw_attributes[:,i]] = 1
    attributes.append(a)
attributes = np.concatenate(attributes, axis=1)
attributes.shape

(50000, 14)

In [5]:
attribute_outputs = [
    Output(OutputType.DISCRETE, np.max(raw_attributes[:,i]) + 1)
    for i in range(raw_attributes.shape[1])
]
attribute_outputs

[<gan.output.Output at 0x7f6c171cf3d0>,
 <gan.output.Output at 0x7f6c176a6290>,
 <gan.output.Output at 0x7f6c171cf6d0>]

In [6]:
features = np.expand_dims(wwt_df.iloc[:,:550].to_numpy(), axis=-1)
features.shape

(50000, 550, 1)

In [7]:

feature_outputs = [
    Output(OutputType.CONTINUOUS, 1, Normalization.MINUSONE_ONE)
]
feature_outputs

[<gan.output.Output at 0x7f6c16d0c4d0>]

In [8]:
(train_features, train_attributes, train_attribute_outputs,
 train_real_attribute_mask) = normalize_per_sample(
    features,
    attributes,
    feature_outputs,
    attribute_outputs
)

print(train_features.shape)
print(train_attributes.shape)
print(train_attribute_outputs)
print(train_real_attribute_mask)



(50000, 550, 1)
(50000, 16)
[<gan.output.Output object at 0x7f6c171cf3d0>, <gan.output.Output object at 0x7f6c176a6290>, <gan.output.Output object at 0x7f6c171cf6d0>, <gan.output.Output object at 0x7f6c165f0810>, <gan.output.Output object at 0x7f6c1727b990>]
[True, True, True, False, False]


In [9]:
train_gen_flag = np.ones((train_features.shape[0], train_features.shape[1]))
train_features, train_feature_outputs = add_gen_flag(
    train_features, train_gen_flag, feature_outputs, SAMPLE_LEN,
)
print(train_features.shape)
print(train_feature_outputs)

(50000, 550, 3)
[<gan.output.Output object at 0x7f6c16d0c4d0>, <gan.output.Output object at 0x7f6c1733c650>]


In [10]:
# Model definition

generator = DoppelGANgerGenerator(
    feed_back=False,
    noise=True,
    feature_outputs=train_feature_outputs,
    attribute_outputs=train_attribute_outputs,
    real_attribute_mask=train_real_attribute_mask,
    sample_len=SAMPLE_LEN,
)
discriminator = Discriminator()
attr_discriminator = AttrDiscriminator()

In [11]:
checkpoint_dir = "wwt_run/checkpoint"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
sample_dir = "wwt_run/sample"
if not os.path.exists(sample_dir):
    os.makedirs(sample_dir)

time_path = "wwt_run/time.txt"

length = int(train_features.shape[1] / SAMPLE_LEN)

start_time = time.time()

run_config = tf.ConfigProto()
with tf.Session(config=run_config) as sess:

    gan = DoppelGANger(
        sess=sess,
        checkpoint_dir=checkpoint_dir,
        sample_dir=sample_dir,
        time_path=time_path,
        epoch=400,
        batch_size=100,
        data_feature=train_features,
        data_attribute=train_attributes,
        real_attribute_mask=train_real_attribute_mask,
        data_gen_flag=train_gen_flag,
        sample_len=SAMPLE_LEN,
        data_feature_outputs=train_feature_outputs,
        data_attribute_outputs=train_attribute_outputs,
        vis_freq=200,
        vis_num_sample=5,
        generator=generator,
        discriminator=discriminator,
        attr_discriminator=attr_discriminator,
        d_gp_coe=10.0,
        attr_d_gp_coe=10.0,
        g_attr_d_coe=1.0,
        d_rounds=1,
        g_rounds=1,
        num_packing=1,
        extra_checkpoint_freq=5,
    )

    gan.build()
    gan.train()
    
    # Generate some data
    real_attribute_input_noise = gan.gen_attribute_input_noise(NUM_SAMPLES)
    addi_attribute_input_noise = gan.gen_attribute_input_noise(NUM_SAMPLES)
    feature_input_noise = gan.gen_feature_input_noise(NUM_SAMPLES, length)
    input_data = gan.gen_feature_input_data_free(NUM_SAMPLES)
    
    internal_features, internal_attributes, gen_flags, lengths = gan.sample_from(
        real_attribute_input_noise, addi_attribute_input_noise,
        feature_input_noise, input_data
    )
    
end_time = time.time()

print(f"Elapsed time: {end_time} seconds")





2022-05-10 17:56:11.947672: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2022-05-10 17:56:11.953206: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2299995000 Hz
2022-05-10 17:56:11.954019: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558e8ff0d510 executing computations on platform Host. Devices:
2022-05-10 17:56:11.954127: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2022-05-10 17:56:11.955139: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1





Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


2022-05-10 17:56:12.578103: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 17:56:12.578787: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2022-05-10 17:56:12.579076: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.1
2022-05-10 17:56:12.580721: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10
2022-05-10 17:56:12.582241: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10
2022-05-10 17:56:12.582526: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcurand.so.10
2

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




Instructions for updating:
Use `tf.cast` instead.


























Layers of DoppelGANgerGenerator
[<tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/linear/dense/kernel:0' shape=(5, 100) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/linear/dense/bias:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/batch_norm/beta:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/batch_norm/gamma:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/batch_norm/moving_mean:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer0/batch_norm/moving_variance:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer1/linear/dense/kernel:0' shape=(100, 100) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real/layer1/linear/dense/bias:0' shape=(100,) dtype=float32_ref>, <tf.Variable 'DoppelGANgerGenerator/attribute_real




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






  0%|                                                                                         | 0/400 [00:00<?, ?it/s]2022-05-10 17:56:22.964090: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10
2022-05-10 17:56:23.169609: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
  1%|▉                                                                            | 5/400 [09:55<13:06:27, 119.46s/it]

Instructions for updating:
Use standard file APIs to delete files with this prefix.


100%|███████████████████████████████████████████████████████████████████████████| 400/400 [12:57:41<00:00, 116.65s/it]


Elapsed time: 1652252070.322122 seconds


In [12]:
features, attributes = renormalize_per_sample(
    internal_features, internal_attributes, train_feature_outputs,
    train_attribute_outputs, gen_flags,
    num_real_attribute=len(train_attribute_outputs)-2
)

In [13]:
print(features.shape)
print(attributes.shape)

(50000, 550, 1)
(50000, 14)


In [14]:
synthetic_wwt_df = pd.DataFrame(features.reshape(features.shape[0], -1))
synthetic_wwt_df.columns = wwt_df.columns[:features.shape[1]]

# Convert from softmax/onehot to categorical
synthetic_wwt_df["domain"] = np.argmax(attributes[:,:9], axis=1)
synthetic_wwt_df["access"] = np.argmax(attributes[:,9:12], axis=1)
synthetic_wwt_df["agent"] = np.argmax(attributes[:,12:], axis=1)

synthetic_wwt_df.head()

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31,domain,access,agent
0,-0.615876,-0.554767,-0.54856,-0.616219,-0.629397,-0.741533,-0.742521,-0.588568,-0.455513,-0.645769,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,2,0,1
1,-0.468078,-0.427944,-0.456671,-0.47599,-0.46028,-0.481095,-0.447726,-0.430357,-0.438967,-0.428374,...,-0.333785,-0.430797,-0.443526,-0.448922,-0.447163,-0.406068,-0.417095,4,2,0
2,-0.271312,-0.243591,-0.205797,-0.276314,-0.222196,-0.2751,-0.205425,-0.272264,-0.274321,-0.275742,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,6,0,0
3,-0.407226,-0.393146,-0.396996,-0.396565,-0.405249,-0.406002,-0.398638,-0.400576,-0.372041,-0.395549,...,-0.111871,-0.153673,-0.155286,-0.109415,-0.150966,-0.164986,-0.174724,2,1,0
4,-0.416292,-0.424349,-0.352991,-0.454946,-0.395671,-0.463028,-0.229393,-0.421823,-0.40485,-0.410251,...,-0.228286,-0.233218,-0.326942,-0.262122,-0.270156,-0.287923,-0.207713,5,2,0


In [15]:
synthetic_wwt_df.to_csv("synthetic_tf1.csv")