# Tutorial 2 - GANs

Let's generate some (fake) NORMAL data based on Tutorial 1.

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

random_state=42

# Get the data

In [None]:
# Load the NORMAL Airbnb data; the path is relative to the current working directory
airbnb_normal = pd.read_csv("airbnb-normal.csv")


In [None]:
airbnb_normal.shape

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

##  Identify the numerical and categorical columns

In [None]:
airbnb_normal.dtypes

**At this stage, you can manually identify numeric, binary, and categorical columns as follows:**

`numeric_columns = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'Number of amenities', 'guests_included', 'price_per_extra_person', 'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review', 'review_scores_rating']`
 
 `binary_columns = ['host_is_superhost', 'host_identity_verified']`
 
 `categorical_columns = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']`
 
<br>
 
**If you do not want to type these by hand, you can use the tricks below:**

In [None]:
# Numeric columns: every column whose dtype is a NumPy number
numeric_columns = list(airbnb_normal.select_dtypes(include=[np.number]).columns)

# Categorical columns: every column stored with the generic object dtype
categorical_columns = list(airbnb_normal.select_dtypes(include=['object']).columns)

In [None]:
# Binary columns are listed by hand so they can be kept out of the scaler;
# in the pipeline below they are only imputed, never scaled or one-hot encoded.
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [None]:
# numeric_columns was selected by dtype, so it still contains the binary columns.
# Rebuild the list instead of calling list.remove() so that re-running this cell
# is harmless: list.remove() raises ValueError once a column is already gone.
numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [None]:
binary_columns

In [None]:
numeric_columns

In [None]:
categorical_columns

# Pipeline

In [None]:
# Numeric features: fill missing values with the column median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

In [None]:
# Categorical features: replace missing values with the literal 'unknown',
# then one-hot encode (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [None]:
# Binary features: only fill missing values with the most frequent value
binary_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
]
binary_transformer = Pipeline(binary_steps)

In [None]:
# Route each column group through its own transformer.
# sparse_threshold=0 makes the result a dense array even when the one-hot block
# is mostly zeros; the tf.data pipeline built later expects a dense array, not a
# SciPy sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',  # optional: keeps any column not listed above as-is
    sparse_threshold=0,
)

# Transform: fit_transform() for NORMAL data

In [None]:
# Fit the preprocessor on the NORMAL data and transform it in one step
normal_x = preprocessor.fit_transform(airbnb_normal)

# Preview the first rows only instead of echoing the whole matrix
normal_x[:5]

In [None]:
normal_x.shape

# GAN

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
codings_size = 100   # length of the random noise vector fed to the generator

# One generator output per preprocessed feature, read from the data itself so the
# network cannot drift out of sync with the preprocessing pipeline.
n_features = normal_x.shape[1]

# Seed TensorFlow so weight initialization and noise sampling are repeatable
tf.random.set_seed(random_state)

generator = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[codings_size]),  # a shape sequence, not a bare int
    keras.layers.Dense(90, activation="relu"),
    keras.layers.Dense(80, activation="relu"),
    keras.layers.Dense(n_features, activation=None)
])

# Note1: the last layer must have as many units as the real data has columns.
# Note2: the last layer has no activation, i.e. it is linear, so the generator
# can output continuous, unbounded values for every feature.

In [None]:
generator.summary()

In [None]:
# The discriminator sees rows shaped like the preprocessed real data, so its
# input width is taken from normal_x instead of being hard-coded.
discriminator = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[normal_x.shape[1]]),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

# Note: the input shape must equal the number of columns in the real data,
# because the discriminator receives both real and generated rows.

In [None]:
discriminator.summary()

In [None]:

# Stack the two networks: noise -> generator -> discriminator -> real/fake score
gan = keras.models.Sequential()
gan.add(generator)
gan.add(discriminator)

In [None]:
# The discriminator is trained on its own as a real-vs-fake classifier.
discriminator.compile(loss="binary_crossentropy", optimizer="Adam")

# Freeze the discriminator before compiling the stacked model, so that training
# `gan` is meant to update only the generator's weights.
discriminator.trainable = False

# The stacked model ends in the discriminator's sigmoid and is trained against
# 0/1 labels, so it uses the same binary cross-entropy loss as the discriminator.
# Mean squared error on a sigmoid output gives much weaker gradients when the
# generator's samples are confidently rejected.
gan.compile(loss="binary_crossentropy", optimizer="Adam")

In [None]:
batch_size = 32

# Slice the preprocessed matrix row by row, shuffle with a 1000-element buffer,
# group into full batches only (the short last batch is dropped), and prefetch
# one batch ahead.
dataset = (
    tf.data.Dataset.from_tensor_slices(normal_x)
    .shuffle(1000)
    .batch(batch_size, drop_remainder=True)
    .prefetch(1)
)

In [None]:
def train_gan(gan, dataset, batch_size, codings_size, n_epochs=10):
    """Alternate discriminator and generator updates over `dataset`.

    Args:
        gan: stacked model whose `layers` are exactly [generator, discriminator].
        dataset: iterable of real-data batches; every batch is expected to hold
            exactly `batch_size` rows, because the labels are sized from it.
        batch_size: number of noise vectors (and of real rows) per training step.
        codings_size: length of each noise vector fed to the generator.
        n_epochs: number of full passes over `dataset`.

    Returns:
        None. The models are trained in place and one progress line is printed
        per epoch.
    """
    generator, discriminator = gan.layers
    for epoch in range(n_epochs):
        for X_batch in dataset:
            # phase 1 - training the discriminator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            # Match the real batch's dtype so tf.concat accepts both halves,
            # whatever precision the dataset was built with.
            generated_data = tf.cast(generator(noise), X_batch.dtype)
            X_fake_and_real = tf.concat([generated_data, X_batch], axis=0)
            # Labels follow the concat order: fakes first (0), then reals (1)
            y1 = tf.constant([[0.]] * batch_size + [[1.]] * batch_size)
            discriminator.trainable = True
            discriminator.train_on_batch(X_fake_and_real, y1)
            # phase 2 - training the generator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            # The generator is rewarded when its fakes are scored as real (1)
            y2 = tf.constant([[1.]] * batch_size)
            discriminator.trainable = False
            gan.train_on_batch(noise, y2)
        # Report epochs 1-based so the final line reads n_epochs/n_epochs
        print("Epoch: {}/{}".format(epoch + 1, n_epochs))
        

In [None]:
# Quick demonstration run of the training loop
train_gan(gan, dataset, batch_size, codings_size, n_epochs=10)

# NOTE: 10 epochs are not enough; raise n_epochs for a serious run.

# Generate new data using trained generator

In [None]:
# Draw a single noise vector and let the generator turn it into one synthetic row
# (the row lives in the same preprocessed feature space as normal_x).
noise = tf.random.normal([1, codings_size])
generated_data = tf.cast(generator(noise), dtype=tf.float64)

In [None]:
generated_data

# How can you check the validity of the "fake" data?

Save the generated data, go back to Tutorial 1, and feed it into the autoencoder. If the reconstruction error is low, the data counts as "normal".