In [None]:
# Reference notebook: https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class5_class_reg.ipynb

In [None]:
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
import json

In [None]:
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split a DataFrame into the (x, y) float32 arrays used for training.

    Every column except `target` becomes a feature column of `x`. How `y`
    is encoded depends on the dtype of the target column:

    * int64 / int32 target -> treated as classification; `y` is one-hot
      encoded with one column per distinct value (``pd.get_dummies``).
    * any other dtype      -> treated as regression; `y` has a single column.

    Args:
        df: source DataFrame. All feature columns must be castable to float32.
        target: name of the column to predict.

    Returns:
        Tuple ``(x, y)`` of 2-D ``np.float32`` arrays, one row per row of `df`.
    """
    feature_columns = [col for col in df.columns if col != target]

    # df[target] is a Series for a unique column name but a DataFrame when the
    # name is duplicated; in the latter case .dtypes is a Series of dtypes and
    # the first one is used.
    target_type = df[target].dtypes
    if isinstance(target_type, pd.Series):
        target_type = target_type.iloc[0]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and current pandas.
    x = df[feature_columns].values.astype(np.float32)

    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return x, dummies.values.astype(np.float32)

    # Regression: double brackets keep y two-dimensional, shape (n_rows, 1).
    return x, df[[target]].values.astype(np.float32)

In [None]:
def reduce_columns(df, mapping_path='../column_mapping.json', columns_to_keep=None):
    """Keep only the columns mapped from a set of original column names.

    The JSON file at `mapping_path` maps each original column name to the
    cleaned column name(s) present in `df`. The selected cleaned columns are
    returned with every row that contains a missing value dropped.

    Args:
        df: cleaned DataFrame to reduce. It is not modified.
        mapping_path: path of the JSON mapping file. Defaults to
            '../column_mapping.json', relative to the working directory.
        columns_to_keep: original column names to keep. Defaults to
            YearsProgram, YearsCodedJob, Country, ImportantBenefits,
            CompanyType and Salary.

    Returns:
        A new DataFrame holding only the mapped columns, without NaN rows.

    Raises:
        KeyError: if a requested name is missing from the mapping, or a mapped
            column is missing from `df`.
    """
    if columns_to_keep is None:
        # Important columns: YearsProgram, YearsCodedJob, Country, ImportantBenefits, CompanyType
        columns_to_keep = ['YearsProgram', 'YearsCodedJob', 'Country', 'ImportantBenefits', 'CompanyType', 'Salary']

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(mapping_path, 'r', encoding='utf-8') as f:
        column_mapping = json.load(f)

    cleaned_columns_to_keep = []
    for original_col in columns_to_keep:
        mapped = column_mapping[original_col]
        # A bare string would otherwise be extended character by character.
        if isinstance(mapped, str):
            mapped = [mapped]
        cleaned_columns_to_keep.extend(mapped)

    return df[cleaned_columns_to_keep].dropna()

In [None]:
# Location of the cleaned dataset CSV. Set the CLEANED_DATA_CSV environment
# variable to point at another copy; the original hard-coded path is the fallback.
filename_read = os.environ.get(
    "CLEANED_DATA_CSV",
    r"c:\Users\fr23505\Documents\machine\git\cleaned_data.csv",
)
# 'NA' and '?' are read as missing values.
df = pd.read_csv(filename_read, na_values=['NA', '?'])
# Keep only the mapped columns of interest and drop rows with missing values.
df = reduce_columns(df)

In [None]:
# Build the 2-D float32 training matrices from the dataframe:
#   x - feature matrix (every column except 'Salary')
#   y - target values taken from the 'Salary' column
# NOTE(review): to_xy one-hot encodes int32/int64 targets; confirm 'Salary' is
# read as a float column so that y keeps a single regression column.
x, y = to_xy(df, target='Salary')

In [None]:
# Hold out half of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.50,
    random_state=42,
)

In [None]:
# Create the Neural Network
# Sequential network with 3 Dense layers.
# Layer 1: one node per dataframe column (len(df.columns)), using the rectifier
# Layer 2: half as many nodes, using the rectifier
# Layer 3: 1 node (regression output, no activation)
# NN using optimization adam, minimising mean squared error
#
# The hidden layers use 'relu': with 'linear' activations the stacked Dense
# layers collapse into a single linear map, so they would add no capacity.

# https://keras.io
# https://keras.io/layers/core/
# https://keras.io/losses/
# https://keras.io/optimizers/

# Optimizers
# SGD - stochastic gradient descent
# RMSprop
# Adagrad - Adaptive gradient
# Adadelta
# Adam - Adaptive Moment Estimation
# Adamax
# Nadam - Nesterov Adam optimizer

# Activation Functions
# softmax
# elu - Exponential Linear Unit
# selu - Scaled Exponential Linear Unit
# softplus
# softsign
# relu - rectified linear unit
# tanh - hyperbolic tangent
# sigmoid
# hard_sigmoid
# linear

num_neurons_1 = len(df.columns)
num_neurons_2 = round(len(df.columns) / 2)
model = Sequential()
model.add(Dense(num_neurons_1, input_dim=x.shape[1], activation='relu'))
model.add(Dense(num_neurons_2, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Early stopping: end training once the validation loss has stopped improving
# (improvements smaller than min_delta do not count; patience is 5 epochs).
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
)

In [None]:
# Checkpoint callback: writes to best_weights.hdf5, keeping only the best model
# seen so far (save_best_only=True).
checkpointer = ModelCheckpoint(
    filepath="best_weights.hdf5",
    verbose=0,
    save_best_only=True,
)

In [None]:
# Train the network for at most 1000 epochs; the callbacks stop training early
# and checkpoint the best weights.
# NOTE(review): the test split doubles as the validation set here, so the
# early-stopping decision sees the same data the model is later scored on.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[monitor, checkpointer],
    verbose=0,
    epochs=1000,
)

In [None]:
# Restore the best weights that the checkpoint callback saved during training
model.load_weights("best_weights.hdf5")

In [None]:
# Predict the target for every row of the held-out test features
pred = model.predict(x=x_test)

In [None]:
# Calculate the RMS (root-mean-square) error of the test-set predictions
# Square root of the mean squared error between expected and predicted values.
rms = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=pred)
)
print("RMS: {}".format(rms))

In [None]:
# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot predicted values against expected values on one line chart.

    Args:
        pred: predicted values; any array-like, flattened to 1-D before plotting.
        y: expected values; flattened the same way. Must hold the same number
            of elements as `pred`.
        sort: when True (default) both series are ordered by ascending expected
            value; when False the original row order is kept.
    """
    # np.ravel accepts lists as well as (n, 1) arrays, so callers need not
    # flatten their inputs first.
    t = pd.DataFrame({'pred': np.ravel(pred), 'y': np.ravel(y)})
    if sort:
        t = t.sort_values(by=['y'])
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

In [None]:
# Chart expected vs. predicted values, ordered by ascending expected value
chart_regression(pred.flatten(), y_test, sort=True)

In [None]:
# Same chart, but with the rows left in their original test-set order
chart_regression(pred.flatten(), y_test, sort=False)