In [1]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras import regularizers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
 

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace a categorical column with one indicator column per category.

    For every distinct value ``v`` in ``df[name]`` a column labelled
    ``"<name>-<v>"`` is appended to ``df``; the original column is then
    removed.  ``df`` is modified in place and nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for category, flags in indicators.items():
        df["{}-{}".format(name, category)] = flags
    df.drop(labels=[name], axis=1, inplace=True)

    
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise ``df[name]`` in place as ``(value - mean) / sd``.

    Args:
        df: DataFrame holding the column; it is modified in place.
        name: Label of the numeric column to standardise.
        mean: Centre to subtract.  Defaults to the column's own mean.  Pass
            the training-set mean when transforming held-out data.
        sd: Scale to divide by.  Defaults to the column's own sample standard
            deviation (``Series.std``).  Pass the training-set value when
            transforming held-out data.

    Returns:
        None.
    """
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    # A constant column has zero spread; dividing by it would fill the column
    # with NaN (0/0) or inf.  Treat zero spread as 1 so that the column simply
    # becomes (value - mean), i.e. 0.0 for a constant column.
    if sd == 0:
        sd = 1.0

    df[name] = (df[name] - mean) / sd
    
    
# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot expected values and predictions as two lines on one chart.

    Args:
        pred: Predicted values.  Any array-like is accepted (list, Series,
            ``(n,)`` or ``(n, 1)`` array or matrix); it is flattened to one
            dimension.
        y: Expected values with the same number of elements as ``pred``;
            flattened the same way.
        sort: When true (the default), order the points by expected value
            before plotting.

    Returns:
        None.  The chart is displayed with ``plt.show()``.
    """
    # np.asarray(...).ravel() yields a true 1-D array for lists, Series,
    # column vectors and np.matrix alike; ``y.flatten()`` alone fails on
    # lists/Series and stays 2-D for np.matrix.
    frame = pd.DataFrame({
        'pred': np.asarray(pred).ravel(),
        'y': np.asarray(y).ravel(),
    })
    if sort:
        frame.sort_values(by=['y'], inplace=True)
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()


# Locations of the training/test CSVs and of the submission file that will
# receive the final predictions.
path = "./data/"

filename_train, filename_test, filename_submit = (
    os.path.join(path, leaf)
    for leaf in ('train.csv', 'test.csv', 'SampleResults3.csv')
)

# 'NA' and '?' cells are parsed as missing values.
df_train = pd.read_csv(filename_train, na_values=['NA', '?'])


# Build the per-row 'type_count' and 'unit_pop' features of df_train.
# First standardise the five per-type count columns and one-hot encode 'type'.
for count_col in ('0_type_count', '1_type_count', '2_type_count',
                  '3_type_count', '4_type_count'):
    encode_numeric_zscore(df_train, count_col)
encode_text_dummy(df_train, 'type')

headers = list(df_train.columns.values)
# NOTE(review): these slices are positional -- presumably 17:22 are the
# 'type-*' indicator columns appended above and 3:8 the '*_type_count'
# columns; verify against the CSV layout.
store_type = df_train[headers[17:22]]
store_count = df_train[headers[3:8]]

# Multiply position by position on plain arrays and sum each row, which keeps
# the count belonging to the row's own type.  np.multiply on the two
# DataFrames is label-aligned by current pandas (the labels differ, so the
# result would be all NaN), and np.mat no longer exists in NumPy 2.0.
type_count = (np.asarray(store_type, dtype=float)
              * np.asarray(store_count, dtype=float)).sum(axis=1)
df_train.insert(0, 'type_count', type_count)

# Computed from the raw columns: 'population' and 'sqmiles' have not been
# standardised yet at this point.
df_train.insert(1, 'unit_pop', df_train['population'] / df_train['sqmiles'])

# Drop identifiers, the raw per-type counts and the unused features.
df_train.drop(['id', 'zip', 'type_name', '0_type_count', '1_type_count',
               '2_type_count', '3_type_count', '4_type_count'],
              axis=1, inplace=True)
df_train.drop(['lot_size', 'pets'], axis=1, inplace=True)



# Standardise the remaining numeric feature columns of df_train in place,
# each with its own mean and standard deviation.
for feature in ('age', 'sqft', 'income', 'population',
                'sqmiles', 'unit_pop', 'urban'):
    encode_numeric_zscore(df_train, feature)


# Shuffle the rows reproducibly (fixed seed), then renumber them from 0.
np.random.seed(42)
shuffled_index = np.random.permutation(df_train.index)
df_train = df_train.reindex(shuffled_index).reset_index(drop=True)


# Build the feature matrix x (n_rows, 13) and the target column y (n_rows, 1).
# Plain ndarrays are used: np.mat was removed in NumPy 2.0 and NumPy
# discourages the np.matrix class it produced.
headers = list(df_train.columns.values)
# NOTE(review): column 8 is skipped -- presumably that is 'sales', the
# target; verify against the column order of df_train.
x = np.hstack((
    np.asarray(df_train[headers[0:8]], dtype=float),
    np.asarray(df_train[headers[9:14]], dtype=float),
))
y = np.asarray(df_train['sales']).reshape(-1, 1)


# 5-fold cross-validation: fit a fresh network on each training split and
# collect its predictions for the held-out split.
kf = KFold(5)

oos_y, oos_pred = [], []
fold = 1
for train, test in kf.split(x):
    print("Fold #{}".format(fold))

    x_train, y_train = x[train], y[train]
    x_test, y_test = x[test], y[test]

    # Three ReLU hidden layers (50, 25, 10 units) feeding one linear output.
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='relu'))  # Hidden 1
    for units in (25, 10):  # Hidden 2 and 3
        model.add(Dense(units, activation='relu'))
    model.add(Dense(1))  # Output
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Stop after 50 epochs without a val_loss improvement of at least 1e-3.
    # NOTE(review): the held-out fold doubles as the early-stopping
    # validation set, so the fold scores below are likely optimistic.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=50,
                            verbose=1, mode='auto')
    model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              callbacks=[monitor],
              verbose=0,
              epochs=1000)

    pred = model.predict(x_test)
    oos_y.append(y_test)
    oos_pred.append(pred)

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred, y_test))
    print("Fold score (RMSE): {}".format(score))
    fold += 1

# Stack the per-fold results and compute the overall out-of-sample RMSE.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred, oos_y))
print("Final, out of sample score (RMSE): {}".format(score))


# Score the test file with the fitted network and write the submission CSV.
df_test = pd.read_csv(filename_test, na_values=['NA', '?'])
# Kept for the submission file (named test_ids so the builtin id() is not
# shadowed).
test_ids = df_test['id']

# The network was fitted on features standardised with the TRAINING mean and
# standard deviation, so the test features must be scaled with those same
# statistics rather than with the test set's own.  df_train has already been
# transformed in place, hence the raw training file is read again here.
train_raw = pd.read_csv(filename_train, na_values=['NA', '?'])
train_raw['unit_pop'] = train_raw['population'] / train_raw['sqmiles']

count_columns = ['0_type_count', '1_type_count', '2_type_count',
                 '3_type_count', '4_type_count']
feature_columns = ['age', 'sqft', 'income', 'population',
                   'sqmiles', 'unit_pop', 'urban']
train_mean = train_raw[count_columns + feature_columns].mean()
train_std = train_raw[count_columns + feature_columns].std()

# Standardise the per-type counts, then one-hot encode 'type'.
for column in count_columns:
    encode_numeric_zscore(df_test, column,
                          mean=train_mean[column], sd=train_std[column])
encode_text_dummy(df_test, 'type')

headers = list(df_test.columns.values)
# NOTE(review): positional slices -- presumably 16:21 are the 'type-*'
# indicator columns and 3:8 the '*_type_count' columns (one position lower
# than for the training frame, presumably because the test file has no
# 'sales' column); verify against the CSV layout.
store_type = df_test[headers[16:21]]
store_count = df_test[headers[3:8]]

# Positional product on plain arrays, summed per row.  np.multiply on the two
# DataFrames is label-aligned by current pandas (differing labels -> all
# NaN), and np.mat no longer exists in NumPy 2.0.
type_count = (np.asarray(store_type, dtype=float)
              * np.asarray(store_count, dtype=float)).sum(axis=1)
df_test.insert(0, 'type_count', type_count)

# Computed from the raw columns, before they are standardised below.
df_test.insert(1, 'unit_pop', df_test['population'] / df_test['sqmiles'])

# Drop identifiers, the raw per-type counts and the unused features.
df_test.drop(['id', 'zip', 'type_name', '0_type_count', '1_type_count',
              '2_type_count', '3_type_count', '4_type_count'],
             axis=1, inplace=True)
df_test.drop(['lot_size', 'pets'], axis=1, inplace=True)

# Standardise the remaining numeric features with the training statistics.
for column in feature_columns:
    encode_numeric_zscore(df_test, column,
                          mean=train_mean[column], sd=train_std[column])

# Create x for testing.  An explicit float ndarray avoids an object-dtype
# result when the indicator columns are boolean.
x = np.asarray(df_test, dtype=float)

# Result
# NOTE(review): 'model' is whatever the last cross-validation fold left
# behind, i.e. it was fitted on only part of the training rows -- confirm
# this is intended rather than refitting on all of them.
finalpred = model.predict(x)
finalpred = pd.DataFrame(finalpred, columns=['sales'])
submit = pd.concat([test_ids, finalpred], axis=1)
submit.to_csv(filename_submit, index=False)


Using TensorFlow backend.


Fold #1
Epoch 00122: early stopping
Fold score (RMSE): 0.2511269275466326
Fold #2
Epoch 00133: early stopping
Fold score (RMSE): 0.23419894841340022
Fold #3
Epoch 00124: early stopping
Fold score (RMSE): 0.24292102031053844
Fold #4
Epoch 00097: early stopping
Fold score (RMSE): 0.2371295632844142
Fold #5
Epoch 00176: early stopping
Fold score (RMSE): 0.2289230075801136
Final, out of sample score (RMSE): 0.23898145386981348
