In [15]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# LOAD DATA

In [27]:
trainDF = pd.read_csv("./Titanic/train.csv")
testDF = pd.read_csv("./Titanic/test.csv")
feature_columns = 'Sex Age SibSp Parch Ticket Fare Cabin Embarked'.split()
trainDF.sample(5)
x_trainDF, x_validationDF, y_trainDF, y_validationDF = train_test_split(trainDF[feature_columns], trainDF['Survived'], test_size=0.15, random_state=0)

## Feature Process

In [12]:
# data process
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()

        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new


class oneHotEncoder:
    
    def __init__(self, threshold):
        self.threshold = threshold
    
    @staticmethod
    def binary_variance(p):
        return p * (1 - p)
    
    def dum_sign(self, df, col, threshold=0.01):
        dummy_col = df[col].fillna('')
        dummy_col = dummy_col.astype(str)
        p = dummy_col.value_counts() / dummy_col.shape[0]
        mask = dummy_col.isin(p[self.binary_variance(p) >= threshold].index)
        dummy_col[~mask] = np.nan
        res = pd.get_dummies(dummy_col, prefix=col, dummy_na=False, sparse = False)
        return res
    
    def one_hot_encoding(self, X, threshold):
        dfs = []
        for col in X.columns:
            if type(threshold) == float:
                t = threshold
            elif col in threshold:
                t = threshold[col]
            else:
                t = 0.0
            df = self.dum_sign(X, col, t)
            dfs.append(df)
        res = pd.concat(dfs, axis=1)
        return res
    
    def fit_transform(self, df):
        res = self.one_hot_encoding(df, self.threshold)
        self.columns = res.columns
        return res
    
    def transform(self, df):
        res = self.one_hot_encoding(df, 0.0)
        return res.reindex(columns = self.columns, fill_value=0)

In [31]:
c = oneHotEncoder(0.002)
trainX = c.fit_transform(x_trainDF)
validataionX = c.transform(x_validationDF)
trainY = y_trainDF
validataionY = y_validationDF

# model

'1.8.0'

In [41]:
#Init 
feature_size = trainX.shape[1]
class_size = 2
learning_rate = 0.1
training_epochs = 1000
batch_size = 32
display_step = 50
logs_path = "/Users/sam/workspace/code/tf-demo/tensorboard_log"

# tf Graph Input
x = tf.placeholder(tf.float32, [None, feature_size])
y = tf.placeholder(tf.float32, [None, class_size])

#Set model weights
W = tf.Variable(tf.zeros([feature_size, class_size]))
b = tf.Variable(tf.zeros([class_size]))

#construct model
pred = tf.nn.softmax(tf.matmul(x, W)+b)

#Minimize error using cross entropy
cost = tf.reduce_mean(-tf.reduce_mean(y*tf.log(pred), reduction_indices = 1))

#Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

#auc
auc_op, auc= tf.metrics.auc(y, pred)

#Initialize the variables 
# init = tf.global_variables_initializer()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.initialize_all_variables())

#tf.summary
tf.summary.scalar('cost', cost)
tf.summary.scalar('auc', auc)
merged_summary_op = tf.summary.merge_all()

In [36]:
def nextbatch(X, Y, batch, batch_size):
    batch_x = X[batch*batch_size:(batch+1)*batch_size].values
    batch_y = Y[batch*batch_size:(batch+1)*batch_size].values
    batch_y = [[i, 1-i] for i in batch_y]
    return batch_x, batch_y
    

In [43]:
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
    
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(trainX.shape[0]/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = nextbatch(trainX, trainY, i, batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, auc_res, summary = sess.run([optimizer, cost, auc, merged_summary_op], feed_dict={x: batch_xs, y: batch_ys})
            summary_writer.add_summary(summary, epoch * total_batch + i)
            
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            validataion_xs, validataion_ys = nextbatch(validataionX, validataionY, 0, validataionX.shape[0])
            auc_test = sess.run([auc], feed_dict={x: validataion_xs, y: validataion_ys})
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost), auc_res, auc_test)
            
    print("Optimization Finished!")

Epoch: 0050 cost= 0.208158092 0.87405694 [0.8741463]
Epoch: 0100 cost= 0.191077776 0.8907536 [0.890775]
Epoch: 0150 cost= 0.179655975 0.901127 [0.90112793]
Epoch: 0200 cost= 0.171356999 0.9085742 [0.9085672]
Epoch: 0250 cost= 0.164963420 0.9142791 [0.9142691]
Epoch: 0300 cost= 0.159830507 0.9188223 [0.9188109]
Epoch: 0350 cost= 0.155587102 0.92256695 [0.92255473]
Epoch: 0400 cost= 0.152002643 0.9257055 [0.9256931]
Epoch: 0450 cost= 0.148924483 0.92838305 [0.92837065]
Epoch: 0500 cost= 0.146246232 0.93071616 [0.9307039]
Epoch: 0550 cost= 0.143890694 0.9327563 [0.9327443]
Epoch: 0600 cost= 0.141800078 0.9345597 [0.9345479]
Epoch: 0650 cost= 0.139930029 0.9361751 [0.9361638]
Epoch: 0700 cost= 0.138245808 0.9376215 [0.93761057]
Epoch: 0750 cost= 0.136719774 0.9389407 [0.93893015]
Epoch: 0800 cost= 0.135329603 0.9401386 [0.9401282]
Epoch: 0850 cost= 0.134057082 0.9412407 [0.9412308]
Epoch: 0900 cost= 0.132887156 0.94225407 [0.9422445]
Epoch: 0950 cost= 0.131807299 0.94319355 [0.9431843]
Epo

[[0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 