In [33]:
# models to make today:
# model using only gender
# model using gender + class
# model using gender + class + age (binning)
# model using gender + class + age + name (synthetic feature)  (optional)
# make sure to attach tensorboard and screenshot training curve
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import metrics
from tensorflow.python.data import Dataset
from sklearn.model_selection import train_test_split

from IPython import display

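# NOTE: LabelEncoder was imported for sex_to_int(), but that method maps values with a plain dict, so this import is currently unused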
from sklearn.preprocessing import LabelEncoder

# Make TensorFlow less verbose: only messages at ERROR level are logged.
tf.logging.set_verbosity(tf.logging.ERROR)

# Relative locations of the input CSV files. train_path is passed to get_data
# in a later cell; test_path is only referenced from a commented-out line in
# the cells shown here.
train_path = '../input/train.csv'
test_path = '../input/test.csv'

In [34]:
class get_data:
    """Load a passenger CSV and turn it into model-ready features.

    Constructing an instance runs the whole pipeline:

    1. read the CSV at ``path``;
    2. for training data, move the ``Survived`` column into ``self.label``;
    3. encode ``Sex`` as an integer (male -> 1, female -> 0);
    4. keep only the columns listed in ``model_lst`` and pass them through
       ``pd.get_dummies(..., dummy_na=True)``;
    5. for training data, create an 80/20 train/validation split.

    Args:
        path: location of the CSV file to read.
        model_lst: names of the columns the model should use.
        is_train: True when the file contains the ``Survived`` label. When
            False, no label is extracted and no split is created.

    Attributes:
        feature: the prepared feature DataFrame.
        label: the ``Survived`` column (training data only).
        ts_f, vs_f: training / validation features (training data only).
        ts_l, vs_l: training / validation labels (training data only).
    """

    # Integer codes used by sex_to_int().
    GENDER_CODES = {'male': 1, 'female': 0}

    # Columns discarded by drop_cols(); the list comes from the previously
    # commented-out implementation of that method.
    UNUSED_COLUMNS = ["Ticket", "Cabin", "Embarked", "Fare", "SibSp", "Parch", 'PassengerId']

    def __init__(self, path, model_lst, is_train=True):
        self.model_lst = model_lst
        self.is_train = is_train

        self.feature = pd.read_csv(path)

        self.get_feature_label()
        self.deal_feature()

        self.get_model()
        self.split_t_v()

    def get_model(self):
        """Restrict the features to ``model_lst`` and one-hot encode them."""
        self.feature = self.feature[self.model_lst]
        self.get_dum()

    def get_dum(self):
        """Apply ``pd.get_dummies`` (with a NaN indicator) to the features."""
        self.feature = pd.get_dummies(self.feature, dummy_na=True)

    def deal_feature(self):
        """Per-column preprocessing run by the constructor."""
        self.sex_to_int()

    def feature_eng(self):
        """Optional extra cleaning: fill NaNs, drop unused columns, encode Sex.

        Not called by the constructor. It used to fail with AttributeError
        because fill_nan() and drop_cols() only existed as commented-out code.
        """
        self.fill_nan()
        self.drop_cols()
        self.sex_to_int()

    def get_feature_label(self):
        """Split ``Survived`` off into ``self.label`` (training data only)."""
        if self.is_train:
            self.label = self.feature['Survived']
            self.feature = self.feature.drop('Survived', axis=1)

    def fill_nan(self):
        """Replace every missing value in the features with 0."""
        self.feature = self.feature.fillna(0)

    def drop_cols(self):
        """Drop the columns in UNUSED_COLUMNS that are still present."""
        # errors='ignore' keeps this usable after get_model() has already
        # narrowed the frame down to model_lst.
        self.feature = self.feature.drop(columns=self.UNUSED_COLUMNS, errors='ignore')

    def sex_to_int(self):
        """Encode the ``Sex`` column as integers using GENDER_CODES.

        A column that is already numeric is left untouched so the method can
        safely run more than once (deal_feature() and feature_eng() both call it).

        Raises:
            KeyError: if the column holds a value without a code (this
                includes missing values, as in the original implementation).
        """
        sex = self.feature['Sex']
        if pd.api.types.is_numeric_dtype(sex):
            return

        encoded = sex.map(self.GENDER_CODES)
        unmapped = encoded.isna()
        if unmapped.any():
            bad_values = sorted({repr(value) for value in sex[unmapped].unique()})
            raise KeyError("Unexpected value(s) in 'Sex': " + ', '.join(bad_values))
        self.feature['Sex'] = encoded

    def split_t_v(self):
        """Create the 80/20 train/validation split (training data only)."""
        # Unlabelled (test) data has no self.label to split on; skipping the
        # split lets get_data(path, cols, is_train=False) load successfully.
        if not self.is_train:
            return
        self.ts_f, self.vs_f, self.ts_l, self.vs_l = \
        train_test_split(
            self.feature,
            self.label,
            test_size=0.2,
            random_state=42
        )

In [75]:
class create_model:
    """Wrapper around ``tf.estimator.LinearClassifier`` that trains in periods.

    The constructor builds the optimizer and the estimator. ``train`` then
    alternates short training runs with an evaluation on the validation split
    and prints the validation metrics after every period.

    Args:
        feature: iterable of feature names (iterating a DataFrame yields its
            column names); each name becomes a numeric feature column.
        learning_rate: learning rate handed to the Adam optimizer.
        steps: total number of training steps requested across all periods.
        periods: number of train/evaluate rounds run by ``train``.
        batch_size: batch size used by the training input function.
        save_model: when True, ``save_model_dir`` is passed to the estimator
            as ``model_dir``.
        load_model: when True, ``load_model_dir`` is passed to the estimator
            as ``warm_start_from``.
        save_model_dir: directory used when ``save_model`` is True.
        load_model_dir: directory used when ``load_model`` is True.
    """

    def __init__(
        self,
        feature,
        learning_rate=0.3,
        steps=10,
        periods=30,
        batch_size=100,
        save_model=True,
        load_model=False,
        save_model_dir='./default_model',
        load_model_dir='./default_model'
    ):
        self.feature = feature
        self.learning_rate = learning_rate
        self.steps = steps
        self.periods = periods
        self.batch_size = batch_size

        self.save_model = save_model
        self.save_model_dir = save_model_dir
        self.load_model = load_model
        self.load_model_dir = load_model_dir

        self.create_optimizer()

        # Collect the estimator arguments in a dict so the optional
        # directories are only passed when requested.
        model_par = dict(
            feature_columns = self.get_feature_cols(),
            optimizer = self.optimizer,
            n_classes = 2
        )
        if self.save_model:
            model_par['model_dir'] = self.save_model_dir
        if self.load_model:
            model_par['warm_start_from'] = self.load_model_dir
        self.model = tf.estimator.LinearClassifier(**model_par)

        print('create model!')

    def get_feature_cols(self):
        """Return a set with one numeric feature column per feature name."""
        # TODO: optional bucketized "Age" column (was sketched in commented-out code).
        tmp_feature = [tf.feature_column.numeric_column(my_feature) for my_feature in self.feature]
        return set(tmp_feature)

    def create_optimizer(self):
        """Create the Adam optimizer used by the estimator."""
        # An earlier alternative (kept only as a note) used
        # GradientDescentOptimizer wrapped in clip_gradients_by_norm(..., 2.0).
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

    def train(
        self,
        ts_f,
        ts_l,
        vs_f,
        vs_l
    ):
        """Train for ``periods`` rounds, printing validation metrics after each.

        Args:
            ts_f: training features.
            ts_l: training labels.
            vs_f: validation features.
            vs_l: validation labels.
        """
        self.ts_f = ts_f
        self.ts_l = ts_l
        self.vs_f = vs_f
        self.vs_l = vs_l
        self.create_input_fn()

        # The estimator expects an integer step count; the original passed
        # the float quotient (10 / 30 = 0.33 for the defaults). Rounding up
        # gives one step per period for the defaults, matching the
        # global_step increments in the logged runs, and never trains for
        # fewer steps in total than requested.
        steps_per_period = int(math.ceil(self.steps / self.periods))
        print('start training...')
        for period in range(self.periods):
            self.model.train(
                input_fn=self.train_input_fn,
                steps=steps_per_period
            )
            evaluate_vali = self.model.evaluate(
                input_fn=self.vali_eval_fn
            )

            print(evaluate_vali)
            print('\n')

        print("Model training finished.")

    def create_input_fn(self):
        """Build the input functions for training and for evaluation."""
        self.train_input_fn = lambda: self.my_input_fn(self.ts_f, self.ts_l, batch_size=self.batch_size)
        # Evaluation makes a single pass; shuffling would only add work
        # without changing the aggregated metrics, so it is disabled.
        self.train_eval_fn = lambda: self.my_input_fn(self.ts_f, self.ts_l, shuffle=False, num_epochs=1)
        self.vali_eval_fn = lambda: self.my_input_fn(self.vs_f, self.vs_l, shuffle=False, num_epochs=1)

    def my_input_fn(self, features, targets, batch_size=1, shuffle=True, num_epochs=None):
        """Feed (features, labels) batches to the estimator.

        Args:
            features: DataFrame (or mapping) of feature columns.
            targets: labels aligned with ``features``.
            batch_size: number of examples per batch.
            shuffle: whether to shuffle the examples.
            num_epochs: number of passes over the data; None repeats forever.

        Returns:
            Tuple ``(features, labels)`` holding the next batch.
        """
        # Convert pandas data into a dict of np arrays.
        features = {key:np.array(value) for key,value in dict(features).items()}
        # Construct a dataset from the in-memory arrays.
        ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit
        # Shuffle individual examples BEFORE batching. The original shuffled
        # after batch() with a buffer of 42, which only reordered whole
        # batches. A buffer covering the whole dataset gives a full shuffle.
        if shuffle:
            ds = ds.shuffle(buffer_size=max(1, len(targets)))
        ds = ds.batch(batch_size).repeat(num_epochs)
        # Return the next batch of data.
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels

    def predict(self, test_df):
        """Placeholder: prediction is not implemented yet; only prints a marker."""
        print('predict')

In [76]:
# Column subsets for the planned models (see the to-do list in the first cell).
model_gender = ['Sex']
model_gender_class = ['Sex', 'Pclass']
model_gender_class_age = ['Sex', 'Pclass', 'Age']
model_gender_class_age_name = ['Sex', 'Pclass', 'Age', 'Name']

# Load the training CSV and prepare the gender-only features; get_data also
# creates the train/validation split (ts_f / vs_f / ts_l / vs_l).
data = get_data(train_path, model_gender)
# Disabled: get_data requires a model_lst argument, so this call would need one.
# test = get_data(test_path)
# Disabled: get_model() takes no arguments; the column list is given to the constructor.
# data.get_model(model_gender)
# Show the first rows of the training-split features.
data.ts_f.head(5)
# Ad-hoc sanity checks, left disabled:
# data.label.head(5)
# print(data.feature['Sex'].isnull().count())
# print(data.feature['Pclass'].isnull().any())
# print(data.feature['Age'].isna().sum())
# print(data.feature['Name'].isnull().any())

Unnamed: 0,Sex
331,1
733,1
382,1
704,1
813,0


In [109]:
# Build the classifier from the prepared feature columns. save_model defaults
# to True, so save_model_dir is handed to the estimator as its model_dir.
# NOTE(review): the logged global_step values above 30 suggest this directory
# was reused across runs - confirm whether continuing from it is intended.
model = create_model(
    feature=data.feature,
    learning_rate=0.08,
#     save_model=False
    save_model_dir='./model/test_v10'
)

create model!


In [110]:
# Report the non-null count and the shape of each split, in the order:
# train features, train labels, validation features, validation labels.
for _split in (data.ts_f, data.ts_l, data.vs_f, data.vs_l):
    print(_split.count(), '\n')
    print('shape', _split.shape, '\n')


Sex    712
dtype: int64 

shape (712, 1) 

712 

shape (712,) 

Sex    179
dtype: int64 

shape (179, 1) 

179 

shape (179,) 



In [111]:
# Train on the training split and report validation metrics after each period.
model.train(
    ts_f=data.ts_f,
    ts_l=data.ts_l,
    vs_f=data.vs_f,
    vs_l=data.vs_l,
)

start training...
{'accuracy': 0.5865922, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.67374355, 'label/mean': 0.41340783, 'loss': 0.67374355, 'precision': 0.0, 'prediction/mean': 0.46776608, 'recall': 0.0, 'global_step': 1}


{'accuracy': 0.5865922, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.6616638, 'label/mean': 0.41340783, 'loss': 0.6616638, 'precision': 0.0, 'prediction/mean': 0.44262078, 'recall': 0.0, 'global_step': 2}


{'accuracy': 0.5865922, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.6518445, 'label/mean': 0.41340783, 'loss': 0.6518445, 'precision': 0.0, 'prediction/mean': 0.41630298, 'recall': 0.0, 'global_step': 3}


{'accuracy': 0.5865922, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.6443687, 'label/mean': 0.41340783, 'loss': 0.6443687, 

{'accuracy': 0.7821229, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.5434092, 'label/mean': 0.41340783, 'loss': 0.5434092, 'precision': 0.7536232, 'prediction/mean': 0.40599018, 'recall': 0.7027027, 'global_step': 32}


{'accuracy': 0.7821229, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.542071, 'label/mean': 0.41340783, 'loss': 0.542071, 'precision': 0.7536232, 'prediction/mean': 0.40645117, 'recall': 0.7027027, 'global_step': 33}


{'accuracy': 0.7821229, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.540905, 'label/mean': 0.41340783, 'loss': 0.540905, 'precision': 0.7536232, 'prediction/mean': 0.40631974, 'recall': 0.7027027, 'global_step': 34}


{'accuracy': 0.7821229, 'accuracy_baseline': 0.5865922, 'auc': 0.77039903, 'auc_precision_recall': 0.7896155, 'average_loss': 0.5398918, 'label/mean': 0.41340783, 'los