In [1]:
# models to make today:
# model using only gender
# model using gender + class
# model using gender + class + age (binning)
# model using gender + class + age + name (synthetic feature)  (optional)
# make sure to attach tensorboard and screenshot training curve
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import metrics
from tensorflow.python.data import Dataset

from IPython import display

# for sex_to_int() function
from sklearn.preprocessing import LabelEncoder

# make TensorFlow less verbose
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
def get_file():
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    return train_df, test_df

def drop_unused_cols(data_df):
    UNUSED_COLUMNS = ["Ticket", "Cabin", "Embarked", "Fare", "SibSp", "Parch", 'PassengerId']
    data_df = data_df.drop(UNUSED_COLUMNS, axis=1)
    return data_df

def fill_nan(data_df):
#     train_df.Age.fillna((train_df.Age.mean()), inplace=True)
    data_df.Age.fillna((data_df.Age.mean()), inplace=True)
    return data_df

def sex_to_int(data):
    le = LabelEncoder()
    le.fit(["male","female"])
    data["Sex"]=le.transform(data["Sex"]) 
    return data

def pclass_col(data):
    columns = ["Pclass"]
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data

def feature_eng(data_df):
    data_df = drop_unused_cols(data_df)
    data_df = fill_nan(data_df)
    data_df = sex_to_int(data_df)
    data_df = pclass_col(data_df)
    return data_df

# # bin Age
# def bin_age(dataframe, age_list):
#     dataframe['Age'] = pd.cut(dataframe['Age'], age_list)
#     return dataframe
# ages = [0, 20, 40, 60, 100]
# train_df = bin_age(train_df, ages)
# test_df = bin_age(test_df, ages)

In [3]:
def split_train_df(data_df):
    # sample 80% for train data
    train_set = data_df.sample(frac=0.8, replace=False, random_state=42)
    # the other 20% is reserved for cross validation
    cv_set = data_df.loc[ set(data_df.index) - set(train_set.index)]
    return train_set, cv_set

In [4]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    boundaries = np.arange(1.0, num_buckets) / num_buckets
    quantiles = feature_values.quantile(boundaries)
    return [quantiles[q] for q in quantiles.keys()]

def construct_feature_columns(input_features, bin_age=False):
    tmp_feature = [tf.feature_column.numeric_column(my_feature) for my_feature in input_features]
    if bin_age:
        tmp_age = tf.feature_column.numeric_column("Age")
        bucketized_age = tf.feature_column.bucketized_column(
          tmp_age, boundaries=get_quantile_based_boundaries(
            input_features["Age"], 4))
        tmp_feature += [bucketized_age]
    return (set(tmp_feature))


def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    # Convert pandas data into a dict of np arrays.
    features = {key:np.array(value) for key,value in dict(features).items()}                                           
 
    # Construct a dataset, and configure batching/repeating.
    ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)
    
    # Shuffle the data, if specified.
    if shuffle:
      ds = ds.shuffle(10000)
    
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

def create_optimizer(learning_rate):
    my_optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate)
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    return optimizer

def create_linear_regressor(feature_columns, optimizer, model_dir):
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=optimizer,
        model_dir = model_dir
    )
    return linear_regressor


# feature_columns only use in create_linear_regressor, change the feature column -> feature will change
# training data do not have to change, still can build the new model by changing the feature column
def train_model(
    learning_rate,
    steps,
    batch_size,
    periods,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets,
    bin_age,
    model_dir):
    
    steps_per_period = steps / periods
    
    feature_columns = construct_feature_columns(training_examples.copy(), bin_age)
    optimizer = create_optimizer(learning_rate)
    train_dir = model_dir + '/validation'
#     print(train_dir)
    linear_regressor = create_linear_regressor(feature_columns, optimizer, train_dir)
    
    vali_dir = model_dir + '/train'
    linear_vali_regressor = create_linear_regressor(feature_columns, optimizer, vali_dir)
    
    training_input_fn = lambda: my_input_fn(training_examples, 
                                          training_targets["Survived"], 
                                          batch_size=batch_size)
    
    training_vali_input_fn = lambda: my_input_fn(validation_examples, 
                                          validation_targets["Survived"], 
                                          batch_size=batch_size)
    
    predict_training_input_fn = lambda: my_input_fn(training_examples, 
                                                  training_targets["Survived"], 
                                                  num_epochs=1, 
                                                  shuffle=False)
    p_v_input_fn = lambda: my_input_fn(validation_examples, 
                                          validation_targets["Survived"], 
                                          num_epochs=1, 
                                          shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples, 
                                                    validation_targets["Survived"], 
                                                    num_epochs=1, 
                                                    shuffle=False)
    
    
#     training_input_fn = tf.estimator.inputs.pandas_input_fn(
#         x=train_set.drop('Survived', axis=1),
#         y=train_set.Survived,
#         num_epochs=None, # for training, use as many epochs as necessary
#         shuffle=True,
#         target_column='target',
#         batch_size=batch_size
#     )
#     predict_input_fn = tf.estimator.inputs.pandas_input_fn(
#         x=cv_set.drop('Survived', axis=1),
#         y=cv_set.Survived,
#         num_epochs=1, # only to score
#         shuffle=False,
#         batch_size=batch_size
#     )
# #     predict_validation_input_fn
    
    
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    
    for period in range (0, periods):
        # Train the model, starting from the prior state.
        linear_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period
        )
        linear_vali_regressor.train(
            input_fn=training_vali_input_fn,
            steps=steps_per_period
        )
        
        # Take a break and compute predictions.
        training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        
        v_ps = linear_vali_regressor.predict(input_fn=p_v_input_fn)
        v_ps = np.array([item['predictions'][0] for item in v_ps])
        
#         validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
#         validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
#         validation_eval = linear_regressor.evaluate(input_fn=predict_validation_input_fn)
#         print("validation_eval: {}".format(validation_eval['average_loss']))

        # Compute training and validation loss.
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_predictions, training_targets))
        
#         validation_root_mean_squared_error = math.sqrt(
#             metrics.mean_squared_error(validation_predictions, validation_targets))
        
        
        # Occasionally print the current loss.
        print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
#         validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")
    # Output a graph of loss metrics over periods.
#     plt.ylabel("RMSE")
#     plt.xlabel("Periods")
#     plt.title("Root Mean Squared Error vs. Periods")
#     plt.tight_layout()
#     plt.plot(training_rmse, label="training")
#     plt.plot(validation_rmse, label="validation")
#     plt.legend()

    return linear_regressor

In [5]:
def preprocess_features(data_df):
    processed_features = data_df.copy().drop('Survived', axis=1)
    return processed_features

def preprocess_targets(data_df):
    output_targets = pd.DataFrame()
    # Scale the target to be in units of thousands of dollars.
    output_targets["Survived"] = data_df["Survived"]
    return output_targets

def get_examples_targets(data_df):
    data_examples = preprocess_features(data_df)
    data_targets = preprocess_targets(data_df)
    return data_examples, data_targets

In [6]:
def init_model(
    learning_rate,
    steps,
    batch_size,
    periods,
    train_df,
    model_dir,
    bin_age=False):
    
#     learning_rate=0.2,
#     steps=500,
#     batch_size=100,
#     period=10,

#     train_df = get_file()
#     train_df = feature_eng(train_df)

#     test_df = feature_eng(test_df)
    train_set, cv_set = split_train_df(train_df)
    training_examples, training_targets = get_examples_targets(train_df)
    validation_examples, validation_targets = get_examples_targets(cv_set)
    linear_regressor = train_model(
        learning_rate=learning_rate,
        steps=steps,
        batch_size=batch_size,
        periods=periods,
#         feature_columns=construct_feature_columns(training_examples)
        training_examples=training_examples,
        training_targets=training_targets,
        validation_examples=validation_examples,
        validation_targets=validation_targets,
        bin_age=bin_age,
        model_dir=model_dir
    )
    return (linear_regressor)
    
# init()

In [7]:
train_df, test_df = get_file()
final_data = pd.read_csv("../input/gender_submission.csv")

In [11]:
# model 1, only gender
def get_sex_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Survived', 'Sex']]
    return ret_df

def get_t_sex_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Sex']]
    return ret_df

train_sex_df = get_sex_df(train_df)
test_sex_df = get_t_sex_df(test_df)
model_gender = init_model(
    learning_rate = 0.008,
    batch_size = 8,
    steps = 800,
    periods = 10,
    train_df = train_sex_df,
    model_dir = './tensorboard/train_vali_loss/model1_v6'
)

Training model...
RMSE (on training data):
  period 00 : 0.55
  period 01 : 0.54
  period 02 : 0.52
  period 03 : 0.52
  period 04 : 0.51
  period 05 : 0.50
  period 06 : 0.50
  period 07 : 0.49
  period 08 : 0.49
  period 09 : 0.49
Model training finished.


In [12]:
# model 2, gender & class
def get_gender_class_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Survived', 'Sex', 'Pclass_1', 'Pclass_2', 'Pclass_3']]
    return ret_df

def get_t_gender_class_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Sex', 'Pclass_1', 'Pclass_2', 'Pclass_3']]
    return ret_df

train_sex_class_df = get_gender_class_df(train_df)
test_sex_class_df = get_t_gender_class_df(test_df)
model_gender_class = init_model(
    learning_rate = 0.02,
    batch_size = 5,
    steps = 1000,
    periods = 15,
    train_df = train_sex_class_df,
    model_dir = './tensorboard/train_vali_loss/model2_v0'
)

Training model...
RMSE (on training data):
  period 00 : 0.48
  period 01 : 0.46
  period 02 : 0.44
  period 03 : 0.43
  period 04 : 0.42
  period 05 : 0.42
  period 06 : 0.41
  period 07 : 0.41
  period 08 : 0.41
  period 09 : 0.41
  period 10 : 0.40
  period 11 : 0.40
  period 12 : 0.40
  period 13 : 0.40
  period 14 : 0.40
Model training finished.


In [13]:
# model 3, gender & class & age
def get_gender_class_age_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Survived', 'Sex', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Age']]
    return ret_df

def get_t_gender_class_age_df(data_df):
    ret_df = data_df.copy()
    ret_df = feature_eng(ret_df)[['Sex', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Age']]
    return ret_df

train_sex_class_age_df = get_gender_class_age_df(train_df)
test_sex_class_age_df = get_t_gender_class_age_df(test_df)
model_gender_class_age = init_model(
    learning_rate = 0.02,
    batch_size = 5,
    steps = 1000,
    periods = 15,
    train_df = train_sex_class_age_df,
    model_dir = './tensorboard/train_vali_loss/model3_v0',
    bin_age = True
)

Training model...
RMSE (on training data):
  period 00 : 0.51
  period 01 : 0.46
  period 02 : 0.44
  period 03 : 0.44
  period 04 : 0.42
  period 05 : 0.42
  period 06 : 0.42
  period 07 : 0.45
  period 08 : 0.41
  period 09 : 0.42
  period 10 : 0.44
  period 11 : 0.41
  period 12 : 0.41
  period 13 : 0.41
  period 14 : 0.41
Model training finished.


In [None]:
# t_df = train_df.copy()
# t_df = feature_eng(t_df)
# t_df

In [None]:
# run test.csv prediction & generate csv file
def predic_fn(x):
    if x > 0.6:
        return 1
    else:
        return 0
    
def calculate_accuracy(model, model_input_fn):
    predictions = list(model.predict(input_fn=model_input_fn))
    predictions = np.array([item['predictions'][0] for item in predictions])
    predictions = list(map(predic_fn, predictions))
#     print(*predictions)
#     print(final_data['Survived'])
    
    accuracy = pd.Series(predictions == final_data['Survived'])
    accuracy = accuracy.sum() / accuracy.count()
    return accuracy

def create_test_input_fn(test_example):
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
          x=test_example,
          num_epochs=1, # only to predict
          shuffle=False 
    )
    return test_input_fn

m1_fn = create_test_input_fn(test_sex_df)
m2_fn = create_test_input_fn(test_sex_class_df)
m3_fn = create_test_input_fn(test_sex_class_age_df)

predict1 = calculate_accuracy(model_gender, m1_fn)
predict2 = calculate_accuracy(model_gender_class, m2_fn)
predict3 = calculate_accuracy(model_gender_class_age, m3_fn)
print("predict1 = {}".format(predict1))
print("predict2 = {}".format(predict2))
print("predict3 = {}".format(predict3))

In [74]:
predictions = list(model_gender_class.predict(input_fn=m2_fn))
predictions = np.array([item['predictions'][0] for item in predictions])
predictions = list(map(predic_fn, predictions))

final_accuracy = pd.Series(predictions == final_data['Survived'])
final_accuracy = final_accuracy.sum() / final_accuracy.count()
print(final_accuracy)

evaluation = test_df['PassengerId'].copy().to_frame()
evaluation["Survived"] = predictions
evaluation.to_csv("evaluation_submission.csv", index=False)
evaluation

0.8277511961722488


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0
