In [1]:
# Loading Data

import pandas as pd
import numpy as np

PATH="~/tensorflow_projekte/titanic/"

# Read both Kaggle CSVs the same way; index_col=0 turns the first CSV column
# (PassengerId) into the index of each frame.
train_data, test_data = [
    pd.read_csv(PATH + "data/" + file_name, index_col=0)
    for file_name in ("train.csv", "test.csv")
]

In [2]:
# Dataset and variables
#
# Variable   Definition                                    Key
# survival   Survival                                      0 = No, 1 = Yes
# pclass     Ticket class                                  1 = 1st, 2 = 2nd, 3 = 3rd
# sex        Sex
# Age        Age in years
# sibsp      # of siblings / spouses aboard the Titanic
# parch      # of parents / children aboard the Titanic
# ticket     Ticket number
# fare       Passenger fare
# cabin      Cabin number
# embarked   Port of Embarkation                           C = Cherbourg, Q = Queenstown, S = Southampton

# Show only a preview instead of dumping the whole frame into the notebook.
train_data.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Extract Title from Name

import re

def _add_title_columns(frame):
    """Add the raw title ("titel") and its grouped form ("titel_r") to ``frame`` in place.

    "titel" is the text between the first ", " in "Name" and the following
    space, e.g. "Mr." for "Braund, Mr. Owen Harris". "titel_r" keeps the four
    titles Master/Miss/Mr/Mrs (without the trailing dot) and maps every other
    value, including a name with no match, to "Else".
    """
    grouped_titles = {"Master.": "Master", "Miss.": "Miss", "Mr.": "Mr", "Mrs.": "Mrs"}
    frame["titel"] = frame["Name"].str.extract(r', (.+?) ', expand=False)
    # Titles missing from the mapping come back as NaN and fall into "Else".
    frame["titel_r"] = frame["titel"].map(grouped_titles).fillna("Else")


# One helper for both frames so the train and test features cannot drift apart.
_add_title_columns(train_data)
_add_title_columns(test_data)


In [4]:
#Encoding of Categorical Variables

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Deck letter = first character of the cabin string (NaN when Cabin is missing).
train_data["Cabin_Code"]=train_data["Cabin"].str[0]
test_data["Cabin_Code"]=test_data["Cabin"].str[0]

# Fit each encoding on the labels of BOTH frames, then transform each frame.
# Fitting train and test separately numbers the labels by each frame's own
# sorted label set, so a label that occurs in only one frame shifts the codes
# and the same category ends up with different integers in train and test.
# astype(str) turns missing values into the label "nan" so they get a code too.
for source_col, encoded_col in [
    ("Sex", "Sex_r"),
    ("Embarked", "Embarked_r"),
    ("Cabin_Code", "Cabin_Code_r"),
    ("titel_r", "titel_rr"),
]:
    train_labels = train_data[source_col].astype(str)
    test_labels = test_data[source_col].astype(str)
    le.fit(pd.concat([train_labels, test_labels]))
    train_data[encoded_col] = le.transform(train_labels)
    test_data[encoded_col] = le.transform(test_labels)
                                          

In [5]:
# Replace Missing Values by mean

# Impute with the TRAINING mean of each column: the fill value is part of the
# fitted preprocessing, so the test frame must reuse it rather than its own mean.
for x in ["Pclass","Sex_r","Age","SibSp","Parch","Fare", "titel_rr"]:
    train_mean = train_data[x].mean()
    # Assign the filled column back; `frame[x].fillna(..., inplace=True)` works
    # on an intermediate object and is not guaranteed to update the frame.
    train_data[x] = train_data[x].fillna(train_mean)
    test_data[x] = test_data[x].fillna(train_mean)

In [6]:
# Add relatives (sum of SibSp and Parch): family members travelling along.

train_data["relatives"] = train_data["SibSp"] + train_data["Parch"]
test_data["relatives"] = test_data["SibSp"] + test_data["Parch"]

In [7]:
## Final Dataset

# Keep only the model inputs. The test features are selected first, and the
# training frame reuses that exact column list with the target in front.
test_data = test_data[
    ["Pclass","Sex_r","Age","SibSp","Parch","Fare", "Embarked_r","Cabin_Code_r", "titel_rr", "relatives"]
].copy()
train_data = train_data[["Survived", *test_data.columns]].copy()

In [8]:
# Train - Test Split of training-data

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; "Survived" (the first column) is the
# target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop("Survived", axis=1),
    train_data["Survived"],
    test_size=0.20,
    random_state=42,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest, scored on the hold-out split.
clf_rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    min_samples_split=4,
    random_state=0,
).fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred))
#print(clf_rf.feature_importances_)

             precision    recall  f1-score   support

          0       0.85      0.88      0.86       105
          1       0.82      0.78      0.80        74

avg / total       0.84      0.84      0.84       179



In [10]:
# Grid Search for the random forest (disabled): the code below is wrapped in a
# string literal, so it is not executed and the cell only echoes the text.
# Remove the surrounding triple quotes to actually run the search over clf_rf.
"""
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'max_depth':[10,15,20,25,30], 'n_estimators':[10,50,80,100,200,400], 'min_samples_split':[2,4,6]}
 ]

grid_search = GridSearchCV(clf_rf, param_grid=param_grid)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_ )
"""

"\nfrom sklearn.model_selection import GridSearchCV\n\nparam_grid = [\n  {'max_depth':[10,15,20,25,30], 'n_estimators':[10,50,80,100,200,400], 'min_samples_split':[2,4,6]}\n ]\n\ngrid_search = GridSearchCV(clf_rf, param_grid=param_grid)\n\ngrid_search.fit(X_train, y_train)\nprint(grid_search.best_params_ )\n"

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting, scored on the same hold-out split.
clf_gb = GradientBoostingClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=4,
    random_state=0,
).fit(X_train, y_train)
y_pred = clf_gb.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.82      0.85      0.83       105
          1       0.77      0.73      0.75        74

avg / total       0.80      0.80      0.80       179



In [12]:
# Grid Search for gradient boosting (disabled): the code below is wrapped in a
# string literal, so it is not executed and the cell only echoes the text.
# Remove the surrounding triple quotes to actually run the search over clf_gb.
"""
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'max_depth':[10,15,20,25,30], 'n_estimators':[10,50,80,100,200,400], 'min_samples_split':[2,4,6]}
 ]

grid_search = GridSearchCV(clf_gb, param_grid=param_grid)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_ )
"""

"\nfrom sklearn.model_selection import GridSearchCV\n\nparam_grid = [\n  {'max_depth':[10,15,20,25,30], 'n_estimators':[10,50,80,100,200,400], 'min_samples_split':[2,4,6]}\n ]\n\ngrid_search = GridSearchCV(clf_gb, param_grid=param_grid)\n\ngrid_search.fit(X_train, y_train)\nprint(grid_search.best_params_ )\n"

In [13]:
# Support vector machine (linear kernel), scored on the hold-out split.
from sklearn import svm
clf_svm = svm.SVC(C=0.8, kernel='linear').fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.80      0.84      0.82       105
          1       0.75      0.70      0.73        74

avg / total       0.78      0.78      0.78       179



In [14]:
# Grid Search for the SVM (disabled): the code below is wrapped in a string
# literal, so it is not executed and the cell only echoes the text.
# Remove the surrounding triple quotes to actually run the search over clf_svm.
"""
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'kernel':['linear',  'rbf'], 'C':[0.8,0.9,1,1.1,1.2]}
 ]

grid_search = GridSearchCV(clf_svm, param_grid=param_grid)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_ )
"""

"\nfrom sklearn.model_selection import GridSearchCV\n\nparam_grid = [\n  {'kernel':['linear',  'rbf'], 'C':[0.8,0.9,1,1.1,1.2]}\n ]\n\ngrid_search = GridSearchCV(clf_svm, param_grid=param_grid)\n\ngrid_search.fit(X_train, y_train)\nprint(grid_search.best_params_ )\n"

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults, scored on the hold-out split.
clf_lr = LogisticRegression().fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.80      0.85      0.82       105
          1       0.76      0.70      0.73        74

avg / total       0.79      0.79      0.79       179



In [16]:
# Voting Classifier
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of the three classifiers; the random forest's
# probabilities count twice as much as those of the other two members.
votcl = VotingClassifier(
    estimators=[('lr', clf_lr), ('rf', clf_rf), ('gb', clf_gb)],
    voting='soft',
    weights=[1,2,1],
).fit(X_train, y_train)
y_pred = votcl.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.84      0.88      0.86       105
          1       0.81      0.76      0.78        74

avg / total       0.83      0.83      0.83       179



  if diff:


In [17]:
# Predict survival for the unlabelled test passengers with the fitted voting ensemble.
test_data_pred=votcl.predict(test_data)

  if diff:


In [18]:
# Submission file: one row per test passenger (PassengerId, Survived); the
# frame's own row index is not written.
pd.DataFrame(
    {'PassengerId': test_data.index, 'Survived': test_data_pred}
).to_csv(PATH+"data/submit.csv", index=False)

In [None]:
### Now the same in TensorFlow

In [22]:
import tensorflow as tf
# Feature columns describe how to use the input: one numeric column per
# feature of the training frame.
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in X_train.columns
]



In [46]:
# Display the numeric feature columns built from X_train's column names.
my_feature_columns

[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Sex_r', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SibSp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Embarked_r', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Cabin_Code_r', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='titel_rr', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='relatives', shape=(1,), default_value=None, dtype=tf.float32, norm

In [125]:
    # Build a DNN classifier with four hidden layers of 100 units each.
    classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # Four hidden layers of 100 nodes each.
        hidden_units=[100, 100,100,100],
        # Binary target: the model chooses between 2 classes (Survived = 0 or 1).
        n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmppu6msjw1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f20ab2245f8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [126]:
def train_input_fn(features, labels, batch_size):
    """Build the training dataset: shuffled, endlessly repeated, batched.

    features: anything accepted by dict(), mapping feature name -> values.
    labels: the targets, sliced along with the features.
    batch_size: number of examples per batch.
    """
    feature_dict = dict(features)
    dataset = tf.data.Dataset.from_tensor_slices((feature_dict, labels))

    # Shuffle with a 1000-element buffer, repeat without end, then batch.
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    return dataset.batch(batch_size)


In [133]:
    # Train the model: 2000 steps on shuffled batches of 100 training rows.
    classifier.train(
        input_fn=lambda: train_input_fn(
            features=X_train, labels=y_train.values, batch_size=100),
        steps=2000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmppu6msjw1/model.ckpt-800
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 801 into /tmp/tmppu6msjw1/model.ckpt.
INFO:tensorflow:loss = 33.715298, step = 801
INFO:tensorflow:global_step/sec: 285.526
INFO:tensorflow:loss = 31.422068, step = 901 (0.352 sec)
INFO:tensorflow:global_step/sec: 349.601
INFO:tensorflow:loss = 32.366127, step = 1001 (0.288 sec)
INFO:tensorflow:global_step/sec: 348.328
INFO:tensorflow:loss = 32.736324, step = 1101 (0.287 sec)
INFO:tensorflow:global_step/sec: 348.155
INFO:tensorflow:loss = 24.612574, step = 1201 (0.285 sec)
INFO:tensorflow:global_step/sec: 354.924
INFO:tensorflow:loss = 32.24139, step = 1301 (0.282 sec)
INFO:tensorflow:global_step/sec: 341.334
INFO:tensorflow:loss = 34.39115, 

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f20c92947b8>

In [134]:

def eval_input_fn(features, labels, batch_size):
    """Build a batched dataset for evaluation or prediction.

    features: anything accepted by dict(), mapping feature name -> values.
    labels: the targets, or None to build a features-only dataset (prediction).
    batch_size: number of examples per batch; must not be None.
    """
    feature_dict = dict(features)

    # Without labels the dataset yields features only; otherwise (features, labels).
    inputs = feature_dict if labels is None else (feature_dict, labels)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # No shuffling or repeating here: a single ordered, batched pass.
    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)

In [135]:
    # Score the trained network on the hold-out split, in batches of 100 rows.
    eval_result = classifier.evaluate(
        input_fn=lambda: eval_input_fn(
            features=X_test, labels=y_test, batch_size=100))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-29-19:49:27
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmppu6msjw1/model.ckpt-2800
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-29-19:49:28
INFO:tensorflow:Saving dict for global step 2800: accuracy = 0.7932961, accuracy_baseline = 0.5865922, auc = 0.7879022, auc_precision_recall = 0.7779492, average_loss = 1.0213366, global_step = 2800, label/mean = 0.41340783, loss = 91.40962, prediction/mean = 0.41016144


In [136]:
# Report the "accuracy" entry of the evaluation metrics, formatted to 3 decimals.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))


Test set accuracy: 0.793



In [137]:
    # Predict on the unlabelled test frame (labels=None -> features-only dataset).
    predictions = classifier.predict(
        input_fn=lambda: eval_input_fn(
            features=test_data, labels=None, batch_size=100))

In [138]:
    # Collect the predicted class id of every test passenger, in input order.
    out = [pred_dict['class_ids'][0] for pred_dict in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmppu6msjw1/model.ckpt-2800
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [139]:
# Submission file for the TensorFlow model: one row per test passenger
# (PassengerId, Survived); the frame's own row index is not written.
pd.DataFrame(
    {'PassengerId': test_data.index, 'Survived':out}
).to_csv(PATH+"data/submit_tf.csv", index=False)