In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

train_data = pd.read_csv("../data/titanic/train.csv")
test_data = pd.read_csv("../data/titanic/test.csv")

features = train_data.columns.values.tolist()
features.remove("Survived")
train_x = train_data[features]
train_y = train_data["Survived"]

full_x = [train_x, test_data]
from sklearn import preprocessing
def transform_feature(x,feature):
    le = preprocessing.LabelEncoder()
    le = le.fit(x[feature])
    x[feature] = le.fit_transform(x[feature])

  from ._conv import register_converters as _register_converters


In [2]:
np.set_printoptions(threshold=np.nan)

In [3]:
from sklearn import preprocessing
for data in full_x:
    age_filled = data.Age.dropna().median()
    data.Age = data.Age.fillna(age_filled)
    cabin_filled = data.Cabin.value_counts().index[0]
    data.Cabin = data.Cabin.fillna(cabin_filled)
    fare_filled = data.Fare.dropna().median()
    data.Fare = data.Fare.fillna(fare_filled)
    embarked_filled = data.Embarked.value_counts().index[0]
    data.Embarked = data.Embarked.fillna(embarked_filled)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = 0
    data['IsAlone'][data['FamilySize'] != 0] = 1
    transform_feature(data,'Sex')
    transform_feature(data,'Cabin')
    transform_feature(data,'Embarked')
    transform_feature(data,'Ticket')
    data.drop(['SibSp','Parch','Ticket'],axis = 1,inplace=True)
train_x.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,3,"Braund, Mr. Owen Harris",1,22.0,7.25,47,2,2,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,81,0,2,1
2,3,3,"Heikkinen, Miss. Laina",0,26.0,7.925,47,2,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,55,2,2,1
4,5,3,"Allen, Mr. William Henry",1,35.0,8.05,47,2,1,1


In [4]:
train_x.Age[train_x.Age <= 10] = 0
train_x.Age[(train_x.Age > 10) & (train_x.Age <= 20)] = 1
train_x.Age[(train_x.Age <= 30) & (train_x.Age > 20)] = 2
train_x.Age[(train_x.Age <= 40) & (train_x.Age > 30)] = 3
train_x.Age[(train_x.Age <= 50) & (train_x.Age > 40)] = 4
train_x.Age[(train_x.Age <= 60) & (train_x.Age > 50)] = 5
train_x.Age[(train_x.Age <= 70) & (train_x.Age > 60)] = 6
train_x.Age[train_x.Age > 70] = 7
test_data.Age[test_data.Age <= 10] = 0
test_data.Age[(test_data.Age > 10) & (test_data.Age <= 20)] = 1
test_data.Age[(test_data.Age <= 30) & (test_data.Age > 20)] = 2
test_data.Age[(test_data.Age <= 40) & (test_data.Age > 30)] = 3
test_data.Age[(test_data.Age <= 50) & (test_data.Age > 40)] = 4
test_data.Age[(test_data.Age <= 60) & (test_data.Age > 50)] = 5
test_data.Age[(test_data.Age <= 70) & (test_data.Age > 60)] = 6
test_data.Age[test_data.Age > 70] = 7

bins = (-1, 0, 8, 15, 31, 1000)
group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
categories = pd.cut(train_x.Fare, bins, labels=group_names)

train_x.Fare = categories
transform_feature(train_x,'Fare')

test_data.Fare = categories
test_data.Fare.value_counts()
transform_feature(test_data,'Fare')

In [5]:
combine = [train_x, test_data]
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [6]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)

In [7]:
train_x = pd.concat([train_x,pd.get_dummies(train_x.Pclass,prefix="Pclass")],axis=1)
test_data = pd.concat([test_data,pd.get_dummies(test_data.Pclass,prefix="Pclass")],axis=1)
train_x = pd.concat([train_x,pd.get_dummies(train_x.Embarked,prefix="Embarked")],axis=1)
test_data = pd.concat([test_data,pd.get_dummies(test_data.Embarked,prefix="Embarked")],axis=1)
train_x.drop(['Pclass','Embarked','Name','Title'],axis=1,inplace=True)
test_data.drop(['Pclass','Embarked','Name','Title'],axis=1,inplace=True)

train_x,val_x,train_y,val_y = train_test_split(train_x,train_y)

In [8]:
class mynn(object):
    def creat_model(self):
        for i in range(self.n_layers_):
            self.weights.append(tf.Variable(tf.truncated_normal([self.n_weights_[i],self.n_weights_[i+1]],mean=0,stddev=5)))
            self.bias.append(tf.Variable(tf.constant(0.1,shape=[self.n_weights_[i+1]])))
            if i == 0 :
                self.xs.append(tf.nn.relu(tf.matmul(self.X,self.weights[i]) + self.bias[i]))
            else:
                self.xs.append(tf.nn.relu(tf.matmul(self.xs[i-1],self.weights[i]) + self.bias[i]))
        self.weights.append(tf.Variable(tf.truncated_normal([self.n_weights_[self.n_layers_],2],mean=0,stddev=0.1)))
        self.bias.append(tf.Variable(tf.constant(0.1,shape=[2,])))
        if self.n_layers_ == 0:
            self.pre_y = tf.matmul(self.X,self.weights[self.n_layers_]) + self.bias[self.n_layers_]
        else:
            self.pre_y = tf.matmul(self.xs[self.n_layers_-1],self.weights[self.n_layers_]) + self.bias[self.n_layers_]
        self.pre_y_o = tf.nn.softmax(self.pre_y)
    def __init__(self,n_layers,n_weights):
        self.n_layers_ = n_layers
        self.n_weights_ = [13]
        self.weights = list()
        self.bias = list()
        self.xs = list()
        self.n_weights_ += n_weights    
        self.X = tf.placeholder(tf.float32, shape=(None, 13))
        self.Y = tf.placeholder(tf.float32, shape=(None,2))
        self.creat_model()
        self.loss()
        self.global_iter = tf.Variable(0, name='global_iter', trainable=False)
        self.lr = tf.train.polynomial_decay(0.0001, self.global_iter, 2000, end_learning_rate=0.0, power=0.9)
        self.optimizer = tf.train.MomentumOptimizer(self.lr, 0.9)
        self.train_op = tf.train.AdamOptimizer(0.001).minimize(self.total_loss)
        #self.train_op = self.optimizer.minimize(self.total_loss, global_step=self.global_iter)
    def loss(self):
        self.total_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.Y,logits=self.pre_y))

In [9]:
train_t = train_y.map(lambda x: 0 if x == 1 else 1)
train_y = pd.DataFrame({'a':train_t, 'b':train_y})
#train_y = train_y.reshape([891,-1])

In [10]:
train_m = val_y.map(lambda x: 0 if x == 1 else 1)
val_y = pd.DataFrame({'a':train_m, 'b':val_y})

In [47]:
model = mynn(1,[6])
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(50000):
    #for j in range(len(train_x)):
    #    a,b = sess.run([model.train_op,model.total_loss],feed_dict={model.X: train_x.values[j].reshape([-1,13]),model.Y:train_y.values[j].reshape([-1,2])})
    a,b=sess.run([model.train_op,model.total_loss],feed_dict={model.X: train_x,model.Y:train_y.values.reshape([-1,2])})
    if i%1000==0:
        print(b)

79.21516
0.7613882
0.6748856
0.61970806
0.58511794
0.5631131
0.5468146
0.53368294
0.5229117
0.5141243
0.50706196
0.50144845
0.49698165
0.49363485
0.49093956
0.48895246
0.48737478
0.4861859
0.4852468
0.48449585
0.4839121
0.48347446
0.4831488
0.4828233
0.48260868
0.4823298
0.48224083
0.48204514
0.48195767
0.48184487
0.48176864
0.48162198
0.48143604
0.48137796
0.48126754
0.48114324
0.48110887
0.48099884
0.48091257
0.4808579
0.48081544
0.48076606
0.48077017
0.48058903
0.4806311
0.48051727
0.48036742
0.4802542
0.4802273
0.48026714


In [48]:
pre_y1 = sess.run(model.pre_y_o,feed_dict={model.X: train_x})
pre_1 = pre_y1.argmax(axis=1)
pre_y2 = sess.run(model.pre_y_o,feed_dict={model.X: val_x})
pre_2 = pre_y2.argmax(axis=1)

In [50]:
a=(pre_1==train_y.values.transpose([1,0])[1])
b=(pre_2==val_y.values.transpose([1,0])[1])
np.count_nonzero(a)/668,np.count_nonzero(b)/223

(0.8143712574850299, 0.7847533632286996)

In [52]:
pre_y = sess.run(model.pre_y_o,feed_dict={model.X: test_data})
pre_0 = pre_y.argmax(axis=1)
print(pre_0.shape)
result = pd.DataFrame({'PassengerId':test_data.PassengerId,'Survived':pre_0})
result.to_csv("submission.csv",index=False)

(418,)
