# ベース

In [4]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Parameters
learning_rate = 0.001 # learning rate handed to the optimizer; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the average cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # report the cost every `display_step` epochs (1 = every epoch)
train_size = 800     # leading rows of train.csv used for training; the remainder is held out
step_size = 2000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 8          # number of input features
n_classes = 2        # number of classes: survived or not



data_dir = "/Users/chiehayashida/work/kaggle/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load the data
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# Fill missing values.
# The medians are taken from the training data only and reused for the test data.
# Fare is imputed with the Fare median (it used to be filled with the Age median).
age_median = train["Age"].median()
fare_median = train["Fare"].median()

# Results are assigned back instead of using inplace=True on a selected column:
# chained in-place fills are silently ignored when pandas copy-on-write is active.
train["Age"] = train["Age"].fillna(age_median)
train["Fare"] = train["Fare"].fillna(fare_median)
train["Cabin"] = train["Cabin"].fillna("NaN")
train["Embarked"] = train["Embarked"].fillna("NaN")

test["Age"] = test["Age"].fillna(age_median)
test["Fare"] = test["Fare"].fillna(fare_median)
test["Cabin"] = test["Cabin"].fillna("NaN")
test["Embarked"] = test["Embarked"].fillna("NaN")

# Train and test stacked together so the label encoders see every category.
# Deliberately not named `all`, which would shadow the builtin for the whole kernel.
combined = pd.concat([train, test])

# Label encoders
le_sex_tr = preprocessing.LabelEncoder()
le_cabin_tr = preprocessing.LabelEncoder()
le_embarked_tr = preprocessing.LabelEncoder()

# fit() returns the encoder itself, so each *_model name is an alias of the matching *_tr
le_sex_model = le_sex_tr.fit(combined['Sex'])
le_cabin_model = le_cabin_tr.fit(combined['Cabin'])
le_embarked_model = le_embarked_tr.fit(combined['Embarked'])

# Label encoding
train['Sex'] = le_sex_model.transform(train['Sex'])
train['Cabin'] = le_cabin_model.transform(train['Cabin'])
train['Embarked'] = le_embarked_model.transform(train['Embarked'])

test['Sex'] = le_sex_model.transform(test['Sex'])
test['Cabin'] = le_cabin_model.transform(test['Cabin'])
test['Embarked'] = le_embarked_model.transform(test['Embarked'])

# Feature matrices as numpy arrays (the same column order for train and test)
feature_cols = ['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
x_np = train[feature_cols].values
x_test_np = test[feature_cols].values.astype(np.float32)

# One-hot encode the class column into a numpy array
oe = preprocessing.OneHotEncoder(sparse=False)
y_np = oe.fit_transform(train[['Survived']])

# Split into training rows (the first `train_size`) and held-out evaluation rows
[x_train, x_test] = np.vsplit(x_np, [train_size])
[y_train, y_test] = np.vsplit(y_np, [train_size])

# tf Graph input
x = tf.placeholder("float", [None, n_input])      # feature batch
x_ = tf.constant(x_test_np)                       # features of test.csv baked into the graph
y = tf.placeholder("float", [None, n_classes])    # one-hot labels
y_ = tf.placeholder("float", [None, n_classes])
keep_prob = tf.placeholder(tf.float32)            # dropout keep probability, fed per run

# ニューラルネットワークによるモデルの定義
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer ReLU network and return its output layer.

    Args:
        x: input tensor; it is matrix-multiplied with ``weights['h1']``.
        weights: dict holding the matrices under ``'h1'``, ``'h2'`` and ``'out'``.
        biases: dict holding the bias vectors under ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output layer tensor; no activation is applied to it.

    Dropout is applied after the first hidden layer only. Its keep probability
    is not a parameter: ``keep_prob`` is looked up in the enclosing scope.
    """
    # Hidden layer 1: affine transform, then ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))

    # Dropout between the two hidden layers
    hidden_1_kept = tf.nn.dropout(hidden_1, keep_prob)

    # Hidden layer 2: affine transform, then ReLU
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1_kept, weights['h2']), biases['b2']))

    # Output layer, linear
    return tf.matmul(hidden_2, weights['out']) + biases['out']

# Weights and biases, each initialised with tf.random_normal
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Network output (logits) for whatever is fed through `x`
pred = multilayer_perceptron(x, weights, biases)

# Loss (mean softmax cross-entropy against the labels in `y`) and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Variable initializer. tf.initialize_all_variables() is deprecated; its own
# deprecation notice says to use tf.global_variables_initializer() instead.
init = tf.global_variables_initializer()

# Class probabilities for the test.csv features held in the constant `x_`
test_prediction = tf.nn.softmax(multilayer_perceptron(x_, weights, biases))

# Run training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.

        # One epoch is `step_size` optimizer steps
        for i in range(step_size):
            # Draw `batch_size` random row indices (np.random.choice samples with
            # replacement by default) from the training split
            ind = np.random.choice(train_size, batch_size)
            x_train_batch = x_train[ind]
            y_train_batch = y_train[ind]
            # Run optimization op (backprop) and cost op (to get loss value);
            # dropout keeps half of the first hidden layer while training
            _, c = sess.run([optimizer, cost], feed_dict={x: x_train_batch, y: y_train_batch, keep_prob: 0.5})
            # Accumulate the mean loss over the epoch
            avg_cost += c / step_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    saver = tf.train.Saver()
    saver.save(sess, "model.ckpt")
    print("Optimization Finished!")

    # Model evaluation: a row counts as correct when the arg-max of the network
    # output equals the arg-max of its one-hot label
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Accuracy on the held-out rows, with dropout switched off (keep_prob = 1.0)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))


Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch: 0001 cost= 533.373213249
Epoch: 0002 cost= 84.698960343
Epoch: 0003 cost= 22.134174088
Epoch: 0004 cost= 5.108635333
Epoch: 0005 cost= 2.263418975
Epoch: 0006 cost= 1.333921925
Epoch: 0007 cost= 0.892475556
Epoch: 0008 cost= 0.681980373
Epoch: 0009 cost= 0.596151590
Epoch: 0010 cost= 0.555017511
Optimization Finished!
Accuracy: 0.835165


# test_prediction.run()

# Cabin抜いて学習

In [7]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Parameters
learning_rate = 0.001 # learning rate handed to the optimizer; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the average cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # report the cost every `display_step` epochs (1 = every epoch)
train_size = 800     # leading rows of train.csv used for training; the remainder is held out
step_size = 2000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 7          # number of input features (Cabin is left out in this experiment)
n_classes = 2        # number of classes: survived or not



data_dir = "/home/chie8842/share/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load the data
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)


# Fill missing values.
# Fare is imputed with the Fare median (it used to be filled with the Age median).
# Results are assigned back instead of using inplace=True on a selected column:
# chained in-place fills are silently ignored when pandas copy-on-write is active.
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
train["Cabin"] = train["Cabin"].fillna("NaN")
train["Embarked"] = train["Embarked"].fillna("NaN")

# Label encoders
le_sex_tr = preprocessing.LabelEncoder()
le_cabin_tr = preprocessing.LabelEncoder()
le_embarked_tr = preprocessing.LabelEncoder()

# Label encoding (Cabin is still encoded, but it is not selected as a feature below)
train['Sex'] = le_sex_tr.fit_transform(train['Sex'])
train['Cabin'] = le_cabin_tr.fit_transform(train['Cabin'])
train['Embarked'] = le_embarked_tr.fit_transform(train['Embarked'])

# Feature matrix as a numpy array
x_np = train[['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']].values

# One-hot encode the class column into a numpy array
oe = preprocessing.OneHotEncoder(sparse=False)
y_np = oe.fit_transform(train[['Survived']])

# Split into training rows (the first `train_size`) and held-out evaluation rows
[x_train, x_test] = np.vsplit(x_np, [train_size])
[y_train, y_test] = np.vsplit(y_np, [train_size])

# tf Graph input
x = tf.placeholder("float", [None, n_input])      # feature batch
y = tf.placeholder("float", [None, n_classes])    # one-hot labels
keep_prob = tf.placeholder(tf.float32)            # dropout keep probability, fed per run

# ニューラルネットワークによるモデルの定義
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer ReLU network and return its output layer.

    Args:
        x: input tensor; it is matrix-multiplied with ``weights['h1']``.
        weights: dict holding the matrices under ``'h1'``, ``'h2'`` and ``'out'``.
        biases: dict holding the bias vectors under ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output layer tensor; no activation is applied to it.

    Dropout is applied after the first hidden layer only. Its keep probability
    is not a parameter: ``keep_prob`` is looked up in the enclosing scope.
    """
    # Hidden layer 1: affine transform, then ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))

    # Dropout between the two hidden layers
    hidden_1_kept = tf.nn.dropout(hidden_1, keep_prob)

    # Hidden layer 2: affine transform, then ReLU
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1_kept, weights['h2']), biases['b2']))

    # Output layer, linear
    return tf.matmul(hidden_2, weights['out']) + biases['out']

# Weights and biases, each initialised with tf.random_normal
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Network output (logits) for whatever is fed through `x`
pred = multilayer_perceptron(x, weights, biases)

# Loss (mean softmax cross-entropy against the labels in `y`) and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Variable initializer. tf.initialize_all_variables() is deprecated; its own
# deprecation notice says to use tf.global_variables_initializer() instead.
init = tf.global_variables_initializer()

# Run training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.

        # One epoch is `step_size` optimizer steps
        for i in range(step_size):
            # Draw `batch_size` random row indices (np.random.choice samples with
            # replacement by default) from the training split
            ind = np.random.choice(train_size, batch_size)
            x_train_batch = x_train[ind]
            y_train_batch = y_train[ind]
            # Run optimization op (backprop) and cost op (to get loss value).
            # NOTE(review): keep_prob is 1.0 here, so dropout is inactive during
            # training in this cell - confirm that this is intended.
            _, c = sess.run([optimizer, cost], feed_dict={x: x_train_batch, y: y_train_batch, keep_prob: 1.0})
            # Accumulate the mean loss over the epoch
            avg_cost += c / step_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Model evaluation: a row counts as correct when the arg-max of the network
    # output equals the arg-max of its one-hot label
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Accuracy on the held-out rows
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))

Epoch: 0001 cost= 8.717519956
Epoch: 0002 cost= 1.593550263
Epoch: 0003 cost= 1.302829687
Epoch: 0004 cost= 1.150671896
Epoch: 0005 cost= 1.058887725
Epoch: 0006 cost= 1.022771144
Epoch: 0007 cost= 1.038750036
Epoch: 0008 cost= 0.939835104
Epoch: 0009 cost= 0.956257885
Epoch: 0010 cost= 0.959241067
Optimization Finished!
Accuracy: 0.78022


# Ageが入ってないの抜いて学習

In [1]:
import random
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Parameters
learning_rate = 0.001 # learning rate handed to the optimizer; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the average cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # report the cost every `display_step` epochs (1 = every epoch)
train_size = 650     # leading rows (among those with a known Age) used for training
step_size = 2000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 8          # number of input features
n_classes = 2        # number of classes: survived or not

data_dir = "/home/chie8842/share/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load the data
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# Fill missing values (Age is handled separately further down).
# Fare is imputed with the Fare median (it used to be filled with the Age median).
# Results are assigned back instead of using inplace=True on a selected column:
# chained in-place fills are silently ignored when pandas copy-on-write is active.
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
train["Cabin"] = train["Cabin"].fillna("NaN")
train["Embarked"] = train["Embarked"].fillna("NaN")

# Label encoders
le_sex_tr = preprocessing.LabelEncoder()
le_cabin_tr = preprocessing.LabelEncoder()
le_embarked_tr = preprocessing.LabelEncoder()

# Label encoding
train['Sex'] = le_sex_tr.fit_transform(train['Sex'])
train['Cabin'] = le_cabin_tr.fit_transform(train['Cabin'])
train['Embarked'] = le_embarked_tr.fit_transform(train['Embarked'])

# Rows without an Age are set aside and get the floored mean age.
# An explicit copy is filled, and only its Age column, so no other column can be
# overwritten and pandas does not warn about writing to a derived frame.
age_mean = train['Age'].mean()
train_AgeNa = train[train['Age'].isnull()].copy()
train_AgeNa['Age'] = train_AgeNa['Age'].fillna(math.floor(age_mean))

# Only rows with a known Age remain candidates for training
train = train.dropna(subset=['Age'])

# Feature matrices as numpy arrays
feature_cols = ['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
x_np = train[feature_cols].values
x_np_agena = train_AgeNa[feature_cols].values

# One-hot encode the class column into numpy arrays
oe = preprocessing.OneHotEncoder(sparse=False)
y_np = oe.fit_transform(train[['Survived']])
y_np_agena = oe.transform(train_AgeNa[['Survived']])

# Split into training rows (the first `train_size`) and held-out evaluation rows
[x_train, x_test] = np.vsplit(x_np, [train_size])
[y_train, y_test] = np.vsplit(y_np, [train_size])

# The evaluation set is the held-out rows plus every row whose Age was imputed
x_test = np.concatenate((x_test, x_np_agena), axis=0)
y_test = np.concatenate((y_test, y_np_agena), axis=0)



ImportError: No module named 'tensorflow'

In [17]:
# Graph inputs, fed at run time.
# `x` takes a batch of feature rows with n_input columns, `y` the matching
# label rows with n_classes columns; the batch dimension is left open.
x = tf.placeholder(dtype="float", shape=[None, n_input])
y = tf.placeholder(dtype="float", shape=[None, n_classes])
# Keep probability handed to tf.nn.dropout inside the network
keep_prob = tf.placeholder(dtype=tf.float32)

# ニューラルネットワークによるモデルの定義
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer ReLU network and return its output layer.

    Args:
        x: input tensor; it is matrix-multiplied with ``weights['h1']``.
        weights: dict holding the matrices under ``'h1'``, ``'h2'`` and ``'out'``.
        biases: dict holding the bias vectors under ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output layer tensor; no activation is applied to it.

    Dropout is applied after the first hidden layer only. Its keep probability
    is not a parameter: ``keep_prob`` is looked up in the enclosing scope.
    """
    # Hidden layer 1: affine transform, then ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))

    # Dropout between the two hidden layers
    hidden_1_kept = tf.nn.dropout(hidden_1, keep_prob)

    # Hidden layer 2: affine transform, then ReLU
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1_kept, weights['h2']), biases['b2']))

    # Output layer, linear
    return tf.matmul(hidden_2, weights['out']) + biases['out']

# Weights and biases, each initialised with tf.random_normal
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Network output (logits) for whatever is fed through `x`
pred = multilayer_perceptron(x, weights, biases)

# Loss (mean softmax cross-entropy against the labels in `y`) and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Variable initializer. tf.initialize_all_variables() is deprecated; its own
# deprecation notice says to use tf.global_variables_initializer() instead.
init = tf.global_variables_initializer()

# Run training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.

        # One epoch is `step_size` optimizer steps
        for i in range(step_size):
            # Draw `batch_size` random row indices (np.random.choice samples with
            # replacement by default) from the training split
            ind = np.random.choice(train_size, batch_size)
            x_train_batch = x_train[ind]
            y_train_batch = y_train[ind]
            # Run optimization op (backprop) and cost op (to get loss value).
            # NOTE(review): keep_prob is 1.0 here, so dropout is inactive during
            # training in this cell - confirm that this is intended.
            _, c = sess.run([optimizer, cost], feed_dict={x: x_train_batch, y: y_train_batch, keep_prob: 1.0})
            # Accumulate the mean loss over the epoch
            avg_cost += c / step_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Model evaluation: a row counts as correct when the arg-max of the network
    # output equals the arg-max of its one-hot label
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Accuracy on the evaluation rows
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))

Epoch: 0001 cost= 22.638369894
Epoch: 0002 cost= 2.460456284
Epoch: 0003 cost= 2.132185943
Epoch: 0004 cost= 1.968187981
Epoch: 0005 cost= 1.804385882
Epoch: 0006 cost= 1.824740359
Epoch: 0007 cost= 1.923468604
Epoch: 0008 cost= 1.802301973
Epoch: 0009 cost= 1.833400029
Epoch: 0010 cost= 1.731921375
Optimization Finished!
Accuracy: 0.804979


# 年齢がわからない人は99才にする

In [3]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Parameters
learning_rate = 0.001 # learning rate handed to the optimizer; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the average cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # report the cost every `display_step` epochs (1 = every epoch)
train_size = 650     # leading rows of train.csv used for training; the remainder is held out
step_size = 2000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 8          # number of input features
n_classes = 2        # number of classes: survived or not
data_dir = "/home/chie8842/share/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load the data
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)


# Fill missing values.
# Unknown ages are set to the sentinel value 99 in this experiment.
# Fare is imputed with the Fare median; it used to be filled with the Age median,
# which in this cell was additionally skewed by the 99 placeholders.
# Results are assigned back instead of using inplace=True on a selected column:
# chained in-place fills are silently ignored when pandas copy-on-write is active.
train["Age"] = train["Age"].fillna(99)
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
train["Cabin"] = train["Cabin"].fillna("NaN")
train["Embarked"] = train["Embarked"].fillna("NaN")


# Label encoders
le_sex_tr = preprocessing.LabelEncoder()
le_cabin_tr = preprocessing.LabelEncoder()
le_embarked_tr = preprocessing.LabelEncoder()

# Label encoding
train['Sex'] = le_sex_tr.fit_transform(train['Sex'])
train['Cabin'] = le_cabin_tr.fit_transform(train['Cabin'])
train['Embarked'] = le_embarked_tr.fit_transform(train['Embarked'])

# Feature matrix as a numpy array
x_np = train[['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']].values

# One-hot encode the class column into a numpy array
oe = preprocessing.OneHotEncoder(sparse=False)
y_np = oe.fit_transform(train[['Survived']])

# Split into training rows (the first `train_size`) and held-out evaluation rows
[x_train, x_test] = np.vsplit(x_np, [train_size])
[y_train, y_test] = np.vsplit(y_np, [train_size])

# tf Graph input
x = tf.placeholder("float", [None, n_input])      # feature batch
y = tf.placeholder("float", [None, n_classes])    # one-hot labels
keep_prob = tf.placeholder(tf.float32)            # dropout keep probability, fed per run

# ニューラルネットワークによるモデルの定義
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer ReLU network and return its output layer.

    Args:
        x: input tensor; it is matrix-multiplied with ``weights['h1']``.
        weights: dict holding the matrices under ``'h1'``, ``'h2'`` and ``'out'``.
        biases: dict holding the bias vectors under ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output layer tensor; no activation is applied to it.

    Dropout is applied after the first hidden layer only. Its keep probability
    is not a parameter: ``keep_prob`` is looked up in the enclosing scope.
    """
    # Hidden layer 1: affine transform, then ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))

    # Dropout between the two hidden layers
    hidden_1_kept = tf.nn.dropout(hidden_1, keep_prob)

    # Hidden layer 2: affine transform, then ReLU
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1_kept, weights['h2']), biases['b2']))

    # Output layer, linear
    return tf.matmul(hidden_2, weights['out']) + biases['out']

# Weights and biases, each initialised with tf.random_normal
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Network output (logits) for whatever is fed through `x`
pred = multilayer_perceptron(x, weights, biases)

# Loss (mean softmax cross-entropy against the labels in `y`) and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Variable initializer. tf.initialize_all_variables() is deprecated; its own
# deprecation notice says to use tf.global_variables_initializer() instead.
init = tf.global_variables_initializer()

# Run training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.

        # One epoch is `step_size` optimizer steps
        for i in range(step_size):
            # Draw `batch_size` random row indices (np.random.choice samples with
            # replacement by default) from the training split
            ind = np.random.choice(train_size, batch_size)
            x_train_batch = x_train[ind]
            y_train_batch = y_train[ind]
            # Run optimization op (backprop) and cost op (to get loss value).
            # NOTE(review): keep_prob is 1.0 here, so dropout is inactive during
            # training in this cell - confirm that this is intended.
            _, c = sess.run([optimizer, cost], feed_dict={x: x_train_batch, y: y_train_batch, keep_prob: 1.0})
            # Accumulate the mean loss over the epoch
            avg_cost += c / step_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Model evaluation: a row counts as correct when the arg-max of the network
    # output equals the arg-max of its one-hot label
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Accuracy on the held-out rows
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))

Epoch: 0001 cost= 50.103320393
Epoch: 0002 cost= 4.729443583
Epoch: 0003 cost= 4.216191870
Epoch: 0004 cost= 3.938698293
Epoch: 0005 cost= 4.128632157
Epoch: 0006 cost= 4.122880921
Epoch: 0007 cost= 3.837393814
Epoch: 0008 cost= 3.330383232
Epoch: 0009 cost= 3.865558344
Epoch: 0010 cost= 3.392739815
Optimization Finished!
Accuracy: 0.759336


# 年齢も予測する

In [16]:
import random
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Parameters
learning_rate = 0.001 # learning rate handed to the optimizer; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the average cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # report the cost every `display_step` epochs (1 = every epoch)
train_size = 650     # leading rows used for training
step_size = 2000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 8          # number of input features
n_classes = 2        # number of classes: survived or not

data_dir = "/home/chie8842/share/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load the data
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# Fill missing values (Age is left alone: it is what the regression below predicts).
# Fare is imputed with the Fare median (it used to be filled with the Age median).
# Results are assigned back instead of using inplace=True on a selected column:
# chained in-place fills are silently ignored when pandas copy-on-write is active.
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
train["Cabin"] = train["Cabin"].fillna("NaN")
train["Embarked"] = train["Embarked"].fillna("NaN")

# Label encoders
le_sex_tr = preprocessing.LabelEncoder()
le_cabin_tr = preprocessing.LabelEncoder()
le_embarked_tr = preprocessing.LabelEncoder()

# Label encoding
train['Sex'] = le_sex_tr.fit_transform(train['Sex'])
train['Cabin'] = le_cabin_tr.fit_transform(train['Cabin'])
train['Embarked'] = le_embarked_tr.fit_transform(train['Embarked'])

# Rows with a known Age fit the regression; rows without one receive predictions
train_dropnaage = train.dropna(subset=['Age'])
train_nanage = train[train['Age'].isnull()]

# `normalize=False` is no longer passed: it was the default, and newer
# scikit-learn releases reject the argument altogether.
model = linear_model.LinearRegression(fit_intercept=True, copy_X=True, n_jobs=1)

# NOTE(review): the columns are picked by position. 6 and 5 are presumably SibSp
# and Age in the Kaggle train.csv layout - confirm; selecting by name would be safer.
# Both arrays are reshaped to a single column so their sample counts match.
lr_x = train_dropnaage.values[:,6].reshape(-1,1)
lr_y = train_dropnaage.values[:,5].astype(int).reshape(-1,1)
output = model.fit(lr_x,lr_y)

# Predict a value for every row whose Age is missing
lr_t_x = train_nanage.values[:,6].reshape(-1,1)
lr_t_y = model.predict(lr_t_x)

#output





ValueError: Found input variables with inconsistent numbers of samples: [1, 714]

In [None]:
# This cell relies on `train`, `train_size`, `math`, `np` and `preprocessing`
# already being defined by an earlier cell.

# Rows without an Age are set aside and get the floored mean age.
# An explicit copy is filled, and only its Age column, so no other column can be
# overwritten and pandas does not warn about writing to a derived frame.
age_mean = train['Age'].mean()
train_AgeNa = train[train['Age'].isnull()].copy()
train_AgeNa['Age'] = train_AgeNa['Age'].fillna(math.floor(age_mean))

# Only rows with a known Age remain candidates for training
train = train.dropna(subset=['Age'])

# Feature matrices as numpy arrays
feature_cols = ['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
x_np = train[feature_cols].values
x_np_agena = train_AgeNa[feature_cols].values

# One-hot encode the class column into numpy arrays
oe = preprocessing.OneHotEncoder(sparse=False)
y_np = oe.fit_transform(train[['Survived']])
y_np_agena = oe.transform(train_AgeNa[['Survived']])

# Split into training rows (the first `train_size`) and held-out evaluation rows
[x_train, x_test] = np.vsplit(x_np, [train_size])
[y_train, y_test] = np.vsplit(y_np, [train_size])

# The evaluation set is the held-out rows plus every row whose Age was imputed
x_test = np.concatenate((x_test, x_np_agena), axis=0)
y_test = np.concatenate((y_test, y_np_agena), axis=0)


In [35]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Training configuration
learning_rate = 1e-3      # learning rate; larger values make the cost converge faster
training_epochs = 10      # number of epochs; the cost is reported per epoch
batch_size = 100          # rows used by each training step (each sess.run() call)
display_step = 1          # report the cost every `display_step` epochs
train_size = 800          # rows of the data set that go to training
step_size = 2000          # training steps per epoch

# Network shape
n_hidden_1 = n_hidden_2 = 64   # units in hidden layers 1 and 2
n_input = 8                    # number of input features
n_classes = 2                  # number of classes: survived or not

# Input files
data_dir = "/home/chie8842/share/titanic"
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

# Load both CSV files
train, test = pd.read_csv(train_file), pd.read_csv(test_file)

# Smallest recorded Age among male passengers, shown as the cell result.
# Figures noted by the author next to this query: 0.41999999999999998 29 80 for
# "male" and 0.75 27 63 for "female" - presumably min / median / max; verify.
train.loc[train.Sex == "male", "Age"].min()

0.41999999999999998

In [32]:
# Training configuration
learning_rate = 1e-3      # learning rate; larger values make the cost converge faster
training_epochs = 10      # number of epochs; the cost is reported per epoch
batch_size = 100          # rows used by each training step (each sess.run() call)
display_step = 1          # report the cost every `display_step` epochs
train_size = 800          # rows of the data set that go to training
step_size = 2000          # training steps per epoch

# Network shape
n_hidden_1 = n_hidden_2 = 64   # units in hidden layers 1 and 2
n_input = 8                    # number of input features
n_classes = 2                  # number of classes: survived or not

Epoch: 0001 cost= 30.966967532
Epoch: 0002 cost= 2.927594726
Epoch: 0003 cost= 2.366886817
Epoch: 0004 cost= 2.446229219
Epoch: 0005 cost= 2.376396438
Epoch: 0006 cost= 2.267190577
Epoch: 0007 cost= 1.888190902
Epoch: 0008 cost= 2.132742506
Epoch: 0009 cost= 1.851374626
Epoch: 0010 cost= 1.856667134
Optimization Finished!
Accuracy: 0.835165


# Parameters
learning_rate = 0.01 # learning rate; larger values make the cost converge faster
training_epochs = 10 # number of epochs; the cost is reported per epoch
batch_size = 100     # rows used by each training step (each sess.run() call)
display_step = 1     # 1 = report the cost every epoch
train_size = 800     # rows of the data set that go to training
step_size = 1000     # training steps per epoch

# Network Parameters
n_hidden_1 = 64      # units in hidden layer 1
n_hidden_2 = 64      # units in hidden layer 2
n_input = 8          # number of input features
n_classes = 2        # number of classes: survived or not
Epoch: 0001 cost= 57.438435503
Epoch: 0002 cost= 38.556251650
Epoch: 0003 cost= 27.079147286
Epoch: 0004 cost= 22.003307903
Epoch: 0005 cost= 17.725240074
Epoch: 0006 cost= 18.246781544
Epoch: 0007 cost= 10.827384042
Epoch: 0008 cost= 10.692609782
Epoch: 0009 cost= 8.878264447
Epoch: 0010 cost= 8.329427984
Optimization Finished!
Accuracy: 0.824176

Epoch: 0001 cost= 57.438435503
Epoch: 0002 cost= 38.556251650
Epoch: 0003 cost= 27.079147286
Epoch: 0004 cost= 22.003307903
Epoch: 0005 cost= 17.725240074
Epoch: 0006 cost= 18.246781544
Epoch: 0007 cost= 10.827384042
Epoch: 0008 cost= 10.692609782
Epoch: 0009 cost= 8.878264447
Epoch: 0010 cost= 8.329427984
Optimization Finished!
Accuracy: 0.824176


In [None]:
# Training configuration
learning_rate = 1e-3      # learning rate; larger values make the cost converge faster
training_epochs = 10      # number of epochs; the cost is reported per epoch
batch_size = 100          # rows used by each training step (each sess.run() call)
display_step = 1          # report the cost every `display_step` epochs
train_size = 800          # rows of the data set that go to training
step_size = 2000          # training steps per epoch

# Network shape
n_hidden_1 = n_hidden_2 = 64   # units in hidden layers 1 and 2
n_input = 8                    # number of input features
n_classes = 2                  # number of classes: survived or not

Adagrad

Epoch: 0001 cost= 464.975965088
Epoch: 0002 cost= 86.857236901
Epoch: 0003 cost= 44.344178129
Epoch: 0004 cost= 36.259928568
Epoch: 0005 cost= 31.819526917
Epoch: 0006 cost= 28.236630183
Epoch: 0007 cost= 24.917934471
Epoch: 0008 cost= 22.810194013
Epoch: 0009 cost= 21.187019714
Epoch: 0010 cost= 19.626279968
Optimization Finished!
Accuracy: 0.714286