
ColaboratoryでKaggleを始める


In [0]:
# Install the `kaggle` package with pip; later cells invoke the `kaggle` command.
!pip install kaggle

In [0]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Authenticate the Colab user so the Drive API client below can be built.
auth.authenticate_user()

# Search Drive for a file named exactly 'kaggle.json'; only file ids are requested.
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise FileNotFoundError("kaggle.json was not found in Google Drive")

# Destination of the downloaded credentials file.
filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Download the first match chunk by chunk; the `with` block guarantees the
# file handle is flushed and closed even if a chunk request fails.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
# Owner read/write only. The mode must be an octal literal: decimal 600 is
# 0o1130, which sets entirely different permission bits.
os.chmod(filename, 0o600)




In [0]:
# List competitions via the kaggle CLI.
# NOTE(review): presumably this also serves as a check that the downloaded kaggle.json is accepted - confirm.
!kaggle competitions list

In [0]:
# Download the files of the `titanic` competition.
# NOTE(review): later cells read train.csv and test.csv from the working directory; presumably they come from this download - confirm.
!kaggle competitions download -c titanic

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time

def parse(df):
    """Make the Titanic feature columns of ``df`` numeric, in place.

    * ``Age`` and ``Embarked``: missing values become the sentinel ``-1``.
    * ``Fare`` (only if the column exists): missing values become ``-1`` too,
      so a missing fare cannot reach the model as NaN.
    * ``Sex``: ``"male"`` -> 0, ``"female"`` -> 1.
    * ``Embarked``: ``"S"`` -> 0, ``"C"`` -> 1, ``"Q"`` -> 2.

    Values outside these mappings become NaN for ``Sex`` and ``-1`` for
    ``Embarked``.

    Args:
        df: pandas DataFrame with at least ``Age``, ``Sex`` and ``Embarked``
            columns.

    Returns:
        The same DataFrame object, with the columns above replaced.
    """
    # Replace NaN with -1.
    df["Age"] = df["Age"].fillna(-1)
    if "Fare" in df.columns:
        df["Fare"] = df["Fare"].fillna(-1)
    # Convert strings to numbers. Whole-column assignment replaces the previous
    # chained indexing (df[col][mask] = v), which writes to a temporary under
    # pandas copy-on-write and leaves the column as object dtype otherwise.
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2}).fillna(-1)
    return df

def split_val(x,y,rate,seed=None):
    """Randomly split ``x`` and ``y`` into training and validation parts.

    Args:
        x: array indexed along its first axis (one row per sample).
        y: array indexed with the same row indices as ``x``.
        rate: fraction (not percent) of the rows to hold out for validation;
            ``int(N * rate)`` rows are used, clamped to ``[0, N]``.
        seed: if not None, NumPy's global RNG is re-seeded with it before the
            permutation is drawn.

    Returns:
        Tuple ``(train_x, train_y, val_x, val_y)``.
    """
    N = x.shape[0]
    # Clamp so that an out-of-range rate cannot produce a wrapped-around slice.
    val_num = min(max(int(N*rate), 0), N)
    if seed is not None:
        np.random.seed(seed)
    perm = np.random.permutation(N)
    # Slice with an explicit boundary: perm[:-val_num] is empty and
    # perm[-val_num:] is everything when val_num == 0.
    split = N - val_num
    ti = perm[:split]
    vi = perm[split:]
    return x[ti],y[ti],x[vi],y[vi]

def load_data():
    """Read ``train.csv``, preprocess it with ``parse`` and split off validation data.

    Returns:
        The tuple produced by ``split_val`` for a hold-out rate of 0.1:
        ``(train_x, train_y, val_x, val_y)``.
    """
    feature_columns = ['Pclass', 'Sex', 'Fare', 'SibSp', 'Parch', 'Age', 'Embarked']
    frame = parse(pd.read_csv('train.csv'))
    # Features and labels are taken as plain arrays so they can be index-shuffled.
    features = frame[feature_columns].values
    labels = frame['Survived'].values
    return split_val(features, labels, 0.1)

In [0]:
def create_model(input_placeholder,u_dim,layer_num,y_dim,training):
    """Build a stack of dense + batch-normalization layers ending in a dense output.

    Args:
        input_placeholder: tensor fed into the first dense layer.
        u_dim: number of units in each hidden dense layer.
        layer_num: number of (dense with ReLU, batch normalization) pairs.
        y_dim: number of units in the final dense layer.
        training: forwarded to ``tf.layers.batch_normalization``.

    Returns:
        Output of the final dense layer (created without an activation argument).
    """
    hidden = input_placeholder
    for _ in range(layer_num):
        # Dense first, then batch norm: creation order is kept as-is.
        hidden = tf.layers.batch_normalization(
            tf.layers.dense(inputs=hidden, units=u_dim, activation=tf.nn.relu),
            training=training,
        )
    logits = tf.layers.dense(inputs=hidden, units=y_dim)
    return logits


In [0]:
# Training cell: builds the graph, trains with mini-batches and logs metrics.
# Seed NumPy's global RNG; the validation split inside load_data() and the
# per-epoch shuffling below both draw from it.
# NOTE(review): no TensorFlow seed is set in this cell - weight initialisation is presumably not reproducible; confirm.
np.random.seed(0)
tf.reset_default_graph()
# Number of output units of the model (labels are fed as integer class ids).
y_dim     = 2
# hyperparameters
u_dim     = 100    # units per hidden layer
layer_num = 2      # number of hidden (dense + batch norm) layers
epoch     = 1000   # number of passes over the training data
log_freq  = 100    # log metrics every `log_freq` epochs
batchsize = 200    # mini-batch size
lr        = 0.001  # learning rate passed to AdamOptimizer

# data load
train_x,train_y,val_x,val_y = load_data()
N  = train_y.shape[0]
Nv = val_y.shape[0]
print('train num: {}, val num: {}'.format(N,Nv))

# calculation graph
# x: float features with train_x.shape[1] columns; y: integer labels.
x = tf.placeholder(tf.float32, [None, train_x.shape[1]],"input")
y = tf.placeholder(tf.int32, [None])

# The model is built twice in the same variable scope: once with
# training=True and once, reusing the same variables, with training=False.
with tf.variable_scope("model"):
  train_z = create_model(x,u_dim,layer_num,y_dim,training=True)
with tf.variable_scope("model", reuse=True):
  z       = create_model(x,u_dim,layer_num,y_dim,training=False)

# The loss is defined on the training=True outputs.
cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=train_z)

# Run everything registered in UPDATE_OPS together with each optimizer step.
# NOTE(review): presumably these are the batch-normalization moving-statistics updates - confirm against the TF docs.
extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_ops):
  train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy)

# Predictions and accuracy are defined on the training=False outputs.
pred_y = tf.cast(tf.argmax(z, 1), tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred_y, y), tf.float32))

# main loop
# The session is left open on purpose: `sess` is used by the prediction cell.
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
T = time.time() 
for ep in range(1,epoch+1) :
  # Reshuffle the training rows every epoch and walk through them in batches.
  perm=np.random.permutation(N)
  for i in range(0,N,batchsize):
    batch_xs=train_x[perm[i:i+batchsize]]
    batch_ys=train_y[perm[i:i+batchsize]]
    sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})
  # monitor
  if ep%log_freq ==0 :      
    # NOTE(review): cross_entropy is built on train_z (training=True) while
    # accuracy is built on z (training=False), so the logged losses and
    # accuracies come from different model modes - confirm this is intended.
    train_loss, train_acc = sess.run([cross_entropy, accuracy], feed_dict={x: train_x, y: train_y})
    val_loss, val_acc     = sess.run([cross_entropy, accuracy], feed_dict={x: val_x, y: val_y})
    # T is only reset inside this branch, so epochT is the time elapsed since
    # the previous log line (log_freq epochs), not the time of a single epoch.
    epochT = time.time()-T
    print('Epoch: %d, Time :%.4f (s), train_loss: %f,  train_acc: %f, val_loss: %f,  val_acc: %f' % (ep, epochT, train_loss, train_acc, val_loss, val_acc))
    T = time.time()




In [0]:
# Read the test set and apply the same preprocessing and feature columns as for training.
test_csv = parse(pd.read_csv('test.csv'))
test_x = test_csv[['Pclass', 'Sex', 'Fare', 'SibSp', 'Parch', 'Age', 'Embarked']].values
# Evaluate the prediction op on the test features with the trained session.
prediction = sess.run(pred_y, feed_dict={x: test_x})
# Build a frame indexed by passenger id with a single "Survived" column.
PassengerId = test_csv["PassengerId"].values.astype(int)
my_solution = pd.DataFrame(data=prediction, index=PassengerId, columns=["Survived"])
# Write the submission file; the index column is labelled "PassengerId".
my_solution.to_csv("result.csv", index_label=["PassengerId"])



In [0]:
# Print the full contents of result.csv, the file written by the previous cell.
!cat result.csv

In [0]:
# Submit result.csv to the `titanic` competition with the message 'first submit'.
!kaggle competitions submit -c titanic -f result.csv -m 'first submit'