# Titanic - Machine Learning from Disaster

## Kaggle API setting
始めに（事前準備）
- kaggle.json (API Token) を Google Drive の `MyDrive/Colab Notebooks` フォルダに配置する
- Google Drive を `/content/drive` にマウントする（以下のセルはこのパスから kaggle.json を読み込む）

In [1]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import json

# Read the Kaggle API token (kaggle.json) from the mounted Google Drive and
# export its fields as the KAGGLE_USERNAME / KAGGLE_KEY environment variables.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("/content/drive/MyDrive/Colab Notebooks/kaggle.json", 'r') as f:
  json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']

In [3]:
# Download the "titanic" competition archive with the Kaggle CLI.
# NOTE(review): the CLI is expected to authenticate with the KAGGLE_USERNAME /
# KAGGLE_KEY environment variables exported earlier in this notebook — confirm.
!kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
!unzip -o "/content/titanic.zip"

Archive:  /content/titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## preprocess

In [5]:
%%bash
pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import polars as pl
import pandas as pd

# Load both Kaggle splits from the working directory.
df_train, df_test = (pl.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
print(f"train shape: {df_train.shape}, test shape: {df_test.shape}")

# Preview the first training rows as the cell output.
df_train.head()

train shape: (891, 12), test shape: (418, 11)


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil...","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [7]:
# Per-column summary statistics of the training frame.
# In the recorded output the null_count row flags Age (177), Cabin (687) and
# Embarked (2) as the columns with missing values.
df_train.describe()

describe,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",891.0,891.0,891.0,"""891""",891.0,"""891""","""891"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. An...","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""max""",891.0,1.0,3.0,"""van Melkebeke,...","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""
"""median""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,


In [8]:
# Distribution of the embarkation port in each split (nulls are counted too).
print(df_train["Embarked"].value_counts(), "\n")
print(df_test["Embarked"].value_counts(), "\n")

# Fill the missing training values with "S", the most frequent port.
# `with_columns` is used because `with_column` is deprecated and no longer
# available in newer polars releases; pl.lit makes "S" an explicit literal.
df_train = df_train.with_columns(pl.col("Embarked").fill_null(pl.lit("S")))
print(df_train["Embarked"].value_counts(), "\n")

df_train.head()

shape: (4, 2)
┌──────────┬────────┐
│ Embarked ┆ counts │
│ ---      ┆ ---    │
│ str      ┆ u32    │
╞══════════╪════════╡
│ null     ┆ 2      │
│ S        ┆ 644    │
│ Q        ┆ 77     │
│ C        ┆ 168    │
└──────────┴────────┘ 

shape: (3, 2)
┌──────────┬────────┐
│ Embarked ┆ counts │
│ ---      ┆ ---    │
│ str      ┆ u32    │
╞══════════╪════════╡
│ C        ┆ 102    │
│ S        ┆ 270    │
│ Q        ┆ 46     │
└──────────┴────────┘ 

shape: (3, 2)
┌──────────┬────────┐
│ Embarked ┆ counts │
│ ---      ┆ ---    │
│ str      ┆ u32    │
╞══════════╪════════╡
│ C        ┆ 168    │
│ S        ┆ 646    │
│ Q        ┆ 77     │
└──────────┴────────┘ 



PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil...","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [9]:
# Family-size feature: SibSp + Parch.
# One shared expression is applied to both splits so they cannot drift apart.
# The column keeps its original (misspelled) name so any existing reference to
# it stays valid.
n_of_family = (pl.col("SibSp") + pl.col("Parch")).alias("N_of_Falmily")
df_train = df_train.with_columns(n_of_family)
df_test = df_test.with_columns(n_of_family)

In [10]:
# convert to dummy variable
def df_to_dummy(df: "pl.DataFrame") -> "pl.DataFrame":
  """Append one-hot (dummy) columns for Sex, Embarked and Pclass to ``df``.

  Args:
    df: polars frame containing the ``Sex``, ``Embarked`` and ``Pclass`` columns.

  Returns:
    A new frame with ``Pclass`` cast to string and the dummy columns produced
    by ``pandas.get_dummies`` (named ``<column>_<value>``) appended on the
    right. The source columns themselves are kept.

  Calling the function again on its own output is safe: dummy columns from the
  previous call are replaced rather than duplicated.
  """
  categorical_cols = ['Sex', 'Embarked', 'Pclass']

  # Cast Pclass to string so that get_dummies encodes it as a category instead
  # of passing it through as a number.
  df = df.with_columns(pl.col("Pclass").cast(pl.Utf8))

  # dtype is pinned to uint8 (the pandas < 2.0 default) so the encoded columns
  # have the same type regardless of the installed pandas version.
  dummies = pd.get_dummies(
    df.select(categorical_cols).to_pandas(), drop_first=False, dtype="uint8"
  )

  # Drop dummy columns left over from an earlier call; otherwise the horizontal
  # concat fails on duplicate column names when the cell is re-run.
  dummy_names = set(dummies.columns)
  kept_cols = [name for name in df.columns if name not in dummy_names]

  return pl.concat([df.select(kept_cols), pl.from_pandas(dummies)], how="horizontal")
  
# Encode the train and test frames with the same helper.
# NOTE: each name is rebound to the encoded frame, so re-running this cell
# passes already-encoded frames back into df_to_dummy.
df_train = df_to_dummy(df_train)
df_test = df_to_dummy(df_test)

In [11]:
# select feature
# Columns excluded from the model input. A single shared list keeps the train
# and test selections in sync (the original repeated the list for each split).
drop_cols = ['PassengerId', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Age', 'Cabin', 'Pclass']

y_train = df_train["Survived"].to_numpy()

# The column list is passed positionally: the `columns=` keyword is not
# accepted by newer polars releases, while the positional form works in both.
features_train = df_train.drop(drop_cols + ['Survived'])
features_test = df_test.drop(drop_cols)

# The model receives positional numpy arrays, so the feature columns of both
# splits must agree in name and order.
assert features_train.columns == features_test.columns, "train/test feature columns differ"

X_train = features_train.to_numpy()
X_test = features_test.to_numpy()

## train

In [12]:
from xgboost import XGBClassifier

print(X_train.shape, X_test.shape)

# Gradient-boosted tree classifier: 100 estimators, max depth 3, learning rate 0.1.
xgb_params = {"max_depth": 3, "learning_rate": 0.1, "n_estimators": 100}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the test rows; despite the name these are model outputs,
# not ground-truth labels.
y_test = xgb.predict(X_test)

(891, 10) (418, 10)


In [13]:
from datetime import datetime, timedelta, timezone

# Timestamp in a fixed UTC+9 zone labelled "JST", used to give every
# submission file a distinct name (minute resolution).
JST = timezone(timedelta(hours=+9), 'JST')
t = datetime.now(JST).strftime('%Y%m%d_%H%M')

# Submission frame: PassengerId plus the predicted Survived label.
# `with_columns` is used because `with_column` is deprecated and no longer
# available in newer polars releases.
df_submit = df_test.select("PassengerId").with_columns(pl.Series("Survived", y_test))

# polars writes the CSV directly (header row, no index column), so the
# round trip through pandas is not needed.
df_submit.write_csv(f'submit_{t}.csv')