In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
train_data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [8]:
train_data['Ticket'].value_counts().head(10)

347082          7
1601            7
CA. 2343        7
CA 2144         6
3101295         6
347088          6
382652          5
S.O.C. 14879    5
113781          4
PC 17757        4
Name: Ticket, dtype: int64

In [9]:
train_data['Cabin'].value_counts().head(5)

C23 C25 C27    4
G6             4
B96 B98        4
F2             3
D              3
Name: Cabin, dtype: int64

## Feature Engineering

In [10]:
from sklearn.preprocessing import Imputer


def nan_padding(data, columns):
    for column in columns:
        imputer = Imputer()
        data[column] = imputer.fit_transform(data[column].values.reshape(-1,1))
    return data


nan_columns = ["Age", "SibSp", "Parch", "Fare"]

train_data = nan_padding(train_data, nan_columns)
test_data = nan_padding(test_data, nan_columns)

In [11]:
# 사용하지 않을 컬럼을 제거해 준다.
def drop_not_concerned(data, columns):
    return data.drop(columns, axis=1)

not_concerned_columns = ["Name", "Ticket", "Cabin"]
train_data = drop_not_concerned(train_data, not_concerned_columns)
test_data = drop_not_concerned(test_data, not_concerned_columns)

In [12]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1.0,0.0,7.25,S
1,2,1,1,female,38.0,1.0,0.0,71.2833,C
2,3,1,3,female,26.0,0.0,0.0,7.925,S
3,4,1,1,female,35.0,1.0,0.0,53.1,S
4,5,0,3,male,35.0,0.0,0.0,8.05,S


In [13]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0.0,0.0,7.8292,Q
1,893,3,female,47.0,1.0,0.0,7.0,S
2,894,2,male,62.0,0.0,0.0,9.6875,Q
3,895,3,male,27.0,0.0,0.0,8.6625,S
4,896,3,female,22.0,1.0,1.0,12.2875,S


In [14]:
#  one-hot-encoding
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data


dummy_columns = ["Pclass", "Embarked"]
train_data = dummy_data(train_data, dummy_columns)
test_data = dummy_data(test_data, dummy_columns)

In [15]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,male,22.0,1.0,0.0,7.25,0,0,1,0,0,1
1,2,1,female,38.0,1.0,0.0,71.2833,1,0,0,1,0,0
2,3,1,female,26.0,0.0,0.0,7.925,0,0,1,0,0,1
3,4,1,female,35.0,1.0,0.0,53.1,1,0,0,0,0,1
4,5,0,male,35.0,0.0,0.0,8.05,0,0,1,0,0,1


In [16]:
from sklearn.preprocessing import LabelEncoder

# 성별을 0과 1로 인코딩
def sex_to_int(data):
    le = LabelEncoder()
    le.fit(["male","female"])
    data["Sex"] = le.transform(data["Sex"]) 
    return data

train_data = sex_to_int(train_data)
test_data = sex_to_int(test_data)
train_data.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1,22.0,1.0,0.0,7.25,0,0,1,0,0,1
1,2,1,0,38.0,1.0,0.0,71.2833,1,0,0,1,0,0
2,3,1,0,26.0,0.0,0.0,7.925,0,0,1,0,0,1
3,4,1,0,35.0,1.0,0.0,53.1,1,0,0,0,0,1
4,5,0,1,35.0,0.0,0.0,8.05,0,0,1,0,0,1


In [17]:
from sklearn.preprocessing import MinMaxScaler

def normalize_value(data):
    scaler = MinMaxScaler()
    data["Age"] = scaler.fit_transform(data["Age"].values.reshape(-1,1))
#     data["Fare"] = scaler.fit_transform(data["Fare"].values.reshape(-1,1))
    return data

train_data = normalize_value(train_data)
test_data = normalize_value(test_data)

train_data.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1,0.271174,1.0,0.0,0.014151,0,0,1,0,0,1
1,2,1,0,0.472229,1.0,0.0,0.139136,1,0,0,1,0,0
2,3,1,0,0.321438,0.0,0.0,0.015469,0,0,1,0,0,1
3,4,1,0,0.434531,1.0,0.0,0.103644,1,0,0,0,0,1
4,5,0,1,0.434531,0.0,0.0,0.015713,0,0,1,0,0,1


In [29]:
feature_columns = ['Sex', 'Age', 'SibSp', 'Parch', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [30]:
X_train = train_data[feature_columns]
X_test = test_data[feature_columns]
y_label = train_data['Survived']

In [31]:
X_train.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [32]:
print(X_train.shape)
print(X_test.shape)

(891, 10)
(418, 10)


# XGB

In [33]:
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV


gbm_param_grid = {
    'n_estimators': range(7, 30),
    'max_depth': range(5, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}

gbm = XGBClassifier()

xgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid, 
                                estimator = gbm, scoring = "accuracy", 
                                verbose = 1, n_iter = 50, cv = 4)


xgb_random.fit(X_train, y_label)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.7s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'n_estimators': range(7, 30), 'max_depth': range(5, 10), 'learning_rate': [0.4, 0.45, 0.5, 0.55, 0.6], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

In [34]:
xgb_random.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.45,
 'max_depth': 6,
 'n_estimators': 10}

In [35]:
xgb_best_score = round(xgb_random.best_score_, 2)
xgb_best_score

0.83

In [36]:
xgb_predictions = xgb_random.predict(X_test)

In [37]:
test_data['Survived'] = xgb_predictions
xgb_submissions = test_data[['PassengerId', 'Survived']]
xgb_submissions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [38]:
xgb_submissions.to_csv('submissions/submit_xgb_{}.csv'.format(xgb_best_score), index=False)