In [1]:
import numpy as np
import pandas as pd

# Load the Titanic training data and preview its first five rows.
train = pd.read_csv('./input/titanic/train.csv')
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Show each column's dtype: object columns hold strings, Age and Fare are float64.
print(train.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [3]:
def transform1(df, reference=None):
    """Integer-encode the categorical columns Sex, Ticket, Cabin and Embarked.

    ``factorize`` numbers categories by order of first appearance in the frame
    it is called on, so encoding two frames independently can give the same
    category different integers. Pass ``reference`` to reuse the numbering
    learned from another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four categorical columns.
    reference : pandas.DataFrame, optional
        Frame whose categories define the integer codes. When omitted, codes
        are derived from ``df`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One integer column per categorical column, indexed like ``df``.
        Missing values are encoded as -1; with ``reference`` given, values
        that do not occur in ``reference`` are also encoded as -1.
    """
    # Index like df so pd.concat(axis=1) with the other transforms aligns even
    # when df does not carry a default 0..n-1 index.
    X_1 = pd.DataFrame(index=df.index)
    for colname in ['Sex', 'Ticket', 'Cabin', 'Embarked']:
        if reference is None:
            X_1[colname], _ = df[colname].factorize()
        else:
            # Position of each value in the reference vocabulary; -1 if absent.
            _, known = reference[colname].factorize()
            X_1[colname] = pd.Index(known).get_indexer(df[colname])
    return X_1

# Integer-encode the categorical columns of the training frame and display the result.
X_1 = transform1(train)
X_1

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,0,1
2,1,2,-1,0
3,1,3,1,0
4,0,4,-1,0
...,...,...,...,...
886,0,677,-1,0
887,1,678,145,0
888,1,614,-1,0
889,0,679,146,1


In [4]:
def transform2(df):
    """Collect the integer passenger columns plus their derived sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``SibSp``, ``Parch`` and ``Pclass`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``SibSp``, ``Parch``, ``CompanyCount`` (``SibSp + Parch``)
        and ``Pclass``, in that order, indexed like ``df``.
    """
    sibsp = df["SibSp"]
    parch = df["Parch"]
    return pd.DataFrame(
        {
            "SibSp": sibsp,
            "Parch": parch,
            "CompanyCount": sibsp + parch,
            "Pclass": df["Pclass"],
        }
    )

# Build the integer features (including the SibSp + Parch sum) for the training frame and display them.
X_2 = transform2(train)
X_2

Unnamed: 0,SibSp,Parch,CompanyCount,Pclass
0,1,0,1,3
1,1,0,1,1
2,0,0,0,3
3,1,0,1,1
4,0,0,0,3
...,...,...,...,...
886,0,0,0,2
887,0,0,0,1
888,1,2,3,3
889,0,0,0,1


In [5]:
def transform3(df):
    """Return only the float64 columns of ``df``.

    No imputation is done here: missing values (NaN) are passed through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose columns have dtype float64.
    """
    return df.select_dtypes(include=['float64'])

# Select the float64 columns of the training frame; NaNs are left in place.
X_3 = transform3(train)
X_3

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
886,27.0,13.0000
887,19.0,30.0000
888,,23.4500
889,26.0,30.0000


In [6]:
def transform4(df, reference=None):
    """Integer-encode the part of ``Name`` that precedes the first comma.

    The output column is called ``FirstName``, but the value encoded is the
    text before the comma (e.g. ``"Braund"`` in ``"Braund, Mr. Owen Harris"``).

    ``factorize`` numbers values by order of first appearance in the frame it
    is called on, so encoding two frames independently can give the same name
    different integers. Pass ``reference`` to reuse the numbering learned from
    another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column.
    reference : pandas.DataFrame, optional
        Frame whose ``Name`` column defines the integer codes. When omitted,
        codes are derived from ``df`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Single integer column ``FirstName``, indexed like ``df``. Missing
        names are encoded as -1; with ``reference`` given, names that do not
        occur in ``reference`` are also encoded as -1.
    """
    # .str[0] takes the first split piece without relying on expand=True
    # producing a column labelled 0.
    leading = df.Name.str.split(",", n = 1).str[0]
    X_4 = pd.DataFrame(index=df.index)
    if reference is None:
        X_4["FirstName"], _ = leading.factorize()
    else:
        _, known = reference.Name.str.split(",", n = 1).str[0].factorize()
        # Position of each name in the reference vocabulary; -1 if absent.
        X_4["FirstName"] = pd.Index(known).get_indexer(leading)
    return X_4

# Encode the leading (pre-comma) part of each training Name and display the codes.
X_4 = transform4(train)
X_4

Unnamed: 0,FirstName
0,0
1,1
2,2
3,3
4,4
...,...
886,664
887,233
888,604
889,665


In [7]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def score_dataset(X, y, model = None, cv = 5, scoring = "accuracy"):
    """Return the mean cross-validated score of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target labels.
    model : estimator, optional
        Estimator passed to ``cross_val_score``. When omitted, a fresh
        ``XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')``
        is created for this call.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``.
    scoring : str, default "accuracy"
        Forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    if model is None:
        # Build the default estimator per call; a default written in the
        # signature is created once at definition time and shared by all calls.
        # NOTE(review): 'mlogloss' is the multiclass log-loss metric, while the
        # Survived target appears to be binary (0/1) -- confirm whether
        # 'logloss' was intended.
        model = XGBClassifier(use_label_encoder = False,
                              eval_metric='mlogloss')
    # https://scikit-learn.org/stable/modules/model_evaluation.html
    fold_scores = cross_val_score(
        model, X, y, cv = cv, scoring = scoring
    )
    return fold_scores.mean()

# Stack the four feature groups side by side and cross-validate on the training labels.
newX = pd.concat(
    [build(train) for build in (transform1, transform2, transform3, transform4)],
    axis = 1,
)
score_dataset(newX, train.Survived)

0.814889209716904

In [8]:
# Fit the final classifier on all training rows, using the same four feature groups as above.
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder = False)
model.fit(
    pd.concat(
        [build(train) for build in (transform1, transform2, transform3, transform4)],
        axis = 1,
    ),
    train.Survived,
)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [9]:
test = pd.read_csv('./input/titanic/test.csv')
# transform1 and transform4 number categories by order of first appearance in
# the frame they receive, so encoding `test` on its own can assign a category a
# different integer than the one the model saw when it was fitted on `train`.
# Encode train and test together instead: the training rows come first, so they
# keep exactly the codes used for fitting, and test rows reuse those codes
# (values that only occur in test get new, previously unused codes).
# Survived is dropped so the stacked frame has the same columns as `test`.
combined = pd.concat([train.drop(columns='Survived'), test], ignore_index = True)
combinedX = pd.concat([transform1(combined), transform2(combined), transform3(combined), transform4(combined)], axis = 1)
# Keep only the rows that came from `test`, renumbered from 0.
testX = combinedX.iloc[len(train):].reset_index(drop = True)
predicted = model.predict(testX)

In [10]:
# Pair each test PassengerId with its predicted label, preview it, and write the CSV.
output = test[['PassengerId']].assign(Survived=predicted)
print(output.head())
output.to_csv('feature_engineered.csv', index=False)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
