In [1]:
import pandas as pd, numpy as np, sklearn, seaborn
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [64]:
# Read the training set from disk.
data = pd.read_csv('train.csv')

# Discard the columns that are not fed to the model. Cabin is not necessarily irrelevant:
# it may hint at where a passenger was when the accident occurred, so it may be worth
# revisiting later.
data_noc = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)

# Separate the features from the target column.
X = data_noc.drop(['Survived'], axis=1)
y = data_noc['Survived']
cols = X.columns

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.1,
)

In [8]:
# Numerical branch: fill missing values from the 5 nearest neighbours (uniform weights),
# then standardise each column.
imputer = KNNImputer(n_neighbors=5, weights="uniform")

pipeline_1 = Pipeline([('Impute', imputer), ('Scaler', StandardScaler())]) # Numerical

# Categorical branch: one-hot encode. handle_unknown="ignore" makes a category that was not
# present when the encoder was fitted (for example one that only occurs in the held-out split
# or in the submission file) encode as an all-zero row instead of raising at transform time.
# Categories seen during fit are encoded exactly as with the default setting.
pipeline_2 = Pipeline([('Encoder', OneHotEncoder(handle_unknown="ignore"))]) # Categorical

# Route each column to its branch by name; columns not listed here are not passed through
# (ColumnTransformer's default remainder is 'drop').
preprocess = ColumnTransformer(
    [('Numerical', pipeline_1, ['Age','SibSp','Parch','Fare']),
     ('Categorical', pipeline_2, ['Pclass','Sex','Embarked'])])

In [6]:
# Baseline model: shared preprocessing followed by a default logistic regression,
# fitted on the training split. Assigning the result of fit() keeps the cell output empty.
clf = Pipeline(
    [
        ("preprocessor", preprocess),
        ("classifier", LogisticRegression()),
    ]
).fit(X_train, y_train)


In [19]:
# Feature importance check: fit a seeded random forest behind the same preprocessing
# and display its importances.
forest = RandomForestClassifier(random_state=0)

# Pipeline fits the estimator object it was given, so `forest` itself is fitted here.
forest_pipe = Pipeline(
    [("preprocessor", preprocess), ("classifier", forest)]
).fit(X_train, y_train)

# One value per column emitted by `preprocess`, in that output order.
forest_pipe.named_steps["classifier"].feature_importances_

array([2.66015710e-01, 4.40905267e-02, 3.66737930e-02, 2.41482438e-01,
       2.44064527e-02, 1.47139637e-02, 4.99839774e-02, 1.13721257e-01,
       1.72688176e-01, 1.12694437e-02, 7.55994775e-03, 1.73415863e-02,
       5.27282328e-05])

In [65]:
# Print the score of the pipeline currently bound to `clf` on the held-out split,
# formatted to three decimals.
# NOTE(review): the execution counts in this notebook are out of order and `clf` is
# rebound by the XGBoost cell further down -- confirm which model produced the
# captured score by re-running the notebook top to bottom.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.833


In [80]:
# XGBoost test: a small boosted ensemble (4 trees, depth up to 10) behind the same
# preprocessing.
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=4,
    max_depth=10,
    learning_rate=0.4,
)

# Rebinds `clf`: from here on it refers to the XGBoost pipeline.
clf = Pipeline(
    [("preprocessor", preprocess), ("classifier", xgb)]
).fit(X_train, y_train)

# Score on the held-out split.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.856


In [81]:
# Load the submission inputs. The full frame is handed to the pipeline; `preprocess`
# picks the columns it needs by name.
test_dat = pd.read_csv('test.csv')

# Passenger ids as a one-column frame, then the predictions joined on the row index
# (both frames carry the index produced by read_csv, so rows line up one-to-one).
temp = test_dat[['PassengerId']]
pred_dat = temp.join(pd.DataFrame({'Survived': clf.predict(test_dat)}))

In [82]:
# Write the submission file without the row index (so no index label is needed either).
pred_dat.to_csv('pred.csv', index=False)

In [57]:
# Show the submission frame as plain text; pandas abbreviates the middle rows.
# NOTE(review): this cell's execution count is lower than that of the cell that builds
# `pred_dat`, so the captured output may predate the current predictions -- re-run to confirm.
print(pred_dat)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
