In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
X_test = pd.read_csv("/kaggle/input/titanic/test.csv")

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# remove rows with missing y values
train_data.dropna(axis=0, subset=["Survived"], inplace=True)

y = train_data.Survived
X = train_data.drop(columns=["Survived"])

In [5]:
from sklearn.model_selection import train_test_split

# split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [6]:
# view how many missing values each column has
missing_values = {}
for col in X.columns:
    missing_values[col] = X[col].isnull().sum()
sorted(missing_values.items(), key=lambda x:x[1], reverse=True)

[('Cabin', 687),
 ('Age', 177),
 ('Embarked', 2),
 ('PassengerId', 0),
 ('Pclass', 0),
 ('Name', 0),
 ('Sex', 0),
 ('SibSp', 0),
 ('Parch', 0),
 ('Ticket', 0),
 ('Fare', 0)]

In [7]:
# see which columns contain categorical data
categorical_cols = [col for col in X.columns if X[col].dtype == "object"]
for col in categorical_cols:
    print(f"{col} has {X[col].nunique()} unique values")

Name has 891 unique values
Sex has 2 unique values
Ticket has 681 unique values
Cabin has 147 unique values
Embarked has 3 unique values


In [8]:
# remove 'Name', 'Ticket' and 'Cabin' since these cannot be reasonably one-hot encoded
# 'Name' is unlikely to help predictions anyway
X_train = X_train.drop(columns=["Name", "Ticket", "Cabin"])
X_valid = X_valid.drop(columns=["Name", "Ticket", "Cabin"])
X_test = X_test.drop(columns=["Name", "Ticket", "Cabin"])
X = X.drop(columns=["Name", "Ticket", "Cabin"])

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# can reasonably impute 'Age' and 'Embarked' columns
# also need to impute 'Fare' column since X_test has missing values in that column
# create pipelines for categorical columns and numerical columns separately and then bundle these steps together
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy="median")), ('ss', StandardScaler())])

categorical_cols = [col for col in X.columns if X[col].dtype == "object"]
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy="most_frequent")), ('ohe', OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols)])

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# logistic regression
logistic_regression_model = LogisticRegression(random_state=0, max_iter=1000)
clf_logreg = Pipeline(steps=[('preprocessor', preprocessor), ('model', logistic_regression_model)])

logreg_params = {
    "model__C": [0.05, 0.1, 0.5, 1, 3, 10, 20, 30, 40],
    "model__solver": ["lbfgs", "liblinear"]
}

# optimise parameters
logreg_gs = GridSearchCV(clf_logreg, logreg_params, scoring="f1", cv=5)

logreg_gs.fit(X_train, y_train)
print(logreg_gs.best_params_)
print(logreg_gs.best_score_)

{'model__C': 0.1, 'model__solver': 'lbfgs'}
0.7237638001486429


In [11]:
from sklearn.ensemble import RandomForestClassifier

# random forest
random_forest_model = RandomForestClassifier(random_state=0)
clf_rf = Pipeline(steps=[('preprocessor', preprocessor), ('model', random_forest_model)])
    
rf_params = {
    'model__n_estimators': [10, 15, 20, 25, 30, 75, 100], 
    'model__max_depth': [1, 3, 5, 7, 10]
}    

# optimise paramaters
rf_gs = GridSearchCV(clf_rf, rf_params, scoring="f1", cv=8)

rf_gs.fit(X_train, y_train)
print(rf_gs.best_params_)
print(rf_gs.best_score_)

{'model__max_depth': 7, 'model__n_estimators': 25}
0.7489628249147875


In [12]:
from sklearn.svm import SVC

# support vector machine
svc_model = SVC(random_state=0)
clf_svc = Pipeline(steps=[('preprocessor', preprocessor), ('model', svc_model)])

svc_params = {
    "model__C": [1, 3, 10, 30, 100],
    "model__kernel": ["linear", "poly", "rbf" , "sigmoid"],
}

# optimise parameters
svc_gs = GridSearchCV(clf_svc, svc_params, scoring="f1", cv=5)

svc_gs.fit(X_train, y_train)
print(svc_gs.best_params_)
print(svc_gs.best_score_)

{'model__C': 1, 'model__kernel': 'rbf'}
0.7500642599142325


In [13]:
from xgboost import XGBClassifier

# extreme gradient boosting
xgboost_model = XGBClassifier(n_estimators=1000, learning_rate=0.01)
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgboost_model)])

xgb_params = {
    "model__n_estimators": [10, 50, 100, 250, 500, 750, 1000],
    "model__learning_rate": [0.001, 0.01, 0.05, 0.1]
}

# optimise parameters
xgb_gs = GridSearchCV(clf_xgb, xgb_params, scoring="f1", cv=5)

xgb_gs.fit(X_train, y_train)
print(xgb_gs.best_params_)
print(xgb_gs.best_score_)

{'model__learning_rate': 0.001, 'model__n_estimators': 750}
0.7431291885140643


In [14]:
from sklearn.ensemble import VotingClassifier

# create an ensemble to select the best estimator from each of the four methods
ensemble_model = VotingClassifier(estimators=[
    ("logreg", logreg_gs.best_estimator_),
    ("rf", rf_gs.best_estimator_),
    ("svc", svc_gs.best_estimator_),
    ("xgb", xgb_gs.best_estimator_)
], voting = "hard")

ensemble_model.fit(X, y)
ensemble_model.score(X, y)

0.8664421997755332

In [15]:
predictions = ensemble_model.predict(X_test)

In [16]:
output = pd.DataFrame({"PassengerId": X_test.PassengerId, "Survived": predictions})
output.to_csv("my_submission_3.csv", index=False)