In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn import svm
from sklearn.impute import KNNImputer

In [6]:
# Read the Titanic train and test CSVs (paths are relative to the notebook's
# working directory) into two separate DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/titanic/train.csv", "../data/titanic/test.csv")
)

In [7]:
# Preprocess train_df / test_df in place and build the model inputs X, X_test, y.
#
# Steps: encode Sex and Embarked as integers, bin Fare into four ordinal
# buckets, impute missing Age, then select the feature columns as floats.
# All fill values and the imputer are learned from the training frame only and
# then applied to the test frame, so both frames are transformed consistently.

SEX_CODES = {"female": 1, "male": 0}
EMBARKED_CODES = {'S': 0, 'Q': 1, 'C': 2}
FARE_BINS = [-0.001, 7.91, 14.454, 31.0, 513]
FARE_LABELS = [0, 1, 2, 3]
feature_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Series.mode() returns a Series; passing it straight to fillna aligns on the
# index and would only fill rows labelled 0..k-1. Take the scalar instead.
fare_mode = train_df["Fare"].mode().iloc[0]

for frame in (train_df, test_df):
    # Make int rather than string
    frame["Sex"] = frame["Sex"].map(SEX_CODES).astype(int)

    # Fill with most common port ('S'), then make int rather than string.
    # Assign the result back instead of a chained inplace fillna, which does
    # not update the frame under pandas copy-on-write.
    frame["Embarked"] = frame["Embarked"].fillna('S').map(EMBARKED_CODES).astype(int)

    # Fill missing fares with the training mode, then bin into codes 0..3.
    # Fares outside FARE_BINS become NaN in pd.cut and therefore code -1.
    frame["Fare"] = pd.cut(
        frame["Fare"].fillna(fare_mode),
        bins=FARE_BINS,
        right=True,
        labels=FARE_LABELS,
    ).cat.codes

# Use KNN imputer to get missing age values: fit on train, reuse on test.
# NOTE(review): Age is the only column given to the imputer, so there is no
# other feature to measure neighbour distance on; this presumably falls back
# to mean imputation. Consider fitting on several features - confirm.
imputer = KNNImputer(n_neighbors=5)
train_df["Age"] = imputer.fit_transform(train_df[["Age"]].to_numpy()).ravel()
test_df["Age"] = imputer.transform(test_df[["Age"]].to_numpy()).ravel()

X = train_df[feature_cols].astype(float)
X_test = test_df[feature_cols].astype(float)
y = train_df["Survived"].values
X.dtypes

Pclass      float64
Sex         float64
Age         float64
SibSp       float64
Parch       float64
Fare        float64
Embarked    float64
dtype: object

# Linear

In [19]:
# Linear-kernel SVC, scored by accuracy with 5-fold cross-validation on (X, y).
clf = svm.SVC(kernel="linear")
results = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    scoring=["accuracy"],
    cv=5,
)
# Mean accuracy across the five folds (displayed as the cell output).
np.mean(results["test_accuracy"])

0.7878601468834348

# RBF

In [18]:
# RBF-kernel SVC, scored by accuracy with 5-fold cross-validation on (X, y).
# NOTE(review): X is passed as-is with no scaling step in this cell; RBF
# kernels are generally sensitive to feature scale - consider standardizing.
clf = svm.SVC(kernel="rbf")
results = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    scoring=["accuracy"],
    cv=5,
)
# Mean accuracy across the five folds (displayed as the cell output).
np.mean(results["test_accuracy"])

0.6419747661791476

# Polynomial Degree 2

In [17]:
# Degree-2 polynomial-kernel SVC, scored by accuracy with 5-fold
# cross-validation on (X, y).
# NOTE(review): X is passed as-is with no scaling step in this cell;
# polynomial kernels are generally sensitive to feature scale - consider
# standardizing.
clf = svm.SVC(degree=2, kernel="poly")
results = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    scoring=["accuracy"],
    cv=5,
)
# Mean accuracy across the five folds (displayed as the cell output).
np.mean(results["test_accuracy"])

0.6386039796622937