[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wZHS9UX-xxeVFK-SbdtkiGH8pjNOHYcf?usp=sharing)

The goal is to predict whether or not a passenger survived the sinking of the Titanic, based on attributes such as their age, sex, passenger class, port of embarkation, and so on.

In [None]:
import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    """Download the Titanic ``train.csv`` and ``test.csv`` files if missing.

    Args:
        url: Base URL; each file name is appended to it directly, so it
            must end with a ``/``.
        path: Local directory that receives the files. It is created
            (including parent directories) when it does not exist.

    Files already present in *path* are left untouched. Each download is
    written to a temporary ``.part`` file and only renamed to its final name
    once complete, so an interrupted transfer never leaves a truncated CSV
    that a later call would mistake for a finished download.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path, exist_ok=True)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if os.path.isfile(filepath):
            continue
        print("Downloading", filename)
        partial_path = filepath + ".part"
        try:
            urllib.request.urlretrieve(url + filename, partial_path)
            # Atomic rename: the final name only ever refers to a full file.
            os.replace(partial_path, filepath)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.isfile(partial_path):
                os.remove(partial_path)

# Fetch train.csv and test.csv into TITANIC_PATH unless they are already on disk.
fetch_titanic_data()    

In [None]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one Titanic CSV file into a pandas DataFrame.

    Args:
        filename: Name of the CSV file inside *titanic_path*.
        titanic_path: Directory that holds the CSV files.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

In [None]:
# Load both CSV files from TITANIC_PATH, training set first.
train_data, test_data = map(load_titanic_data, ("train.csv", "test.csv"))

In [None]:
# Show the first rows of the training set to get a feel for the columns.
train_data.head()

In [None]:
# Show the first rows of the test set.
test_data.head()

Let's explicitly set the PassengerId column as the index column:



In [None]:
# Re-index both frames by passenger id; set_index returns new DataFrames.
train_data, test_data = (
    frame.set_index("PassengerId") for frame in (train_data, test_data)
)

In [None]:
# Display the training DataFrame, now indexed by PassengerId.
train_data

In [None]:
# Print each column's dtype and non-null count, which shows where values are missing.
train_data.info()


The *Age*, *Cabin* and *Embarked* attributes are sometimes null (fewer than 891 non-null values), especially *Cabin*, which is null for about 77% of passengers. We will therefore ignore *Cabin* for now and focus on the rest. The *Age* attribute has about 20% null values, so we will need to decide what to do with them. Replacing null values with the median age seems reasonable.

The *Name* and *Ticket* attributes may have some value, but they will be a bit tricky to convert into useful numbers that a model can consume. So for now, we will ignore them.

In [None]:
# Count the passengers for each value of the Survived target column.
train_data["Survived"].value_counts()

In [None]:
# Share of passengers per Survived value (normalize=True returns fractions instead of counts).
train_data["Survived"].value_counts(normalize=True)

In [None]:
# Count the passengers in each passenger class.
train_data["Pclass"].value_counts()


In [None]:
# Count the passengers for each value of the Sex column.
train_data["Sex"].value_counts()


In [None]:
# Count the passengers per Embarked value; value_counts() leaves out nulls by default.
train_data["Embarked"].value_counts()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical features: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ],
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back
# to the old one on older installations.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", cat_encoder),
    ])

In [None]:
# This estimator allows different columns or column subsets of the input to be transformed separately.
from sklearn.compose import ColumnTransformer 

# Columns handled by the numerical and the categorical sub-pipeline respectively.
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

# Route each column group through its own pipeline; the outputs are
# concatenated in the order listed here.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ],
)

In [None]:
# Fit the preprocessing on the selected training columns and transform them in one step.
X_train = preprocess_pipeline.fit_transform(
    X=train_data[num_attribs + cat_attribs],
)
X_train

In [None]:
# Target labels: the Survived column of the training set.
y_train = train_data["Survived"]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest; the fixed seed keeps the run reproducible.
forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
forest_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test set, reusing the preprocessing already fitted on the
# training data (transform only, no refit).
test_features = test_data[num_attribs + cat_attribs]
X_test = preprocess_pipeline.transform(test_features)
y_pred = forest_clf.predict(X_test)
print(y_pred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest; report mean and spread of the fold scores.
forest_scores = cross_val_score(
    estimator=forest_clf, X=X_train, y=y_train, cv=10
)
(forest_scores.mean(), forest_scores.std())

In [None]:
from sklearn.svm import SVC

# 10-fold cross-validation of an SVC with gamma=2; report mean and spread of the fold scores.
svm_clf = SVC(gamma=2)
svm_scores = cross_val_score(
    estimator=svm_clf, X=X_train, y=y_train, cv=10
)
(svm_scores.mean(), svm_scores.std())

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search: fit an SVC for every (kernel, C) combination below and
# score each one with 10-fold cross-validation.
parameters = dict(
    kernel=("linear", "rbf", "poly"),
    C=[0.1, 1, 10, 100],
)
clf = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=10)
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
clf.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
clf.best_score_

In [None]:
# Mean test-fold score for every parameter combination that was tried.
clf.cv_results_['mean_test_score']

In [None]:
# List, alphabetically, the names of all entries recorded in cv_results_.
sorted(clf.cv_results_.keys())