In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic passenger data from a CSV file in the working directory.
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first five rows; the cell's last expression is rendered as output.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Split into train, validation, and test sets (60% / 20% / 20%).
# The target column is 'Survived'; every other column is kept as a feature.
labels = titanic['Survived']
features = titanic.drop(columns='Survived')

# First pass: keep 60% of the rows for training and set the remaining 40% aside.
# A fixed random_state keeps the split reproducible across re-runs.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=0.4, random_state=42
)
# Second pass: halve the 40% hold-out into the test set and the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [12]:
# Report the row counts: full label set, then train, validation, and test labels.
print(*map(len, (labels, y_train, y_val, y_test)))

891 534 179 178


In [13]:
# Holdout Test Set: sample of data not used in fitting a model;
# used to evaluate the model's ability to generalize to unseen data

# K-Fold Cross-Validation: data is divided into k subsets and the holdout method
# is repeated k times. Each time, one of the k subsets is used as the test set and 
# the other k-1 subsets are combined to be used to train the model.

# Evaluation metrics
# accuracy = number of examples predicted correctly / total number of examples
# precision = number predicted as surviving that actually survived / total number predicted to survive
# recall = number predicted as surviving that actually survived / total number that actually survived

In [14]:
# Bias and variance
# variance refers to an algorithm's sensitivity to small fluctuations in the training set
# bias is the algorithm's tendency to consistently learn the wrong thing by not taking into 
# account all the information in the data
# Total error = Bias^2 + Variance + Irreducible Error (decomposition of the expected squared error)

In [15]:
# Underfitting
# occurs when an algorithm cannot capture the underlying trend of the data
# it happens when the model is too simple with high bias and low variance = high total error
# Overfitting
# occurs when the model has fit very closely to the training data and it is very likely that 
# this isn't the real pattern, the model just memorized these examples
# it happens when the model is too complex with low bias and high variance = high total error

In [16]:
# Tune Model for Optimal Complexity
# 1.hyperparameter tuning
# A hyperparameter is a configuration that is external to the model, whose value cannot be estimated
# from data, and whose value guides how the algorithm learns parameter values from the data.
# e.g. max depth of a tree, number of features to consider, etc.

# 2.regularization
# reduce overfitting by discouraging overly complex models in some way
# e.g. 1. ridge regression and lasso regression -- adding a penalty to the loss function to constrain coefficients
#      2. dropout -- some nodes are randomly ignored during training, which forces the other nodes to take on
#         more or less responsibility for the input/output

# Occam's razor -- whenever possible, choose the simplest answer to a problem