In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# Read data

In [2]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
all_df = train_df.append(test_df)

all_df['is_test'] = all_df.Survived.isnull()
all_df.index = all_df.Survived
del all_df['Survived']

In [3]:
all_df.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket,is_test
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,22,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,A/5 21171,False
1,38,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,PC 17599,False
1,26,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,STON/O2. 3101282,False
1,35,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,113803,False
0,35,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,373450,False


# Target variable

In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Target variable is **Survived**.

# Quality metric

Your score is the percentage of passengers you correctly predict. That means - **accuracy**.

# Model

## One variable model
Let's build a very simple model, based on one variable.
That nobody will survived.

In [5]:
def select_features(df):
    non_obj_feats = df.columns[ df.dtypes != 'object' ]
    black_list = ['is_test']
    
    return [feat for feat in non_obj_feats if feat not in black_list ]

def get_X_y(df):
    feats = select_features(df)
    
    X = df[feats].values
    y = df.index.values.astype(int)
    
    return X, y

def check_quality(model, X, y, n_folds=5, random_state=0, shuffle=False):
    skf = StratifiedKFold(y, n_folds=n_folds, random_state=random_state, shuffle=shuffle)
    scores = []
    
    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        
        scores.append(score)
        
    return np.mean(scores), np.std(scores)

def train_and_verify(df, model):
    X, y = get_X_y( df[ df.is_test == False ] )
    return check_quality(model, X, y)

In [6]:
class SingleVariableModel(BaseEstimator, ClassifierMixin):
    def __init__(self, seed=1):
        np.random.seed(seed)

    def fit(self, X, y):
        return self
        
    def predict(self, X):
        
        
        return [0] * len(X)
    
    def __repr__(self):
        return 'SingleVariableModel'

## Run & evoluate single variable model

In [7]:
train_and_verify(all_df, SingleVariableModel())

(0.61616490890978648, 0.0015536004208290756)

**What do you think about this result?**

## Let's build more advanced model

### Missing values
There're several methods how to manage missing values, let's fill out -1.

In [8]:
all_df.fillna(-1, inplace=True)

In [9]:
train_and_verify(all_df, RandomForestClassifier())

(0.6836195074308804, 0.045102412780797671)

The result looks better than previous one (**0.616** vs **0.683**).  
Let's improve it... by using those features `['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']`

In [10]:
if 'sex_male' not in all_df:
    one_hot = pd.get_dummies( all_df.Sex, prefix='sex')
    all_df = pd.concat( [all_df, one_hot], axis=1 )

In [11]:
train_and_verify(all_df, RandomForestClassifier())

(0.77233454158062287, 0.081835175672608979)

The result looks better than previous one (**0.683** vs **0.772**).  

Next features wchich could improve quality the model

In [17]:
all_df[ ['Name', 'Cabin', 'Embarked', 'Ticket'] ].head()

Unnamed: 0_level_0,Name,Cabin,Embarked,Ticket
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"Braund, Mr. Owen Harris",-1,S,A/5 21171
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C85,C,PC 17599
1,"Heikkinen, Miss. Laina",-1,S,STON/O2. 3101282
1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",C123,S,113803
0,"Allen, Mr. William Henry",-1,S,373450
