In [1]:
import numpy as np
import pandas as pd

In [2]:
# Construct a dict to easily choose the features used in the model. 1 means used, 0 means not used.
feature_keep_dict = {'PassengerId': 1,
                    'Pclass': 1,
                    'Name': 0,
                    'Sex': 1,
                    'Age': 1,
                    'SibSp': 1,
                    'Parch': 1,
                    'Ticket': 0,
                    'Fare': 1,
                    'Cabin': 0,
                    'Embarked': 1}

def open_and_clean_train_test_data():
    '''Opens the train.csv and test.csv dataset into pandas dataframes.
    Removes columns which will not be used and returns the features
    and target dataframes.'''
    
    X_train = pd.read_csv('train.csv')
    X_test = pd.read_csv('test.csv')
    
    y_train = X_train.Survived
    X_train.drop(['Survived'], axis=1, inplace=True)
    
    for feature, keep in feature_keep_dict.items():
        if not keep:
            X_train.drop([feature], axis=1, inplace=True)
            X_test.drop([feature], axis=1, inplace=True)
            
    return X_train, y_train, X_test

In [3]:
X_train, y_train, X_test = open_and_clean_train_test_data()

In [4]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [5]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [6]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [7]:
X_train.count()
#Age and Embarked are shown to have missing values

PassengerId    891
Pclass         891
Sex            891
Age            714
SibSp          891
Parch          891
Fare           891
Embarked       889
dtype: int64

In [8]:
X_test.count()
#Age and fare are missing values in the test set

PassengerId    418
Pclass         418
Sex            418
Age            332
SibSp          418
Parch          418
Fare           417
Embarked       418
dtype: int64

In [9]:
def get_grouped_averages():
    '''Find the average age and fare for male/female and class 1/2/3.
    Returns a dict of results for later use.'''
    
    group_avg = X_train.groupby(['Pclass', 'Sex']).mean()
    age_group = group_avg.loc[:, 'Age']
    fare_group = group_avg.loc[:, 'Fare']
    avg_age_by_class_and_sex = {}
    avg_fare_by_class_and_sex = {}
    
    for class_type in [1, 2, 3]:
        for sex in ['female', 'male']:
            avg_age_by_class_and_sex[(class_type, sex)] = np.round(age_group[class_type][sex], 2)
            avg_fare_by_class_and_sex[(class_type, sex)] = np.round(fare_group[class_type][sex], 2)
            
    return avg_age_by_class_and_sex, avg_fare_by_class_and_sex

In [10]:
avg_age_by_class_and_sex, avg_fare_by_class_and_sex = get_grouped_averages()

In [11]:
avg_age_by_class_and_sex

{(1, 'female'): 34.61,
 (1, 'male'): 41.28,
 (2, 'female'): 28.72,
 (2, 'male'): 30.74,
 (3, 'female'): 21.75,
 (3, 'male'): 26.51}

In [12]:
avg_fare_by_class_and_sex

{(1, 'female'): 106.13,
 (1, 'male'): 67.23,
 (2, 'female'): 21.97,
 (2, 'male'): 19.74,
 (3, 'female'): 16.12,
 (3, 'male'): 12.66}

In [13]:
def build_test_train_masks():
    '''Construct masks used to filter or set missing rows in the dataframe'''
    mask_dict = {'train': {}, 'test': {}}
    for class_type in [1, 2, 3]:
        mask_dict['train'][class_type] = X_train.Pclass == class_type
        mask_dict['test'][class_type] = X_test.Pclass == class_type
    for sex in ['male', 'female']:
        mask_dict['train'][sex] = X_train.Sex == sex
        mask_dict['test'][sex] = X_test.Sex == sex
        
    return mask_dict

In [14]:
mask_dict = build_test_train_masks()

In [15]:
X_train.Embarked.value_counts()
# The majority of people embarked from 'S', so I will assume the 2 people missing embarked also came from 'S'

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [16]:
def fill_age():
    '''For train and test sets, adds missing ages depending on sex and class'''
    global X_train, X_test
    for sex in ['male', 'female']:
        for class_type in [1, 2, 3]:
            X_train.loc[mask_dict['train'][sex] 
                         & mask_dict['train'][class_type], ['Age']] = avg_age_by_class_and_sex[(class_type, sex)]
            X_test.loc[mask_dict['test'][sex] 
                         & mask_dict['test'][class_type], ['Age']] = avg_age_by_class_and_sex[(class_type, sex)]
    return

In [17]:
fill_age()

In [18]:
def fill_embarked():
    '''For train set only, adds missing embarkation port'''
    global X_train
    X_train.loc[pd.isnull(X_train.Embarked), 'Embarked'] = 'S'
    return

In [19]:
fill_embarked()

In [20]:
X_train.count()

PassengerId    891
Pclass         891
Sex            891
Age            891
SibSp          891
Parch          891
Fare           891
Embarked       891
dtype: int64

In [21]:
def fill_fare():
    '''Fills missing fares based on sex and class in the test set'''
    global X_test
    for sex in ['male', 'female']:
        for class_type in [1, 2, 3]:
            X_test.loc[mask_dict['test'][sex] 
                       & mask_dict['test'][class_type], 'Fare'] = avg_fare_by_class_and_sex[(class_type, sex)]
    return

In [22]:
fill_fare()

In [23]:
X_test.count()

PassengerId    418
Pclass         418
Sex            418
Age            418
SibSp          418
Parch          418
Fare           418
Embarked       418
dtype: int64

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

def build_model():
    '''Builds and returns a Logistic Regression model'''
    pipe = Pipeline([
    ('column_transformer', ColumnTransformer([('one_hot_class', OneHotEncoder(), ['Pclass']),
                                              ('one_hot_sex', OneHotEncoder(), ['Sex']),
                                              #('one_hot_sibsp', OneHotEncoder(), ['SibSp']),
                                              #('one_hot_parch', OneHotEncoder(), ['Parch']),
                                              ('std_scaler_age', StandardScaler(), ['Age']),
                                              ('std_scaler_fare', StandardScaler(), ['Fare']),
                                              ('one_hot_embarked', OneHotEncoder(), ['Embarked'])],
                                             )),
    ('logreg', LogisticRegression())
    ])
    
    return pipe

In [25]:
from sklearn.model_selection import GridSearchCV
def optimize_and_run_model():
    '''Uses GridSearchCV to optimize the model, returning the best model selected'''
    pipe = build_model()
    gs = GridSearchCV(pipe, param_grid={'logreg__C': np.arange(1, 2, .01)}, n_jobs=-1, cv=5, verbose=2)
    gs.fit(X_train, y_train)
    print('Best Model Params:', gs.best_params_)
    return gs

In [26]:
best_model = optimize_and_run_model()

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 245 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 493 out of 500 | elapsed:   15.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   15.5s finished
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Best Model Params: {'logreg__C': 1.6200000000000006}


In [27]:
best_model.best_params_

{'logreg__C': 1.6200000000000006}

In [28]:
y_pred = best_model.predict(X_test.drop('PassengerId', axis=1))

In [29]:
solution = X_test.loc[:, ['PassengerId']]
solution.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [30]:
solution['Survived'] = y_pred
solution.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [31]:
solution.to_csv('solution.csv', index=False)
#Scored 0.75119 logreg