In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mglearn
from IPython.display import display

%matplotlib inline


In [2]:
def unique_values(dataset, feature):
    return dataset[feature].unique()

def one_hotter_encoding(dataset, features):
    encoded = dataset.copy()
    
    for feature in features:
        unique_vals = unique_values(encoded, feature)
        for value in unique_vals:
            encoded[f"{feature}_{value}"] = (encoded[feature] == value).astype(float)
    
    return encoded    
    

In [3]:
pd.options.display.max_rows = 30

# MODEL BUILDING #

### Read Data ###

In [4]:
data = pd.read_csv('data/train.csv', index_col='PassengerId')
data.columns
data = data.append(pd.DataFrame({
    'Survived': [0],
    'Pclass': [3],
    'Name': ['Nobody Dona. Who'],
    'Sex': ['male'],
    'Age': [38],
    'SibSp': [1],
    'Parch': [0],
    'Ticket': ['NOMATTER'],
    'Fare' : [32],
    'Cabin' : ['NOMATTER'],
    'Embarked': ['S']
}))

### Title ###

In [5]:
def add_titles(data):
    result = data.copy()
    result['Title'] = result.Name.str.extract('([A-Za-z]+)\.', expand=False)
    result.loc[result.Title == 'Mlle', 'Title'] = 'Miss'
    result.loc[result.Title == 'Mme', 'Title']  = 'Mrs'
    result.loc[result.Title == 'Ms', 'Title']   = 'Miss'
    rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Don', 'Capt', 'Countess','Jonkheer', 'Sir', 'Lady', 'Dona']

    result.Title = result.Title.replace(rare_titles, 'Rare')
    
    return result

In [6]:
data = add_titles(data)

### Finding Age by Title ###

In [7]:
def fill_age_by_title(data):
    result = data.copy()
    age_by_title = result.groupby('Title').Age.mean()
    result.loc[result.Age.isnull() & (result.Title == 'Mr'), 'Age'] = age_by_title['Mr']
    result.loc[result.Age.isnull() & (result.Title == 'Mrs'), 'Age'] = age_by_title['Mrs']
    result.loc[result.Age.isnull() & (result.Title == 'Miss'), 'Age'] = age_by_title['Miss']
    result.loc[result.Age.isnull() & (result.Title == 'Master'), 'Age'] = age_by_title['Master']
    result.loc[result.Age.isnull() & (result.Title == 'Rare'), 'Age'] = age_by_title['Rare']
    
    return result

In [8]:
data = fill_age_by_title(data)

In [9]:
def fill_embarked_drop_cabin(data):
    result = data.copy()
    result.Embarked = result.Embarked.fillna('S')
    result = result.drop('Cabin', axis=1)
    
    return result

In [10]:
data = fill_embarked_drop_cabin(data)

### Family Size ###

In [11]:
def build_family(data):
    result = data.copy()
    result['FamilySize'] = result.Parch + result.SibSp + 1
    result = result.drop(['Parch', 'SibSp'], axis=1)
    
    return result

data = build_family(data)



In [12]:
pretty_data = data.copy()

### ENCODE THESE FEATURES ###

In [13]:
encoded_data = one_hotter_encoding(pretty_data, ['Pclass', 'Sex', 'Embarked'])

In [14]:
encoded_data = encoded_data.drop("Sex_female", axis=1)

In [15]:
encoded_data = one_hotter_encoding(encoded_data, ['Title'])

### DROPPING THESE FEATURES ###

In [16]:
encoded_data = encoded_data.drop(['Ticket', 'Name', 'Pclass', 'Sex', 'Embarked','Title'], axis=1)

### Age Grouping ### 

In [None]:
def age_grouping(data):
    result = data.copy()
    result['AgeGroup1'] = (result.Age < 10).astype(float)
    result['AgeGroup2'] = ((10 <= result.Age) & (result.Age < 16)).astype(float)
    result['AgeGroup3'] = ((16 <= result.Age) & (result.Age < 25)).astype(float)
    result['AgeGroup4'] = ((25 <= result.Age) & (result.Age < 45)).astype(float)
    result['AgeGroup5'] = (40 <= result.Age).astype(float)
    
    return result

encoded_data = age_grouping(encoded_data)

### Dropping Age ###

In [None]:
encoded_data = encoded_data.drop('Age', axis=1)

### Family Size Grouping ###

In [None]:
def family_size_grouping(data):
    result = data.copy()        
    result['IsAlone'] = (result.FamilySize == 1).astype(float)
    result['IsSmallFamily'] = ((2 <= result.FamilySize) & (result.FamilySize < 5)).astype(float)
    result['IsLargeFamily'] = (5 <= result.FamilySize).astype(float)
    
    return result


encoded_data = family_size_grouping(encoded_data)

### Droping Family Size ###

In [17]:
encoded_data = encoded_data.drop('FamilySize', axis=1)

### Dropping Fare  ###

In [18]:
encoded_data = encoded_data.drop('Fare',axis=1)

### Train Test Split ###

In [19]:
from sklearn.model_selection import train_test_split

X = encoded_data.drop('Survived', axis=1)
y = encoded_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)


### RANDOM FOREST Model ###

In [20]:
from sklearn.ensemble import RandomForestClassifier
rff_model = RandomForestClassifier(random_state=0, n_jobs=-1).fit(X_train, y_train)
print("train score:", rff_model.score(X_train, y_train))
print("test score: ", rff_model.score(X_test, y_test))

train score: 0.9177877429
test score:  0.793721973094


 ### LOGISTIC REGRESSION MODEL ###

In [21]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression().fit(X_train, y_train)
print("train score: ", log_model.score(X_train, y_train))
print("train score: ", log_model.score(X_test, y_test))

train score:  0.811659192825
train score:  0.780269058296


### GRID SEARCH LOGISTIC ###

In [22]:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(log_model, {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
search.fit(X, y)

pd.DataFrame(search.cv_results_)[['rank_test_score', 'mean_test_score', 'mean_train_score', 'params']].sort_values(by='rank_test_score').head(10)

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,params
2,1,0.802691,0.815589,{'C': 0.1}
1,2,0.79148,0.802698,{'C': 0.01}
4,2,0.79148,0.815028,{'C': 10}
5,2,0.79148,0.815589,{'C': 100}
6,2,0.79148,0.815589,{'C': 1000}
3,6,0.789238,0.812228,{'C': 1}
0,7,0.618834,0.618271,{'C': 0.001}


### GRID SEARCH RFF ###

In [23]:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(rff_model, {'n_estimators': [10, 30, 50, 70, 80, 100],
                                  'max_depth': [2, 4, 6, 8, 10, 12, 15]})
search.fit(X, y)

pd.DataFrame(search.cv_results_)[['rank_test_score', 'mean_test_score', 'mean_train_score', 'params']].sort_values(by='rank_test_score').head(10)

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,params
17,1,0.815022,0.867719,"{'max_depth': 6, 'n_estimators': 100}"
16,1,0.815022,0.866038,"{'max_depth': 6, 'n_estimators': 80}"
12,1,0.815022,0.864917,"{'max_depth': 6, 'n_estimators': 10}"
15,4,0.813901,0.867158,"{'max_depth': 6, 'n_estimators': 70}"
13,4,0.813901,0.866598,"{'max_depth': 6, 'n_estimators': 30}"
22,4,0.813901,0.89799,"{'max_depth': 8, 'n_estimators': 80}"
20,7,0.811659,0.899673,"{'max_depth': 8, 'n_estimators': 50}"
21,7,0.811659,0.896869,"{'max_depth': 8, 'n_estimators': 70}"
9,7,0.811659,0.83577,"{'max_depth': 4, 'n_estimators': 70}"
14,10,0.810538,0.866599,"{'max_depth': 6, 'n_estimators': 50}"


### LAST TIME ###

In [24]:
rff_model = RandomForestClassifier(random_state=0,n_jobs=-1, max_depth=6, n_estimators=100).fit(X_train, y_train)
print("train score:", rff_model.score(X_train, y_train))
print("test score: ", rff_model.score(X_test, y_test))

train score: 0.860986547085
test score:  0.816143497758


In [25]:
log_model = LogisticRegression(C=0.1).fit(X_train, y_train)
print("train score:", log_model.score(X_train, y_train))
print("test score: ", log_model.score(X_test, y_test))

train score: 0.807174887892
test score:  0.784753363229


# TEST FORMATTING #

In [26]:
test = pd.read_csv('data/test.csv', index_col=['PassengerId'])

### Titles ###

In [27]:
test = add_titles(test)
test = fill_age_by_title(test)

### Embarked ###

In [28]:
test = fill_embarked_drop_cabin(test)

### Family Size ###

In [29]:
test = build_family(test)

### Encoding PCLASS SEX AND EMBARKED ###

In [30]:
test = one_hotter_encoding(test, ['Pclass', 'Sex', 'Embarked'])

### Encoding 2 SEXES into 1 column ### 

In [31]:
test = test.drop("Sex_female", axis=1)


### Encoding Title ###

In [32]:
test = one_hotter_encoding(test, ['Title'])

### Dropping the features ###

In [33]:
test = test.drop(['Ticket', 'Name', 'Pclass', 'Sex', 'Embarked','Title'], axis=1)

### Age grouping ###

In [None]:
test = age_grouping(test)

### Dropping Age ###

In [None]:
test = test.drop('Age', axis=1)

### Familiy Size Grouping ###

In [None]:
test = family_size_grouping(test)

### Dropping Family ###

In [34]:
test = test.drop('FamilySize', axis=1)

### Dropping Fare Price ###

In [35]:
test = test.drop('Fare',axis=1)

In [None]:
test.Fare = test.Fare.fillna(35)

In [36]:
set(test.columns) - set(encoded_data.columns)

set()

In [37]:
set(encoded_data.columns) - set(test.columns)

{'Survived'}

# PREDICTING #

In [38]:
predictions = rff_model.predict(test)

frame = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv').PassengerId,
    'Survived': predictions
})
frame = frame.set_index('PassengerId')
frame.to_csv('/predictions/rff_predictions.csv')
frame.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [39]:
log_predictions = log_model.predict(test)
frame = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv').PassengerId,
    'Survived': log_predictions
})
frame = frame.set_index('PassengerId')
frame.to_csv('/predictions/log_predictions.csv')
frame.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
