In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mglearn
from IPython.display import display

%matplotlib inline


In [None]:
def unique_values(dataset, feature):
    """Return the distinct values found in column ``feature`` of ``dataset``.

    The values come back in the order produced by the column's ``unique()``
    method.
    """
    column = dataset[feature]
    return column.unique()

def one_hotter_encoding(dataset, features):
    """Return a copy of ``dataset`` with one-hot indicator columns appended.

    For every column named in ``features`` and every distinct non-missing
    value in it, a float column ``"<feature>_<value>"`` is added that holds
    1.0 where the row equals that value and 0.0 elsewhere. The source columns
    are kept; the input frame is not modified.

    Missing values are skipped: NaN never compares equal to anything, so an
    indicator for it would be an all-zero ``"<feature>_nan"`` column.

    Categories are processed in sorted order (by their string form) so that
    two frames holding the same set of categories always get their indicator
    columns in the same order, regardless of row order.
    """
    encoded = dataset.copy()

    for feature in features:
        column = encoded[feature]
        # key=str keeps the sort well-defined even for mixed-type columns.
        categories = sorted(column.dropna().unique(), key=str)
        for value in categories:
            encoded[f"{feature}_{value}"] = (column == value).astype(float)

    return encoded
    

In [None]:
# Cap the number of rows pandas prints when displaying a frame.
pd.set_option('display.max_rows', 30)

# MODEL BUILDING #

### Read Data ###

In [None]:
data = pd.read_csv('data/train.csv', index_col='PassengerId')

# One synthetic passenger whose name carries the title "Dona".
# NOTE(review): presumably added so that this title also occurs in the
# training frame — confirm the intent.
extra_passenger = pd.DataFrame({
    'Survived': [0],
    'Pclass': [3],
    'Name': ['Nobody Dona. Who'],
    'Sex': ['male'],
    'Age': [38],
    'SibSp': [1],
    'Parch': [0],
    'Ticket': ['NOMATTER'],
    'Fare' : [32],
    'Cabin' : ['NOMATTER'],
    'Embarked': ['S']
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the extra row under the loaded frame.
data = pd.concat([data, extra_passenger])

### Examine Data ###

### Title ###

In [None]:
def add_titles(data):
    """Return a copy of ``data`` with a ``Title`` column derived from ``Name``.

    The title is the first run of ASCII letters that is immediately followed
    by a period in the name (NaN when there is no such run). Spelling
    variants are then merged (``Mlle``/``Ms`` -> ``Miss``, ``Mme`` -> ``Mrs``)
    and the infrequent titles listed below are collapsed into ``'Rare'``.
    The input frame is not modified.
    """
    result = data.copy()

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    result['Title'] = result['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

    rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Don', 'Capt', 'Countess',
                   'Jonkheer', 'Sir', 'Lady', 'Dona']
    normalised = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    normalised.update(dict.fromkeys(rare_titles, 'Rare'))

    result['Title'] = result['Title'].replace(normalised)

    return result

In [None]:
# Derive the Title column on the training frame.
data = data.pipe(add_titles)

### Finding Age by Mean ###

In [None]:
# Global mean age, displayed for reference only.
# Overwriting every missing Age with this value here would leave nothing for
# fill_age_by_title() to impute in the cells below, and the test frame is
# imputed by title only — so training and test data would be prepared
# differently. The per-title imputation is therefore left to do the filling.
data['Age'].mean()

### Finding Age by Title ###

In [None]:
def fill_age_by_title(data, age_by_title=None):
    """Return a copy of ``data`` with missing ``Age`` filled per ``Title``.

    Parameters
    ----------
    data : DataFrame with ``Title`` and ``Age`` columns.
    age_by_title : mapping or Series, optional
        Replacement age for each title. When omitted, the mean ``Age`` of
        each title group in ``data`` itself is used. Passing the values
        computed on another frame lets both frames share the same fill.

    Rows whose title is missing, or has no entry / no known mean, keep a
    missing ``Age``. Ages that are already present are never changed, and the
    input frame is not modified.
    """
    result = data.copy()

    if age_by_title is None:
        age_by_title = result.groupby('Title')['Age'].mean()

    # Mapping through the title avoids one hard-coded lookup per title, which
    # raised KeyError whenever a title did not occur in the frame.
    result['Age'] = result['Age'].fillna(result['Title'].map(age_by_title))

    return result

In [None]:
# Impute missing ages on the training frame from the per-title means.
data = data.pipe(fill_age_by_title)

In [None]:
def fill_embarked_drop_cabin(data):
    """Return a new frame with missing ``Embarked`` set to ``'S'`` and ``Cabin`` removed.

    The input frame is left untouched.
    """
    embarked_filled = data.Embarked.fillna('S')
    return data.assign(Embarked=embarked_filled).drop(columns='Cabin')

In [None]:
# Fill missing ports and discard Cabin on the training frame.
data = data.pipe(fill_embarked_drop_cabin)

### Family Size ###

In [None]:
def build_family(data):
    """Return a new frame where ``Parch`` and ``SibSp`` are replaced by ``FamilySize``.

    ``FamilySize`` is ``Parch + SibSp + 1`` (the +1 counts the passenger).
    The input frame is left untouched.
    """
    family_size = data.Parch + data.SibSp + 1
    with_size = data.assign(FamilySize=family_size)
    return with_size.drop(columns=['Parch', 'SibSp'])

# Collapse Parch/SibSp into FamilySize on the training frame.
data = data.pipe(build_family)



### ENCODE THESE FEATURES ###

In [None]:
# Add indicator columns for the three categorical features.
data = data.pipe(one_hotter_encoding, ['Pclass', 'Sex', 'Embarked'])

In [None]:
# Sex has two indicator columns; drop one of them and keep the other.
data = data.drop(columns="Sex_female")

In [None]:
# Add one indicator column per title.
data = data.pipe(one_hotter_encoding, ['Title'])

### DROPPING THESE FEATURES ###

In [None]:
# The raw text/categorical columns are no longer needed once encoded.
raw_columns = ['Ticket', 'Name', 'Pclass', 'Sex', 'Embarked', 'Title']
data = data.drop(columns=raw_columns)

### Age Grouping ### 

In [None]:
def age_grouping(data):
    """Return a new frame with four 0.0/1.0 age-band indicator columns added.

    Bands (lower bound inclusive, upper bound exclusive):
    ``AgeGroup1`` < 17, ``AgeGroup2`` 17-25, ``AgeGroup3`` 25-45,
    ``AgeGroup5`` >= 45 (the numbering skips 4). A missing age yields 0.0 in
    every band. The input frame is left untouched.
    """
    age = data.Age
    return data.assign(
        AgeGroup1=age.lt(17).astype(float),
        AgeGroup2=(age.ge(17) & age.lt(25)).astype(float),
        AgeGroup3=(age.ge(25) & age.lt(45)).astype(float),
        AgeGroup5=age.ge(45).astype(float),
    )

# Add the age-band indicators to the training frame.
data = data.pipe(age_grouping)

### Dropping Age ###

In [None]:
# The numeric age is dropped; only the age-band indicators remain.
data = data.drop(columns='Age')

### Family Size Grouping ###

In [None]:
def family_size_grouping(data):
    """Return a new frame with three 0.0/1.0 family-size indicator columns added.

    ``IsAlone`` marks ``FamilySize == 1``, ``IsSmallFamily`` marks sizes 2-4
    and ``IsLargeFamily`` marks sizes of 5 or more. The input frame is left
    untouched.
    """
    size = data.FamilySize
    return data.assign(
        IsAlone=size.eq(1).astype(float),
        IsSmallFamily=(size.ge(2) & size.lt(5)).astype(float),
        IsLargeFamily=size.ge(5).astype(float),
    )


# Add the family-size indicators to the training frame.
data = data.pipe(family_size_grouping)

### Dropping Family Size ###

In [None]:
# The numeric family size is dropped; only its indicators remain.
data = data.drop(columns='FamilySize')

### Dropping Fare  ###

In [None]:
# Fare is not used as a model feature.
data = data.drop(columns='Fare')

### Train Test Split ###

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the feature matrix.
y = data['Survived']
X = data.drop(columns='Survived')

# Stratified hold-out split with a fixed seed and the default test fraction.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, stratify=y, random_state=0)


### RANDOM FOREST Model ###

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with default hyper-parameters.
# random_state is pinned so the reported scores are the same on every
# Restart & Run All; without it each run trains a different forest.
rff_model = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train, y_train)
print("train score:", rff_model.score(X_train, y_train))
print("test score: ", rff_model.score(X_test, y_test))

### LOGISTIC REGRESSION MODEL ###

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with default hyper-parameters.
log_model = LogisticRegression().fit(X_train, y_train)
print("train score: ", log_model.score(X_train, y_train))
# This line scores the held-out split, so it is labelled as the test score.
print("test score:  ", log_model.score(X_test, y_test))

### GRID SEARCH LOGISTIC ###

In [None]:
from sklearn.model_selection import GridSearchCV

# return_train_score=True is required for the 'mean_train_score' column that
# is selected below; without it cv_results_ holds test scores only and the
# column lookup raises KeyError.
# NOTE(review): the search is fitted on the full X / y, which includes the rows
# later used as the hold-out split — confirm that this is intended.
search = GridSearchCV(
    log_model,
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
    return_train_score=True,
)
search.fit(X, y)

pd.DataFrame(search.cv_results_)[['rank_test_score', 'mean_test_score', 'mean_train_score', 'params']].sort_values(by='rank_test_score').head(10)

### GRID SEARCH RFF ###

In [None]:
from sklearn.model_selection import GridSearchCV

# return_train_score=True is required for the 'mean_train_score' column that
# is selected below; without it cv_results_ holds test scores only and the
# column lookup raises KeyError.
# NOTE(review): the search is fitted on the full X / y, which includes the rows
# later used as the hold-out split — confirm that this is intended.
search = GridSearchCV(
    rff_model,
    {'n_estimators': [10, 30, 50, 70, 80, 100],
     'max_depth': [2, 4, 6, 8, 10, 12, 15]},
    return_train_score=True,
)
search.fit(X, y)

pd.DataFrame(search.cv_results_)[['rank_test_score', 'mean_test_score', 'mean_train_score', 'params']].sort_values(by='rank_test_score').head(10)

### LAST TIME ###

In [None]:
# Final random forest with hand-picked hyper-parameters.
# random_state is pinned so the scores and the exported predictions are
# reproducible from one run to the next.
rff_model = RandomForestClassifier(
    n_jobs=-1,
    max_depth=4,
    n_estimators=80,
    min_samples_split=15,
    random_state=0,
).fit(X_train, y_train)
print("train score:", rff_model.score(X_train, y_train))
print("test score: ", rff_model.score(X_test, y_test))

In [None]:
# Final logistic regression with C=0.1, scored on both splits.
log_model = LogisticRegression(C=0.1)
log_model.fit(X_train, y_train)

train_accuracy = log_model.score(X_train, y_train)
test_accuracy = log_model.score(X_test, y_test)
print("train score:", train_accuracy)
print("test score: ", test_accuracy)

# TEST FORMATTING #

In [None]:
# Load the submission passengers, indexed by PassengerId.
test = pd.read_csv(
    'data/test.csv',
    index_col=['PassengerId'],
)

### Titles ###

In [None]:
# Derive titles, then impute missing ages from the per-title means.
test = test.pipe(add_titles).pipe(fill_age_by_title)

### Embarked ###

In [None]:
# Fill missing ports and discard Cabin on the test frame.
test = test.pipe(fill_embarked_drop_cabin)

### Family Size ###

In [None]:
# Collapse Parch/SibSp into FamilySize on the test frame.
test = test.pipe(build_family)

### Encoding PCLASS SEX AND EMBARKED ###

In [None]:
# Add indicator columns for the three categorical features.
test = test.pipe(one_hotter_encoding, ['Pclass', 'Sex', 'Embarked'])

### Encoding 2 SEXES into 1 column ### 

In [None]:
# Sex has two indicator columns; drop one of them and keep the other.
test = test.drop(columns="Sex_female")


### Encoding Title ###

In [None]:
# Add one indicator column per title.
test = test.pipe(one_hotter_encoding, ['Title'])

### Dropping the features ###

In [None]:
# The raw text/categorical columns are no longer needed once encoded.
raw_columns = ['Ticket', 'Name', 'Pclass', 'Sex', 'Embarked', 'Title']
test = test.drop(columns=raw_columns)

### Age grouping ###

In [None]:
# Add the age-band indicators to the test frame.
test = test.pipe(age_grouping)

### Dropping Age ###

In [None]:
# The numeric age is dropped; only the age-band indicators remain.
test = test.drop(columns='Age')

### Family Size Grouping ###

In [None]:
# Add the family-size indicators to the test frame.
test = test.pipe(family_size_grouping)

### Dropping Family ###

In [None]:
# The numeric family size is dropped; only its indicators remain.
test = test.drop(columns='FamilySize')

### Dropping Fare Price ###

In [None]:
# Fare is not used as a model feature.
test = test.drop(columns='Fare')

In [None]:
# 'Fare' has already been dropped from `test` by the preceding cell, so reading
# test.Fare unconditionally raises AttributeError and stops Run All.
# Only impute when the column is actually present.
if 'Fare' in test.columns:
    test['Fare'] = test['Fare'].fillna(35)

In [None]:
# Columns present in the test frame but absent from the training frame.
set(test.columns).difference(data.columns)

In [None]:
# Columns present in the training frame but absent from the test frame.
set(data.columns).difference(test.columns)

# PREDICTING #

In [None]:
# Shell escape: lists the contents of the current working directory.
# NOTE(review): looks like a leftover sanity check before writing the
# prediction files — confirm whether it is still needed.
!ls

In [None]:
# Give the model exactly the columns it was fitted on, in the same order.
# The indicator columns of the training and test frames are generated
# independently, so their order (or presence) is not guaranteed to match;
# an indicator missing from the test frame is filled with 0.0.
test_features = test.reindex(columns=X.columns, fill_value=0.0)
predictions = rff_model.predict(test_features)

# `test` is already indexed by PassengerId, so the CSV does not need to be
# read a second time just to recover the ids.
frame = pd.DataFrame(
    {'Survived': predictions},
    index=test.index.rename('PassengerId'),
)
frame.to_csv('predictions/rff_predictions.csv')
frame.head()

In [None]:
# Give the model exactly the columns it was fitted on, in the same order.
# The indicator columns of the training and test frames are generated
# independently, so their order (or presence) is not guaranteed to match;
# an indicator missing from the test frame is filled with 0.0.
test_features = test.reindex(columns=X.columns, fill_value=0.0)
log_predictions = log_model.predict(test_features)

# `test` is already indexed by PassengerId, so the CSV does not need to be
# read a second time just to recover the ids.
frame = pd.DataFrame(
    {'Survived': log_predictions},
    index=test.index.rename('PassengerId'),
)
frame.to_csv('predictions/log_predictions.csv')
frame.head()