In [None]:
##Loading Data

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the training and test CSV files through one shared reader expression.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Preview the first rows of the training data.
print(train_df.head())

In [None]:
##Setting Up Target and Features
# Target: the Survived column of the training data.
train_y = train_df["Survived"]

# Feature columns fed to the models; the trailing names are currently excluded.
passenger_features = ["Sex", "Age", "Pclass", "SibSp", "Parch", "Fare"] #, "Cabin", "Embarked", "Ticket"]

# Take explicit copies. Selecting a list of columns yields a frame that pandas
# still tracks as derived from its parent, so the later in-place write to the
# Sex column (see convert) would otherwise raise SettingWithCopyWarning.
train_X = train_df[passenger_features].copy()
test_X = test_df[passenger_features].copy()

print(train_X.head())

In [None]:
##Convert Categorical Data
def convert(df):
    """Encode the ``Sex`` column of ``df`` in place: 1 for 'male', 0 otherwise.

    Any value that is not exactly the string 'male' (including missing
    values) becomes 0.

    The function is idempotent: if ``Sex`` already has a numeric dtype it is
    left untouched. Without that guard, re-running the notebook cell would
    compare the already-encoded 1 against 'male' and reset every row to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Sex`` column; modified in place.

    Returns
    -------
    None
    """
    sex = df["Sex"]
    if pd.api.types.is_numeric_dtype(sex):
        # Already encoded by an earlier call; nothing to do.
        return
    # Vectorized comparison replaces the manual Python loop.
    df["Sex"] = (sex == "male").astype(int)
# Encode the Sex column of both feature frames (convert works in place).
for feature_frame in (train_X, test_X):
    convert(feature_frame)
#print(test_X.describe())
#print(train_X.describe())

In [None]:
##Impute Missing Values
from sklearn.impute import SimpleImputer

# Imputation: fit on the training features, then apply the same fitted
# imputer to the test features.
my_imputer = SimpleImputer()

# The column labels are passed straight to the DataFrame constructor instead
# of being re-attached in a separate step afterwards.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(train_X), columns=train_X.columns
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(test_X), columns=test_X.columns
)

# Columns that held at least one missing value before imputation
# (only used by the optional prints below).
cols_with_missing1 = [
    name for name, has_gap in train_X.isnull().any().items() if has_gap
]
cols_with_missing2 = [
    name for name, has_gap in test_X.isnull().any().items() if has_gap
]

#print(cols_with_missing1)
#print(cols_with_missing2)

# From here on the feature frames are the imputed versions.
train_X = imputed_X_train
test_X = imputed_X_test

#print(test_X.describe())
#print(train_X.describe())

In [None]:
##Calculate Predictions
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a seeded 100-tree random forest on the training data.
forest_model = RandomForestRegressor(n_estimators=100, random_state=1)
forest_model.fit(train_X, train_y)

# Threshold the regression output: scores above 0.5 become 1, the rest 0.
# The result keeps the dtype of the raw predictions.
preds = forest_model.predict(test_X)
preds = (preds > 0.5).astype(preds.dtype)

# Print the predictions as "id,label" rows under a header line.
print("PassengerId, Survived")
pID = test_df.PassengerId.tolist()
for passenger_id, survived in zip(pID, preds):
    print(f"{passenger_id},{int(survived)}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def bar_graph(labels, results, xlabel='Gender', ylabel='Percent Survived',
              title='Predicted Titanic Survivors by Gender'):
    """Draw and show a bar chart with one bar per label.

    Parameters
    ----------
    labels : sequence
        Tick labels, one per bar.
    results : sequence
        Bar heights, in the same order as ``labels``.
    xlabel, ylabel, title : str, optional
        Axis labels and chart title. The defaults reproduce the previously
        hard-coded gender/survival text, so existing two-argument calls
        behave as before.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    # np.arange (single "r"): the original np.arrange does not exist and
    # raised AttributeError on every call.
    positions = np.arange(len(labels))
    plt.bar(positions, results)
    plt.xlabel(xlabel, fontsize=5)
    plt.ylabel(ylabel, fontsize=5)
    plt.xticks(positions, labels, fontsize=5, rotation=30)
    plt.title(title)
    plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Train a single seeded decision tree on the training data (fit returns the
# estimator itself, so construction and fitting are chained), then score the
# test features.
passenger_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predicted_survivals = passenger_model.predict(test_X)

# Optional: write the predictions to a submission file.
#output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predicted_survivals})
#output.to_csv('submission.csv', index=False)

In [None]:
from sklearn.metrics import mean_absolute_error

# In-sample error of the decision tree. The original call compared an
# undefined name `y` against predictions made on the test features, for which
# this notebook has no labels. Scoring the training labels against predictions
# on the training features is the comparison that is actually available here.
# NOTE(review): this is a training-set score, so it is likely optimistic;
# a held-out split gives a fairer estimate.
mean_absolute_error(train_y, passenger_model.predict(train_X))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set. The original call referenced undefined names
# `X` and `y`. The split is taken from the full imputed training features and
# the original label column rather than from train_X/train_y, because this
# cell overwrites those two names; splitting from the stable sources keeps the
# cell re-runnable without shrinking the data each time.
train_X, val_X, train_y, val_y = train_test_split(
    imputed_X_train, train_df.Survived, random_state=0
)

# Seeded so repeated runs report the same validation error.
survival_model = DecisionTreeRegressor(random_state=0)
survival_model.fit(train_X, train_y)

val_predictions = survival_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited tree.

    A ``DecisionTreeRegressor`` capped at ``max_leaf_nodes`` leaves (seeded
    with ``random_state=0``) is fitted on ``train_X``/``train_y``; its
    predictions for ``val_X`` are scored against ``val_y``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Compare validation error across tree sizes.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %.4f instead of %d: the integer format truncated the float error, so any
    # value below 1 was printed as 0 and the comparison was lost.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.4f" % (max_leaf_nodes, my_mae))