In [None]:
################ Load datasets ################

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Remote locations of the train and test CSV files
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# Download both files, one DataFrame each (train first, then test)
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

# Show the first rows of the training data as the cell output
train.head()

In [None]:
################ Basic Manipulations & Preliminary Analysis ################

# Absolute number of passengers per value of the Survived column
print(train['Survived'].value_counts())

# Ratio between the sum of Survived and the number of rows where Survived == 0
tot = len(train.index)
surv = train['Survived'].sum()
dead = train.loc[train['Survived'] == 0, 'Survived'].count()
print(surv/dead)
# Equivalent: print(surv/(tot-surv))

# Count of rows that are both male and marked as survived
is_male_survivor = (train['Sex'] == 'male') & (train['Survived'] == 1)
print(train.loc[is_male_survivor, 'Survived'].value_counts())

# Same breakdown as the first print, expressed as proportions
print(train["Survived"].value_counts(normalize=True))

# Survived breakdown within each sex: raw counts first, then proportions
survived_by_sex = train.groupby('Sex')['Survived']
print(survived_by_sex.value_counts())
print(survived_by_sex.value_counts(normalize=True))

# Add a 'Child' indicator column: 1 where Age < 18, 0 where Age >= 18.
# Rows with a missing Age keep the initial NaN, because both comparisons
# evaluate to False for NaN.
# .loc[mask, column] writes into `train` itself. The chained form
# train['Child'][mask] = ... assigns through an intermediate object and is
# not guaranteed to update the frame (it never does under Copy-on-Write).
train['Child'] = float('NaN')
train.loc[train['Age'] < 18, 'Child'] = 1
train.loc[train['Age'] >= 18, 'Child'] = 0
# Printed explicitly: a bare expression that is not the last statement of a
# cell produces no output.
print(train['Child'].describe())

# Plot survival rate given age bucket.
# Rows without an Age are left out first: np.digitize assigns NaN to the last
# bucket, which would mix passengers of unknown age with the oldest ones.
age_known = train[train['Age'].notna()]

# 10 evenly spaced edges from 0 to the maximum age. With the default
# right=False, a value equal to the last edge gets bucket index len(bins).
bins = np.linspace(0, age_known['Age'].max(), 10)
age_bucket = np.digitize(age_known['Age'], bins)

# Proportion of each Survived value inside every age bucket
a = age_known['Survived'].groupby(age_bucket).value_counts(normalize=True)

# One stacked bar per bucket, one segment per Survived value
ax = a.unstack(level=1).plot(kind='bar', stacked=True, width = 1.)
ax.set_xlabel('Age bucket (index returned by np.digitize)')
ax.set_ylabel('Proportion of passengers')
ax.set_title('Survived proportions by age bucket');

In [None]:
################ First Tree ################

######### train set #########

# Convert the male and female groups to integer form.
# Series.map builds a new column that is assigned back to `train`, unlike the
# chained form train["Sex"][mask] = ..., which is not guaranteed to modify the
# frame. Values absent from the mapping are passed through unchanged, so
# running this cell again on already-encoded data changes nothing.
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(lambda label: sex_codes.get(label, label))

# Impute the Embarked variable: missing values become 'S'
train["Embarked"] = train["Embarked"].fillna('S')

# Convert the Embarked classes to integer form (same pass-through mapping)
embarked_codes = {"S": 0, "C": 1, "Q": 2}
train["Embarked"] = train["Embarked"].map(lambda port: embarked_codes.get(port, port))

# Fill NaN Age values with the median of the known ages
train["Age"] = train["Age"].fillna(train["Age"].median())

# Create the target and features numpy arrays: target, features_one
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

# Fit your first decision tree: my_tree_one.
# random_state is fixed (same value as my_tree_two) so that repeated runs
# build the same tree.
from sklearn import tree
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

# Look at the importance and score of the included features.
# The score is computed on the same rows the tree was fitted on.
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))


######### test set #########

# Fill missing Fare and Age values with the median of the test set
test.Fare = test.Fare.fillna(test.Fare.median())
# Alternative: test['Fare'] = test['Fare'].fillna(test['Fare'].median())
test["Age"] = test["Age"].fillna(test["Age"].median())

# Convert the male and female groups to integer form.
# Series.map builds a new column that is assigned back to `test`, unlike the
# chained form test["Sex"][mask] = ..., which is not guaranteed to modify the
# frame. Values absent from the mapping are passed through unchanged, so
# running this cell again on already-encoded data changes nothing.
sex_codes = {"male": 0, "female": 1}
test["Sex"] = test["Sex"].map(lambda label: sex_codes.get(label, label))

# Extract the features from the test set: Pclass, Sex, Age, and Fare.
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values

# Printed explicitly: a bare expression that is not the last statement of a
# cell produces no output.
print(test[["Pclass", "Sex", "Age", "Fare"]].describe())

# Make your prediction using the test set
my_prediction = my_tree_one.predict(test_features)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
# (PassengerId is passed as the index, Survived is the single data column)
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])

# Write your solution to a csv file with the name my_solution_one.csv
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])

In [None]:
################ Second Tree : Regularization ################

# Columns used as features by the second tree
columns_two = ["Pclass","Age","Sex","Fare", 'SibSp', 'Parch', 'Embarked']

# Summary statistics of those columns. Printed explicitly: a bare expression
# that is not the last statement of a cell produces no output.
print(train[columns_two].describe())

# Create a new array with the added features: features_two
features_two = train[columns_two].values

# Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

# Look at the importance and score of the included features.
# The score is computed on the same rows the tree was fitted on.
print(my_tree_two.feature_importances_)
print(my_tree_two.score(features_two, target))

In [None]:
################ Third Tree : Feature-engineering ################

# Create train_two, a copy of train, with the newly defined feature:
# family_size = SibSp + Parch + 1
train_two = train.copy()
train_two["family_size"] = train_two["SibSp"]+train_two["Parch"]+1

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

# Define the tree classifier, then fit the model.
# random_state is fixed (same value as my_tree_two) so that repeated runs
# build the same tree.
my_tree_three = tree.DecisionTreeClassifier(random_state = 1)
my_tree_three = my_tree_three.fit(features_three, target)

# Print the score of this decision tree, computed on the rows it was fitted on
print(my_tree_three.score(features_three, target))

In [None]:
################ Random Forest ################

# Import the `RandomForestClassifier`
from sklearn.ensemble import RandomForestClassifier

# We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

# Building and fitting my_forest
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(features_forest, target)

# Print the score of the fitted random forest, computed on the rows it was fitted on
print(my_forest.score(features_forest, target))

# Prepare the test set's Embarked column the same way as the training set:
# missing values become 'S', then the classes are converted to integers.
# Series.map builds a new column that is assigned back to `test`, unlike the
# chained form test["Embarked"][mask] = ..., which is not guaranteed to modify
# the frame. Values absent from the mapping are passed through unchanged, so
# running this cell again on already-encoded data changes nothing.
test["Embarked"] = test["Embarked"].fillna('S')
embarked_codes = {"S": 0, "C": 1, "Q": 2}
test["Embarked"] = test["Embarked"].map(lambda port: embarked_codes.get(port, port))

# Compute predictions on our test set features
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)