In [134]:
import pandas
# Load the training split; `data` is the frame that the following cells clean and model.
data = pandas.read_csv("train.csv")

# Show the first 5 rows, then the summary produced by describe().
# print is called as a function (not the Python 2 statement form) so this cell is
# consistent with the print(...) calls in later cells and also parses on Python 3.
print(data.head(5))
print(data.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
       P

If "Age" is NaN for a passenger, we substitute the missing value with the median of all the ages we are provided with.

In [135]:
# Impute missing ages: every NaN in "Age" is replaced by the median of the column.
data["Age"] = data["Age"].fillna(value=data["Age"].median())

In addition to the above, we detect how many different values of "Sex" exist and then recode them as 1 and 0, for "female" and "male", respectively.

In [136]:
# List the distinct labels present in "Sex" before recoding them.
# print is called as a function so the cell matches the print(...) calls used
# elsewhere in the notebook and also parses on Python 3.
print(data["Sex"].unique())

# Recode the labels as integers in place: "male" -> 0, "female" -> 1.
data.loc[data["Sex"] == "male", "Sex"] = 0
data.loc[data["Sex"] == "female", "Sex"] = 1

['male' 'female']


We will now apply a similar technique to the "Embarked" category: missing values are filled with "S", and then the "S", "C", and "Q" categories are assigned the numeric values 0, 1, and 2.

In [137]:
# Show the distinct "Embarked" labels (including NaN) before cleaning.
print(data["Embarked"].unique())

# Missing ports are treated as "S"; each port label is then replaced by its integer code.
data["Embarked"] = data["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    data.loc[data["Embarked"] == port_label, "Embarked"] = port_code

['S' 'C' 'Q' nan]


In [138]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

# Feature columns fed to the model; "Survived" is the target.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# A single estimator instance is re-fitted from scratch on every fold.
alg = LinearRegression()

# Three folds over all rows; iterating `kf` yields (train_indices, test_indices) pairs.
# NOTE(review): shuffling is not requested here, so random_state probably has no
# effect on the splits - confirm against the installed scikit-learn version.
kf = KFold(data.shape[0], n_folds=3, random_state=1)

# One array of raw (continuous) predictions per fold, in the order the folds are produced.
predictions = []
for train, test in kf:
    # Fit only on the rows of the training folds ...
    alg.fit(data[predictors].iloc[train, :], data["Survived"].iloc[train])
    # ... and predict the rows of the held-out fold.
    predictions.append(alg.predict(data[predictors].iloc[test, :]))


By counting the values in predictions that exactly match their counterparts in data["Survived"], and then dividing by the total number of passengers, we obtain the accuracy of the model.

In [139]:
import numpy as np

# The fold loop left one numpy array of predictions per fold. Join them into a
# single 1-D array (axis 0 is their only axis).
predictions = np.concatenate(predictions, axis=0)

# Turn the continuous regression outputs into class labels: above 0.5 -> 1, otherwise -> 0.
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0

# Fraction of passengers whose predicted label equals the true label.
# NOTE(review): this compares the array with data["Survived"] element by element, so it
# relies on the concatenated predictions being in the same row order as `data`.
accuracy = float(sum(predictions == data["Survived"])) / len(predictions)

# print is called as a function so the cell matches the print(...) calls used
# elsewhere in the notebook and also parses on Python 3.
print(accuracy)

0.783389450056


We want to improve that accuracy value, as it is just 78.3%

In [140]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed.
alg = LogisticRegression(random_state=1)
# cross_val_score does the split / fit / score loop for us and returns one score per fold (3 folds).
scores = cross_validation.cross_val_score(
    estimator=alg,
    X=data[predictors],
    y=data["Survived"],
    cv=3,
)
# Report the average of the per-fold scores.
print(scores.mean())

0.787878787879


Now that we have finished working with our training data, let's apply the exact same procedure to our test data.

In [141]:
# Load the test passengers and repeat the cleaning steps applied to the training frame.
titanic_test = pandas.read_csv("test.csv")

# Fill NaNs in "Age" and "Fare" with that column's own median from this file.
for numeric_column in ("Age", "Fare"):
    titanic_test[numeric_column] = titanic_test[numeric_column].fillna(
        titanic_test[numeric_column].median()
    )

# Recode "Sex" as integers: "male" -> 0, "female" -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code

# Missing ports are treated as "S"; ports are then recoded "S" -> 0, "C" -> 1, "Q" -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

In [142]:
# Fit a fresh logistic regression on every training row (fit returns the estimator itself).
alg = LogisticRegression(random_state=1).fit(data[predictors], data["Survived"])

# Predict a label for each row of the cleaned test frame.
predictions = alg.predict(titanic_test[predictors])

# The submission file holds only the passenger id and the predicted label.
submission = pandas.DataFrame(
    {"PassengerId": titanic_test["PassengerId"], "Survived": predictions}
)

# Write without the index column.
submission.to_csv("kaggle.csv", index=False)

Iteration 1 - Time to improve our accuracy!

As was pointed out in class, it is good to encode data in a manner that is not misleading. Hence, I decided to transform the Embarked feature into separate indicator columns that take only the values 0 and 1, instead of a single column coded 0, 1 and 2.

In [143]:
# Handle "S"
data.loc[data["Embarked"] == 0, "Embarked_S_New"] = 1
data["Embarked_S_New"] = data["Embarked_S_New"].fillna(0)

# Handle "C"
data.loc[data["Embarked"] == 1, "Embarked_C_New"] = 1
data["Embarked_C_New"] = data["Embarked_C_New"].fillna(0)

# Handle "Q"
data.loc[data["Embarked"] == 1, "Embarked_Q_New"] = 1
data["Embarked_Q_New"] = data["Embarked_Q_New"].fillna(0)

Using the same predictors as before adding the new Embarked values let's get a score back.

In [144]:
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_S_New', 'Embarked_C_New', 'Embarked_Q_New']

alg = LogisticRegression(random_state=1)

scores = cross_validation.cross_val_score(alg, data[predictors], data['Survived'], cv=3)

print scores.mean()

0.785634118967


Cool stuff, the prediction actually wasn't very good. Will submit and try another idea that comes to mind. Let's do what we just did in the test data.

In [145]:
titanic_test = pandas.read_csv("test.csv")

# Fill in age value that are NaNs with the median
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())

# Encode the recodes for "Sex", "Embarked"
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# Handle NaNs for "Fare" column
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Embarked"] == 0, "Embarked_S_New"] = 1
titanic_test["Embarked_S_New"] = titanic_test["Embarked_S_New"].fillna(0)

titanic_test.loc[titanic_test["Embarked"] == 1, "Embarked_C_New"] = 1
titanic_test["Embarked_C_New"] = titanic_test["Embarked_C_New"].fillna(0)

titanic_test.loc[titanic_test["Embarked"] == 1, "Embarked_Q_New"] = 1
titanic_test["Embarked_Q_New"] = titanic_test["Embarked_Q_New"].fillna(0)

alg = LogisticRegression(random_state=1)

alg.fit(data[predictors], data['Survived'])

predictions = alg.predict(titanic_test[predictors])

submission = pandas.DataFrame({
        'PassengerId': titanic_test['PassengerId'],
        'Survived': predictions
    })

submission.to_csv('kaggle.csv', index=False)

Iteration 2

My final idea for improving the model is to consider people of lower ages, and see how the survival of younger kids affects the survival prediction. I have a good feeling about this :)

In [146]:
# Child indicator derived from "Age": start from a copy of the age column, then
# overwrite it with 1 for ages up to and including 12 and with 0 for ages above 12.
data["ChildAge"] = data["Age"]
data.loc[data["Age"].le(12), "ChildAge"] = 1
data.loc[data["Age"].gt(12), "ChildAge"] = 0

Now the predictors have indeed changed to include the ChildAge. Let's implement again the LogisticRegression and checkout the scores.

In [147]:
# Feature columns for this iteration: the original integer-coded predictors
# plus the new "ChildAge" indicator.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "ChildAge",
]

In [148]:
# Re-score the logistic regression with the current predictors.
alg = LogisticRegression(random_state=1)

# One score per fold (3 folds).
scores = cross_validation.cross_val_score(alg, data[predictors], data['Survived'], cv=3)

# print is called as a function so the cell matches the print(...) calls used
# elsewhere in the notebook and also parses on Python 3.
print(scores.mean())

0.804713804714


Nice. I wonder what effect shifting the 12-year-old threshold lower or higher would have on the system's outcome. Adding the new feature to our test data, we get the following:

In [151]:
# Build the child indicator on the test frame with the same 12-year cut-off that the
# training frame's "ChildAge" column uses. A different cut-off here (it was 7) would make
# the column mean something different at prediction time than it did when the model was fitted.
titanic_test["ChildAge"] = titanic_test["Age"]
titanic_test.loc[titanic_test["ChildAge"] <= 12, "ChildAge"] = 1
titanic_test.loc[titanic_test["ChildAge"] > 12, "ChildAge"] = 0

# Fit on all training rows with the current predictors.
alg = LogisticRegression(random_state=1)

alg.fit(data[predictors], data["Survived"])

# Predict a label for each test passenger.
predictions = alg.predict(titanic_test[predictors])

# The submission file holds only the passenger id and the predicted label.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggleImprovedAge.csv", index=False)