In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from one run to the next.
np.random.seed(1234)

In [None]:
# Download the Titanic training and test tables and parse them with pandas.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# Read the training file first, then the test file.
train, test = (pd.read_csv(url) for url in (train_url, test_url))

In [None]:
# Let's look at a few example rows of the training data
train.head()
# Notice how some data is missing (NaN)

In [None]:
# We can also use describe() to get overall summary statistics
train.describe()

Looking at the data, you will notice a lot of missing values. For example, some ages are NaN. It is normal for real-world data to have some missing values. There are several ways to handle them. The simplest is to throw away any rows that have missing values. However, this usually reduces the amount of training data you have. Another method is to guess what the missing value should be. The simplest guess is to use the median or mode of the data. For this exercise we will proceed with this approach.

For example, we can replace the missing ages with the median age using this command:
```
train["Age"] = train["Age"].fillna(train["Age"].median())
```

In [None]:
# Complete the following function
# feature_preprocess
# Input: pandas data frame
# Output: processed pandas data frame
# This function does the following:
# 1. Changes "S", "C", and "Q" in the "Embarked" column to 0, 1, and 2
# 2. Changes "male" and "female" in the "Sex" column to 0 and 1
# 3. Fills missing Age, Pclass, Sex, and Embarked values with the median/mode.
# 4. Normalizes the data to have similar ranges. (This is required for linear regression)

def feature_preprocess(data_in):
    """Encode, impute and rescale the columns used as model features.

    The input frame is copied, so ``data_in`` itself is left untouched.
    Every statistic (median, mode, maximum) is computed from the frame that
    is passed in, so the function does not depend on any notebook-level
    variable.

    Steps:
      1. "Embarked" values "S", "C", "Q" become 0, 1, 2.
      2. "Sex" values "male", "female" become 0, 1.
      3. Missing "Age" is filled with the column median; missing
         "Embarked", "Pclass" and "Sex" are filled with the column mode.
      4. Each of the four columns is divided by its own maximum.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Must contain the columns "Age", "Embarked", "Pclass" and "Sex".

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``data_in``; all other columns are unchanged.
    """
    data = data_in.copy()

    # Text categories -> integer codes. Anything outside the mapping
    # (including missing values) becomes NaN and is imputed just below.
    data["Embarked"] = data["Embarked"].map({"S": 0, "C": 1, "Q": 2})
    data["Sex"] = data["Sex"].map({"male": 0, "female": 1})

    # Age is continuous, so use the median of this frame.
    data["Age"] = data["Age"].fillna(data["Age"].median())

    # The remaining columns are categorical, so use the most frequent value.
    for column in ("Embarked", "Pclass", "Sex"):
        most_common = data[column].value_counts().idxmax()
        data[column] = data[column].fillna(most_common)

    # Rescale each feature by its maximum so the columns share a similar range.
    for column in ("Age", "Embarked", "Pclass", "Sex"):
        largest = data[column].max()
        # A maximum of 0 (e.g. a frame with only "male") would divide 0 by 0.
        if largest != 0:
            data[column] = data[column] / largest

    return data

In [None]:
# Run the same preprocessing on the training and the test frames
dataTrain = feature_preprocess(train)
dataTest = feature_preprocess(test)
# Let's see if it comes out right (shows the first 65 processed training rows)
dataTrain.head(65)

In [None]:
def extract_feature(data):
    """Return the four model features of ``data`` as a float32 NumPy array.

    Only the "Pclass", "Sex", "Age" and "Embarked" columns are used, in that
    order, so the result has one row per input row and four columns.
    """
    feature_columns = ["Pclass", "Sex", "Age", "Embarked"]
    selected = data[feature_columns]
    return np.array(selected.values, dtype=np.float32)

In [None]:
# Build the NumPy inputs for the model: the training features, the training
# labels (the "Survived" column flattened to a 1-D float32 array), and the
# test features.
Xtrain = extract_feature(dataTrain)
Ytrain = np.array(dataTrain[["Survived"]].values, dtype=np.float32).flatten()

Xtest = extract_feature(dataTest)

# See if they look right.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(Xtrain)
print(Ytrain)

In [None]:
# Create a logistic regression classifier with scikit-learn's default settings
logistic = linear_model.LogisticRegression()

# Train the logistic regression
logistic.fit(Xtrain, Ytrain)

# Use it to predict the test set
prediction = logistic.predict(Xtest)
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(prediction)

In [None]:
# Write the submission file output.csv: one row per test passenger with the
# passenger id and a 0/1 survival flag derived from the prediction.
output = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": (prediction == 1).astype(int),
    }
)
output.to_csv("output.csv", index=False)

To evaluate your results, we will use Kaggle. Kaggle is a website that hosts many machine learning competitions. Many companies publish their data there as a problem that anyone can work on.

To submit your prediction, you must first sign up for an account on [Kaggle.com](https://www.kaggle.com). Join the [Titanic](https://www.kaggle.com/c/titanic/) competition, then submit your csv file to get a score.

You should get a score of 0.75598.

# Titanic Playground #
Try improving your results.
Things to try
1. Changing features, adding or removing features
2. Add non-linear features

In [None]:
# Enter your code here