# Titanic: Machine Learning from Disaster

In [None]:
#Citation: 
#Code in this file uses Rishabh Misra's Logistic Regression code as a reference. 
#link to the reference code: https://www.kaggle.com/rmisra/logistic-regression

import csv

import numpy as np
import pandas as pd
# Import the public package instead of the private `sklearn.linear_model.logistic`
# module, which was deprecated in scikit-learn 0.22 and removed in 0.24. The
# `logis` alias is kept so that `logis.LogisticRegression` still resolves.
import sklearn.linear_model as logis

# Load the training and test sets; both paths are relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Choose features
## Q: Using logistic regression, try to predict whether a passenger survived the disaster. You can choose the features (or combinations of features) you would like to use or ignore, provided you justify your reasoning.

### We want to use the feature data to predict whether a passenger survived the disaster. Therefore, we only want to use features that are related to the survival rate in the disaster. 
### First, we know that ticket class and ticket fare are crucial. People who travel in first class or pay a lot for their ticket are often the more powerful people. Age and gender are also very important in this event because it is commonly said that people followed a "Women and Children First" policy during the disaster. 
### Ticket number, passenger ID and name are irrelevant to this question. All three features are only ways of identifying a person. Port of embarkation is also irrelevant to this question because the disaster happened after everyone had embarked. Cabin data is not useful because there are too many NaN values. 
### There are two family-related features. One is the number of siblings/spouses aboard the Titanic, and the other is the number of parents/children aboard the Titanic. I chose to put the second feature in my model because I believe the bond between parents and children is so tight that it influences people's decisions even in a life-or-death scenario, whereas the relationship between siblings and spouses is less significant in such a severe event.  

In [79]:
# Clean the training data: encode Sex as a number and fill the gaps in Age and Fare.

# Encode Sex as 0 (female) / 1 (male). The dtype guard makes this cell safe to
# re-run: mapping an already-encoded column would turn every value into NaN and
# make astype(int) raise.
if not pd.api.types.is_numeric_dtype(train['Sex']):
    train['Sex'] = train['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Replace missing ages with the median age (median() ignores NaN by default).
median_age = train['Age'].median()
train['Age'] = train['Age'].fillna(median_age)

# Replace missing fares with the median fare.
train['Fare'] = train['Fare'].fillna(train['Fare'].median())

# Labels: the Survived column as a NumPy array.
y_train = np.array(train['Survived'])

# Features: whatever columns remain once the label and the columns left out of
# the model are dropped, cast to float64 and converted to a NumPy array.
x_train = np.array(
    train
    .drop(
        ['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId', 'SibSp', 'Embarked'],
        axis=1,
    )
    .astype(np.float64)
)



# Fit a logistic-regression classifier on all training rows.
# (An earlier experiment that split the training arrays into two halves is disabled.)
logistic = logis.LogisticRegression()

# fit() returns the estimator itself, so `theta` refers to the fitted model.
theta = logistic.fit(x_train, y_train.ravel())

In [80]:
# Clean the test data: encode Sex as a number and fill the gaps in Age and Fare.
# NOTE(review): the medians used below are computed from the test set itself,
# not from the training set -- confirm that this is intended.

# Encode Sex as 0 (female) / 1 (male). The dtype guard makes this cell safe to
# re-run: mapping an already-encoded column would turn every value into NaN and
# make astype(int) raise.
if not pd.api.types.is_numeric_dtype(test['Sex']):
    test['Sex'] = test['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Replace missing ages with the median age (median() ignores NaN by default).
median_age = test['Age'].median()
test['Age'] = test['Age'].fillna(median_age)

# Replace missing fares with the median fare.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Keep the passenger ids for the output file.
x_PassengerID = np.array(test['PassengerId'])

# Feature matrix: drop the columns left out of the model, cast to float64 and
# convert to a NumPy array.
x_test = np.array(
    test
    .drop(['Name', 'Ticket', 'Cabin', 'PassengerId', 'SibSp', 'Embarked'], axis=1)
    .astype(np.float64)
)

# Predict a label for every test row with the logistic model.
y_test = logistic.predict(x_test)

# Write the predictions as a two-column CSV (PassengerId, Survived).
# newline='' is what the csv module expects of the file object it writes to;
# without it, platforms that translate line endings emit a blank line after
# every row. The with-block closes the file even if a write raises.
with open("logistic_titanic.csv", "w", newline='') as predictions_file:
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(['PassengerId', 'Survived'])  # column headers
    # One row per test passenger: the id paired with its predicted label.
    predictions_file_object.writerows(
        (str(passenger_id), str(survived))
        for passenger_id, survived in zip(x_PassengerID, y_test)
    )