In [5]:
import csv

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

try:
    # scikit-learn >= 0.18 ships cross_val_score in model_selection; the
    # legacy cross_validation module was removed in 0.20.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation


# Load the training and test sets; header=0 makes pandas read the first row
# of each file as the column names.
train = pd.read_csv('./train2.csv', header=0)
test = pd.read_csv('./test2.csv', header=0)

# Encode Sex as an integer flag (female = 0, male = 1) in both frames.
sex_codes = {'female': 0, 'male': 1}
train['Sex'] = train['Sex'].map(sex_codes).astype(int)
test['Sex'] = test['Sex'].map(sex_codes).astype(int)


# Fill missing ages with the median age of training passengers that share
# the same sex and passenger class.
# median_ages[sex, pclass - 1] holds the median for each (sex, class) pair.
median_ages = np.zeros((2, 3))
for sex in range(2):
    for pclass in range(1, 4):
        same_group = (train['Sex'] == sex) & (train['Pclass'] == pclass)
        median_ages[sex, pclass - 1] = train.loc[same_group, 'Age'].dropna().median()

# The medians come from the training set only and are applied to both sets.
for sex in range(2):
    for pclass in range(1, 4):
        fill_value = median_ages[sex, pclass - 1]
        train_missing = train.Age.isnull() & (train.Sex == sex) & (train.Pclass == pclass)
        test_missing = test.Age.isnull() & (test.Sex == sex) & (test.Pclass == pclass)
        train.loc[train_missing, 'Age'] = fill_value
        test.loc[test_missing, 'Age'] = fill_value
        
# Fill missing test fares with the mean fare of the training set.
test.loc[test.Fare.isnull(), 'Fare'] = train['Fare'].mean()

# Convert fares into ordinal bins of width fare_bracket_size:
#   $0-9.99 -> 0, $10-19.99 -> 1, $20-29.99 -> 2, $30 and above -> 3.
# Any fare at or beyond fare_ceiling is folded into the highest bin.
fare_ceiling = 40
fare_bracket_size = 10
# Floor division keeps the bin count an integer on both Python 2 and 3.
number_of_fares = fare_ceiling // fare_bracket_size
number_of_classes = 3
# Series.clip(upper=...) replaces clip_upper (removed in pandas 1.0) and the
# builtin int replaces the np.int alias (removed in NumPy 1.24).
highest_bin = number_of_fares - 1
train['BinFare'] = (train.Fare / fare_bracket_size).clip(upper=highest_bin).astype(int)
test['BinFare'] = (test.Fare / fare_bracket_size).clip(upper=highest_bin).astype(int)

# One-hot encode the categorical features (class, port of embarkation and
# fare bin) of the training set.
dummy_class = pd.get_dummies(train['Pclass'], prefix='class')
dummy_embarked = pd.get_dummies(train['Embarked'], prefix='embarked')
dummy_binFare = pd.get_dummies(train['BinFare'], prefix='binFare')

train_cols_to_keep = train.columns.values
test_cols_to_keep = test.columns.values

# The test-set indicators must be derived from the test rows themselves.
# Joining the training dummies onto the test frame would align them by row
# index and give each test passenger another passenger's categories.
# Reindexing to the training columns guarantees both frames expose exactly
# the same indicator columns, in the same order; a category absent from the
# test set becomes an all-zero column.
test_dummy_class = pd.get_dummies(test['Pclass'], prefix='class').reindex(
    columns=dummy_class.columns, fill_value=0)
test_dummy_embarked = pd.get_dummies(test['Embarked'], prefix='embarked').reindex(
    columns=dummy_embarked.columns, fill_value=0)
test_dummy_binFare = pd.get_dummies(test['BinFare'], prefix='binFare').reindex(
    columns=dummy_binFare.columns, fill_value=0)

# Append the indicator columns to each frame (the removed .ix indexer is no
# longer needed: the whole dummy frame is joined).
train = (train[train_cols_to_keep]
         .join(dummy_class)
         .join(dummy_embarked)
         .join(dummy_binFare))
test = (test[test_cols_to_keep]
        .join(test_dummy_class)
        .join(test_dummy_embarked)
        .join(test_dummy_binFare))

# Append a constant column of ones (intercept term) to both frames.
for frame in (train, test):
    frame['intercept'] = 1.0

# Target vector for the training data.
y_train = train['Survived']

# The test set gets a placeholder Survived column; y_test keeps the passenger
# ids next to it so predictions can be written out later.
test['Survived'] = 0
y_test = test[['PassengerId', 'Survived']]

# Drop the columns that are not used as model features. Per the original
# author, Ticket and Cabin have too many missing values and Name has too many
# categories to reduce. Pclass and Embarked are represented by their dummy
# columns and Fare by BinFare. Sex and Age are kept as features.
unused_columns = ['PassengerId', 'Survived', 'Ticket', 'Cabin', 'Name', 'Pclass', 'Embarked', 'Fare']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

# Fit a logistic regression model on the full training set.
model = LogisticRegression()
model.fit(train, y_train)

# Estimate accuracy with cross-validation on the training data.
# The fold count is passed explicitly because the library default differs
# between scikit-learn releases (3 folds in older releases, 5 from 0.22 on);
# cv=3 keeps the three-score output this script was written against.
scores = cross_validation.cross_val_score(model, train, y_train, cv=3)
print(scores)
print(np.mean(scores))

# Predict survival for the test passengers.
predicted = model.predict(test)

# y_test was taken as a column subset of the test frame, so assigning into it
# in place raises pandas' SettingWithCopyWarning. assign() returns a new frame
# with the Survived column replaced, keeping the column order unchanged.
y_test = y_test.assign(Survived=predicted)

# Write the PassengerId / Survived pairs to the output CSV file.
y_test.to_csv('ModernAnalytics_logisticRegression.csv', index=False)

[ 0.79461279  0.79124579  0.80808081]
0.79797979798
