In [29]:
# Standard
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
import csv

# Machine Learning
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

%matplotlib inline

In [30]:
# Return a prepared dataframe with passenger ids
def prepare_data(df):
    prep_df = pd.DataFrame(df)
    # Binary for gender
    prep_df['Gender'] = df['Sex'].map({'female' : 0, 'male' : 1}).astype(int)
    
    # Enumerate ports
    if len(df.Embarked[df.Embarked.isnull()]) > 0:
        df.Embarked[df.Embarked.isnull()] = df.Embarked.dropna().mode().values
    ports = list((enumerate(np.unique(df.Embarked))))
    ports_dict = {name : i for i, name in ports}
    prep_df['Embarked'] = df.Embarked.map(lambda x: ports_dict[x]).astype(int)
    
    #See who is alone and who isn't
    df['Alone'] = df.SibSp + df.Parch
    prep_df['Alone'] = np.where(df['Alone'] == 0, 1, 0)
    
    # Fill in missing age values
    num_samples = len(df.Age[df.Age.isnull()])
    prep_df.Age[prep_df.Age.isnull()] = df.Age.dropna().sample(n=num_samples).values
    
    # Fill in missing fare values
    num_fare_samples = len(df.Fare[df.Fare.isnull()])
    prep_df.Fare[prep_df.Fare.isnull()] = df.Fare.dropna().sample(n=num_fare_samples).values
    
    # Create a new feature
    prep_df['Age*Class'] = df.Age * df.Pclass
    
    prep_df['Class1'] = np.where(prep_df['Pclass'] == 1,1,0)
    prep_df['Class2'] = np.where(prep_df['Pclass'] == 2,1,0)
    prep_df['Class3'] = np.where(prep_df['Pclass'] == 3,1,0)
    
    # Normalize Age
    prep_df.Age = preprocessing.scale(df.Age)
    
    # Normalize Fare
    prep_df.Fare = preprocessing.scale(df.Fare)
    prep_df['Age*Class'] = preprocessing.scale(prep_df['Age*Class'])

    # Drop columns
    prep_df = prep_df.drop(['Name','Sex','Ticket','Cabin','Pclass'],axis=1)
    
    return prep_df
    
    
    

In [31]:
train_data = pd.read_csv('titanic_train.csv')
train_df = train_data[0:800]
cv_df = train_data[800:891]
test_df = pd.read_csv('titanic_test.csv')

In [32]:
train_df = prepare_data(train_df)
cv_df = prepare_data(cv_df)
test_df = prepare_data(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata

In [33]:
# Don't need ids for training
train_df = train_df.drop('PassengerId', axis=1)
train_df = train_df.values

In [34]:
# Prep cv
cv_df = cv_df.drop('PassengerId', axis=1)
cv_df = cv_df.values

In [43]:
forest = RandomForestClassifier(n_estimators=100)
best_forest = forest
best_rfc_acc = 0;
# Check rfc
for i in range (0,20):
    forest = forest.fit(train_df[0::,1::],train_df[0::,0])
    output_rfc = forest.predict(cv_df[0::,1::]).astype(int)
    rfc_acc = np.mean(output_rfc == cv_df[0::,0])
    if rfc_acc > best_rfc_acc:
        best_forest = forest
        best_rfc_acc = rfc_acc
output_rfc = best_forest.predict(cv_df[0::,1::]).astype(int)
rfc_acc = np.mean(output_rfc == cv_df[0::,0])
print('rfc Classifier train: %f' % rfc_acc)
print('rfc Classifier cv: %f' % best_rfc_acc)

rfc Classifier train: 0.835165
rfc Classifier cv: 0.846154


In [36]:
# svm
svmalgo = svm.SVC()
svmalgo = svmalgo.fit(train_df[0::,1::],train_df[0::,0])
output_svm = svmalgo.predict(cv_df[0::,1::]).astype(int)
svm_acc = np.mean(output_svm == cv_df[0::,0])
print('Accuracy SVM Classifier %f' % svm_acc)

Accuracy SVM Classifier 0.846154


In [37]:
# log reg
logreg = LogisticRegression(random_state=1).fit(train_df[0::,1::],train_df[0::,0])          
output_logreg = logreg.predict(cv_df[0::,1::]).astype(int)
logreg_acc = np.mean(output_logreg == cv_df[0::,0])
print('Accuracy Logistic Regression Classifier %f' % logreg_acc)

Accuracy Logistic Regression Classifier 0.824176


In [38]:
# Save training ids
ids = test_df['PassengerId']
test_df = test_df.drop('PassengerId',axis=1)

In [39]:
test_df = test_df.values

In [40]:
# Test the data
test_output_svm = svmalgo.predict(test_df).astype(int)
test_output_rfc = forest.predict(test_df).astype(int)
test_output_logreg = logreg.predict(test_df).astype(int)

In [41]:
prediction_file_svm = open('svm_model.csv','w')
prediction_file_rfc = open('random_forest_model.csv','w')
prediction_file_logreg = open('log_reg_model.csv','w')
p_svm = csv.writer(prediction_file_svm)
p_rfc = csv.writer(prediction_file_rfc)
p_logreg = csv.writer(prediction_file_logreg)

In [42]:
p_svm.writerow(['PassengerId','Survived'])
p_rfc.writerow(['PassengerId','Survived'])
p_logreg.writerow(['PassengerId','Survived'])
p_svm.writerows(zip(ids,test_output_svm))
p_rfc.writerows(zip(ids,test_output_rfc))
p_logreg.writerows(zip(ids,test_output_logreg))
prediction_file_svm.close()
prediction_file_rfc.close()
prediction_file_logreg.close()