In [14]:
# This notebook builds a feature matrix from the passenger data in train.csv
# and fits a logistic-regression classifier for passenger survival


# VARIABLE DESCRIPTIONS:
# 1  survival        Survival
#                    (0 = No; 1 = Yes)
# 2  pclass          Passenger Class
#                    (1 = 1st; 2 = 2nd; 3 = 3rd)
# 3  name            Name
# 4  sex             Sex
# 5  age             Age
# 6  sibsp           Number of Siblings/Spouses Aboard
# 7  parch           Number of Parents/Children Aboard
# 8  ticket          Ticket Number
# 9  fare            Passenger Fare
# 10 cabin           Cabin
# 11 embarked        Port of Embarkation
#                    (C = Cherbourg; Q = Queenstown; S = Southampton)

In [15]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import csv as csv
import sklearn.linear_model as lm
import sklearn.cross_validation as cv
from sklearn import preprocessing
import random

In [16]:
# Load train.csv into `data`, an array of strings with one row per CSV record.
# The file is opened in a `with` block so the handle is closed as soon as the
# rows have been read; newline='' is what the csv module asks for so that
# newlines embedded in quoted fields are parsed correctly.
with open('train.csv', newline='') as csv_file:
    csv_file_object = csv.reader(csv_file)
    header = next(csv_file_object)        # Skip the first line as it is a header
    data = list(csv_file_object)          # Every remaining row, as a list of strings
data = np.array(data)                     # Then convert from a list to an array.

In [17]:
# Select data with age: keep the rows whose column 5 is not blank.
# A boolean mask always yields a 2-D result; the earlier nonzero()/squeeze()
# combination collapsed to 1-D when exactly one row matched, which breaks
# the data_wa[:, k] column indexing used afterwards.
has_age = np.char.strip(data[:, 5]) != ''
data_wa = data[has_age]

# number of training examples
m = len(data_wa)
print('m =', m)

# number of features to be used
n = 7

# Create training data set: an m-by-n matrix of zeros
x = np.zeros([m, n])

m = 714


In [18]:
# Fill the feature matrix x column by column from the string array data_wa.
# The builtin `float` is used for the casts: the `np.float` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.

# Copy class to training set
x[:, 0] = data_wa[:, 2].astype(float)

# Copy gender to training set (male -> 0, female -> 1)
x[data_wa[:, 4] == 'male', 1] = 0.
x[data_wa[:, 4] == 'female', 1] = 1.

# Copy age to training set
x[:, 2] = data_wa[:, 5].astype(float)

# Copy sibsp to training set
x[:, 3] = data_wa[:, 6].astype(float)

# Copy parch to training set
x[:, 4] = data_wa[:, 7].astype(float)

# Copy fare to training set, log-transformed; the 0.1 offset keeps the
# logarithm finite when the fare is 0
x[:, 5] = np.log(data_wa[:, 9].astype(float) + 0.1)

# Copy embarked to training set as a numeric code (C -> 1, Q -> 2, S -> 3);
# rows with any other value are not assigned here and keep their current entry
x[data_wa[:, 11] == 'C', 6] = 1.
x[data_wa[:, 11] == 'Q', 6] = 2.
x[data_wa[:, 11] == 'S', 6] = 3.

In [25]:
# Degree-2 polynomial expansion of the feature matrix x
poly = preprocessing.PolynomialFeatures(degree=2)
xnew = poly.fit_transform(x)

In [20]:
# Feature scaling, applied per column of x
# NOTE(review): this scales x, not the polynomial expansion xnew — confirm intended
X = preprocessing.scale(x, axis=0)

In [26]:
# Survivors: target vector taken from column 1 of data_wa, cast to float.
# Builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
y = data_wa[:, 1].astype(float)

In [27]:
# Null accuracy: one minus the mean of y_train.
# NOTE(review): in file order y_train is first assigned by the later
# train_test_split cell, so on a fresh kernel (Restart & Run All) this cell
# raises NameError — confirm whether it should be removed or should use y.
1-y_train.mean()

0.61087866108786604

In [28]:
# Logistic-regression estimator with every constructor option written out
log_reg = lm.LogisticRegression(
    penalty='l2',
    C=1.0,
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='liblinear',
    max_iter=100,
    multi_class='ovr',
    warm_start=False,
    verbose=0,
    n_jobs=1,
)

In [None]:
# Play with the learning curve
from sklearn import preprocessing

In [10]:
# Split X / y into train and test parts (test_size=0.33) with a fixed seed
X_train, X_test, y_train, y_test = cv.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=88,
)

In [13]:
# Null accuracy: one minus the mean of the training labels
1 - np.mean(y_train)

0.61087866108786604

In [30]:
# Logistic-regression estimator that is fitted and scored in the cells below
lr = lm.LogisticRegression(
    penalty='l2',
    C=1.0,
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='liblinear',
    max_iter=100,
    multi_class='ovr',
    warm_start=False,
    verbose=0,
    n_jobs=1,
)

In [31]:
# Fit the classifier on the training split; the returned estimator is the cell output
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# Score the fitted classifier on the same data it was trained on
lr.score(X=X_train, y=y_train)

0.83472803347280333

In [33]:
# Cross-validate lr on the training split with cv=20, then report the mean
# score and twice its standard deviation
scores = cv.cross_val_score(lr, X_train, y_train, cv=20)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

Accuracy: 0.83 (+/- 0.16)


In [34]:
# 30 log-spaced values running from 1e-10 up to 1e3
C_s = np.logspace(start=-10, stop=3, num=30)