In [1471]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
import xgboost as xgb

In [1472]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [1473]:
print train.shape, test.shape

(891, 12) (418, 11)


In [1474]:
Y_train = train.Survived
test_id = test.PassengerId

In [1475]:
train_test = pd.concat([train, test])
train_test = train_test.drop(['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket'], axis = 1)
print 'shape: '
print train_test.shape, train.shape, test.shape
print '~~~~~~~~~~~~~'
print 'type'
print train_test.dtypes
print '~~~~~~~~~~~~~'
print 'No. of Nans'
print train_test.isnull().sum()
print '~~~~~~~~~~~~~'
train_test.head(2)

shape: 
(1309, 7) (891, 12) (418, 11)
~~~~~~~~~~~~~
type
Age         float64
Embarked     object
Fare        float64
Parch         int64
Pclass        int64
Sex          object
SibSp         int64
dtype: object
~~~~~~~~~~~~~
No. of Nans
Age         263
Embarked      2
Fare          1
Parch         0
Pclass        0
Sex           0
SibSp         0
dtype: int64
~~~~~~~~~~~~~


Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,S,7.25,0,3,male,1
1,38.0,C,71.2833,0,1,female,1


In [1476]:
print 'fill NaNs with mean or mode for object type'
train['Age'].fillna((train['Age'].mean()), inplace=True)
train['Fare'].fillna((train['Fare'].mean()), inplace=True)
train['Embarked'].fillna((train.Embarked.value_counts()[0]), inplace=True)
test['Age'].fillna((test['Age'].mean()), inplace=True)
test['Fare'].fillna((test['Fare'].mean()), inplace=True)
test['Embarked'].fillna((test.Embarked.value_counts()[0]), inplace=True)
train_test = pd.concat([train, test])
train_test = train_test.drop(['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket'], axis = 1)

print 'No. of Nans'
print train_test.isnull().sum()
train_test.head(2)

fill NaNs with mean or mode for object type
No. of Nans
Age         0
Embarked    0
Fare        0
Parch       0
Pclass      0
Sex         0
SibSp       0
dtype: int64


Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,S,7.25,0,3,male,1
1,38.0,C,71.2833,0,1,female,1


In [1477]:
print 'process object types: ["Embarked", "Sex"]'
train_test['Embarked'] = train_test['Embarked'].factorize()[0]
train_test = pd.concat([train_test, pd.get_dummies(train_test.Sex)], axis = 1)
train_test = train_test.drop(['Sex'], axis = 1)
print train_test.dtypes
train_test.head()

process object types: ["Embarked", "Sex"]
Age         float64
Embarked      int64
Fare        float64
Parch         int64
Pclass        int64
SibSp         int64
female        uint8
male          uint8
dtype: object


Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,SibSp,female,male
0,22.0,0,7.25,0,3,1,0,1
1,38.0,1,71.2833,0,1,1,1,0
2,26.0,0,7.925,0,3,0,1,0
3,35.0,0,53.1,0,1,1,1,0
4,35.0,0,8.05,0,3,0,0,1


In [1478]:
train_test.describe()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,SibSp,female,male
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,29.882243,0.398778,33.297261,0.385027,2.294882,0.498854,0.355997,0.644003
std,12.883758,0.6612,51.738919,0.86556,0.837836,1.041658,0.478997,0.478997
min,0.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,22.0,0.0,7.8958,0.0,2.0,0.0,0.0,0.0
50%,29.699118,0.0,14.4542,0.0,3.0,0.0,0.0,1.0
75%,35.0,1.0,31.275,0.0,3.0,1.0,1.0,1.0
max,80.0,3.0,512.3292,9.0,3.0,8.0,1.0,1.0


In [1479]:
def normalize(columns):
    for col in columns:
        mu = train[col].mean()
        sigma = train[col].std()
        train_test[col] = (train[col] - train[col].mean())/train[col].std()

In [1480]:
print 'normalize columns ["Age", "Fare"]: '
columns_to_normalize = ['Age', 'Fare']
normalize(columns_to_normalize)
train_test.describe()

normalize columns ["Age", "Fare"]: 


Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,SibSp,female,male
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,-0.02006238,0.398778,0.002039,0.385027,2.294882,0.498854,0.355997,0.644003
std,0.9919921,0.6612,0.997283,0.86556,0.837836,1.041658,0.478997,0.478997
min,-2.251891,0.0,-0.648058,0.0,1.0,0.0,0.0,0.0
25%,-0.592148,0.0,-0.48858,0.0,2.0,0.0,0.0,0.0
50%,4.371893e-15,0.0,-0.35719,0.0,3.0,0.0,0.0,1.0
75%,0.407697,1.0,-0.024233,0.0,3.0,1.0,1.0,1.0
max,3.868699,3.0,9.66174,9.0,3.0,8.0,1.0,1.0


In [1481]:
X_train = train_test[:len(train)]
X_test = train_test[len(train):]
print X_train.shape, X_test.shape

(891, 8) (418, 8)


In [1482]:
def sigmoid(x):
    x = np.array(x)
    return 1/(1+np.exp(-x))

In [1483]:
def logistic_regression_model(X_train, Y_train, theta, la):
    # return cost J and gradient of theta in grad
    m = len(Y_train)
    X = X_train.copy()
    X.insert(0, 'Bias', 1)
    X = np.array(X)
    Y = np.array(Y_train)
    h_theta = sigmoid(np.dot(X, theta))
    J = -(np.dot(Y, np.log(h_theta)) + np.dot(1-Y, np.log(1-h_theta))) / m + np.dot(theta[1:], theta[1:])*la / (2*m)
    grad = np.dot(X.transpose(), h_theta - Y) / m + theta*la / m
    grad[0] = grad[0] - theta[0]*la / m     
    return [J, grad]

In [1484]:
def logistic_regression_train(X_train, Y_train, theta, la, alpha, epsilon = 10.0**(-6), max_iterators = 5000):
    # return cost J and optimal theta 
    [J, grad] = logistic_regression_model(X_train, Y_train, theta, la)
    cost = [J]
    for i in range(max_iterators):
        theta = theta - alpha * grad
        [J, grad] = logistic_regression_model(X_train, Y_train, theta, la)
        if abs(cost[-1] - J) < epsilon:
            break
        cost.append(J)
        
    plt.plot(cost)
    plt.title('alpha = %r, lambda = %r'% (alpha, la))
    plt.xlabel('No. of iterations')
    plt.ylabel('Cost function J')
    plt.show()
    
    return [J, theta]

In [1485]:
print X_train.shape, Y_train.shape
X_train, X_cv, Y_train, Y_cv = sklearn.model_selection.train_test_split(X_train, Y_train, test_size=0.3)
print X_train.shape, Y_train.shape
print X_cv.shape, Y_cv.shape

(891, 8) (891,)
(623, 8) (623,)
(268, 8) (268,)


In [None]:
def logistic_regression_cross_validation(X_train, X_cv, Y_train, Y_cv, alphas, lambdas):
    init_theta = np.zeros(X_train.shape[1] + 1)
    J_train = len(lambdas)*[0]
    J_cv = len(lambdas)*[0]
    thetas = []
    for i in range(len(lambdas)):
        [J, theta] = logistic_regression_train(X_train, Y_train, init_theta, lambdas[i], alpha)
        [J_train[i], grad_train] = logistic_regression_model(X_train, Y_train, theta, 0)
        [J_cv[i], grad_cv] = logistic_regression_model(X_cv, Y_cv, theta, 0)
        thetas.append(theta)
    idx = J_cv.index(min(J_cv))
    lambda_best = lambdas[idx]
    theta_best = thetas[idx]
    
    plot_train, = plt.plot(lambdas, J_train, label='J_train')
    plot_cv, = plt.plot(lambdas, J_cv, label='J_cv')
    plt.title('Cross Validation with alpha')
    plt.xlabel('lambda')
    plt.ylabel('Cost function J')
    plt.legend([plot_train, plot_cv], ["J_train", "J_cv"])
    plt.show()
    
    return [lambda_best, theta_best]

In [None]:
print 'look for best alpha:'
alphas = [0.01*2**i for i in range(8)]
lambdas = [0,0.001, 0.003, 0.01, 0.03, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 3, 4, 5]
lambdas_best = []
thetas_best = []
for alpha in alphas:
    [lambda_best, theta_best] =logistic_regression_cross_validation(X_train, X_cv, Y_train, Y_cv, alpha, lambdas)
    lambdas_best.append(lambda_best)
    thetas_best.append(theta_best)
    print 'best alpha = %r, lambda = %r' % (alpha, lambda_best)

look for best alpha:


In [None]:
def logistic_regression_predict(theta, X_test):
    X = X_test.copy()
    X.insert(0, 'Bias', 1)
    X = np.array(X)
    predict = sigmoid(np.dot(X, theta))
    return predict

In [None]:
print 'predict result:'
predict = logistic_regression_predict(theta_best, X_test)
y_predict = [1 if p > 0.5 else 0 for p in predict]
print len(test_id), len(y_predict)
output = pd.DataFrame({'PassengerId': test_id, 'Survived': y_predict})
output.to_csv('Titanic_03-22-2018_version1.csv', index=False)
output.head()