In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time
import itertools
import math

%matplotlib inline

## Load Training Data

In [7]:
# Cap rendered tables at 20 rows so large frames are shown truncated.
pd.options.display.max_rows = 20

# Read the training set from the CSV under data/ and render it.
train_data = pd.read_csv('data/train.csv')
display(train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Simple Analysis of the Training Data: Missing Values and Distinct Counts per Column

In [8]:
# Per-column summary of the training data:
#   col_name - column label
#   null_num - number of missing (null) entries in the column
#   type_num - number of distinct values, with NaN counted as one value
# The records are gathered in a list and converted to a frame in one step.
# Growing a frame row by row with DataFrame.append no longer works
# (the method was removed in pandas 2.0) and copied the frame on every call.
analysis_records = [
    {
        'col_name': col,
        'null_num': int(train_data[col].isnull().sum()),
        'type_num': int(train_data[col].nunique(dropna=False)),
    }
    for col in train_data
]

# Passing `columns` keeps the three headers even when there are no records.
analysis = pd.DataFrame(analysis_records, columns=['col_name', 'null_num', 'type_num'])

display(analysis)

Unnamed: 0,col_name,null_num,type_num
0,PassengerId,0,891
1,Survived,0,2
2,Pclass,0,3
3,Name,0,891
4,Sex,0,2
5,Age,177,89
6,SibSp,0,7
7,Parch,0,7
8,Ticket,0,681
9,Fare,0,248


## Select and Convert Training Data

In [55]:
# Target vector: the 'Survived' column.
y_train = train_data['Survived']

'''
 drop 'Name', 'Ticket', 'Cabin'(too many missing), 'Embarked'(not related)
'''

# Medians used to fill missing values in the two continuous features.
median_age = train_data['Age'].median()
print("Age Median:", median_age)

median_fare = train_data['Fare'].median()
print("Fare Median:", median_fare)

# Encode the categorical Sex feature as an integer (female -> 0, male -> 1).
sex_encoded = train_data['Sex'].map({'female':0, 'male':1}).astype(int)

# Assemble the feature matrix in its final column order in one step:
# Pclass, Sex, Age, SibSp, Parch, Fare.
x_train = pd.DataFrame({
    'Pclass': train_data['Pclass'],
    'Sex': sex_encoded,
    'Age': train_data['Age'].fillna(median_age),
    'SibSp': train_data['SibSp'],
    'Parch': train_data['Parch'],
    'Fare': train_data['Fare'].fillna(median_fare),
})

display(x_train)

Age Median: 28.0
Fare Median: 14.4542


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.2500
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.9250
3,1,0,35.0,1,0,53.1000
4,3,1,35.0,0,0,8.0500
5,3,1,28.0,0,0,8.4583
6,1,1,54.0,0,0,51.8625
7,3,1,2.0,3,1,21.0750
8,3,0,27.0,0,2,11.1333
9,2,0,14.0,1,0,30.0708


## 3-Fold Cross Validation

In [62]:
from sklearn.model_selection import KFold

# Fixed seed so the shuffled folds are the same on every run; a time-based
# seed produces different folds each time the cell executes, which makes the
# results below impossible to reproduce.
KFOLD_SEED = 42

# KFold.split returns a one-shot generator. Materialize it as a list so the
# folds can still be iterated after the inspection loop below.
kfold_3 = KFold(n_splits=3, shuffle=True, random_state=KFOLD_SEED)
indices_3Fold = list(kfold_3.split(train_data))

# Show the size of each train/test partition and the held-out row indices.
for train, test in indices_3Fold:
    print("%d %d" % (len(train), len(test)))
    print(test)

594 297
[  0   1   6   7   9  12  13  16  18  21  22  28  29  31  36  42  48  52
  60  66  68  76  87  89  91  94 100 108 110 116 117 128 129 130 131 132
 133 136 138 139 140 146 147 151 154 159 161 162 169 176 178 180 183 188
 190 193 195 198 200 201 205 206 207 209 214 217 219 223 224 226 228 233
 236 238 240 242 247 249 252 253 257 260 262 265 266 267 268 269 271 277
 279 280 281 282 285 289 290 292 296 298 302 303 306 307 311 315 319 322
 326 327 332 339 346 350 356 357 360 361 364 368 373 377 378 381 382 386
 389 390 404 405 407 418 419 425 433 438 441 450 451 452 455 457 458 460
 464 465 466 469 472 475 476 477 479 480 482 490 492 495 498 501 505 506
 508 509 511 519 524 525 527 539 540 541 543 544 545 549 556 558 562 563
 565 566 569 572 574 579 583 585 587 591 592 593 595 596 604 605 609 614
 624 633 636 638 641 645 646 648 650 652 656 663 664 665 667 669 672 673
 676 677 678 680 686 688 690 698 700 701 704 705 712 714 717 718 721 728
 729 732 734 735 736 737 738 740 743 744 74