In [102]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_curve, auc, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import pydotplus

In [103]:
train = pd.read_csv('./data/train.csv')
test =pd.read_csv('./data/test.csv')

In [104]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data exploration

In [105]:
print('train shape: ', train.shape)
print('test shape: ', test.shape)

train shape:  (891, 12)
test shape:  (418, 11)


In [106]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [107]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [108]:
full_df = pd.concat([train,test], axis=0, ignore_index=True)
full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [109]:
full_df.shape

(1309, 12)

In [110]:
full_df.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

## 1. Embarked 출발지

In [111]:
full_df.Embarked.value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [112]:
# because most people are from S
full_df.Embarked = full_df.Embarked.fillna('S')

In [113]:
full_df.Embarked.isnull().sum()

0

## 2. Carbin

In [114]:
full_df.drop(['Cabin'],axis=1,inplace=True)

## 3. Age

In [115]:
full_df.Age.describe()



count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%              NaN
50%              NaN
75%              NaN
max        80.000000
Name: Age, dtype: float64

In [116]:
full_df.Age = full_df.Age.fillna(full_df.Age.mean())

In [117]:
full_df.dtypes

Age            float64
Embarked        object
Fare           float64
Name            object
Parch            int64
PassengerId      int64
Pclass           int64
Sex             object
SibSp            int64
Survived       float64
Ticket          object
dtype: object

In [118]:
full_df.columns

Index(['Age', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass',
       'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

# Precessing Data

In [119]:
target = train.Survived
target.shape

(891,)

In [120]:
# select usaful feature columns
full_df = full_df[['Age', 'Embarked', 'Fare', 'Parch', 'Pclass','Sex', 'SibSp']]

In [121]:
train_df = full_df[:train.shape[0]]

In [122]:
train_df.shape

(891, 7)

## 1. Sex, Embarked  -> 0,1 Labelencode

In [124]:
train_df['Sex'] = train_df['Sex'].map({'male':1,'female':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [128]:
train_df['Embarked'] = train_df['Embarked'].map({'C':0,'Q':1,'S':2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


-----
**train_df.Embarked vs train_df['Embarked'] 차이점**

```train_df['Embarked'] = train_df['Embarked'].map({'C':0,'Q':1,'S':2})```
-> int64

```train_df.Embarked = train_df.Embarked.map({'C':0,'Q':1,'S':2})```
-> float64

-----

In [129]:
train_df.dtypes

Age         float64
Embarked      int64
Fare        float64
Parch         int64
Pclass        int64
Sex           int64
SibSp         int64
dtype: object

In [131]:
train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,2,7.25,0,3,1,1
1,38.0,0,71.2833,0,1,0,1
2,26.0,2,7.925,0,3,0,0
3,35.0,2,53.1,0,1,0,1
4,35.0,2,8.05,0,3,1,0
