# Titanic data 분류 공부


In [93]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load seaborn's 'titanic' example dataset into a DataFrame.
df = sns.load_dataset('titanic')
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age        

In [94]:
# IPython display setting: raise the limit on how many columns are printed
pd.options.display.max_columns = 15
# The first rows followed by two blank lines (same spacing as a separate print('\n'))
print(df.head(), end='\n\n\n')

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  




In [95]:
# Drop the 'deck' column (many NaN values) and the 'embark_town' column,
# whose content overlaps with 'embarked'
rdf = df.drop(columns=['deck', 'embark_town'])
print(rdf.columns.values, end='\n\n\n')

['survived' 'pclass' 'sex' 'age' 'sibsp' 'parch' 'fare' 'embarked' 'class'
 'who' 'adult_male' 'alive' 'alone']




In [96]:
# 2. Impute missing numeric values with the median of each (pclass, sibsp) group
rdf_grouped = rdf.groupby(by=['pclass', 'sibsp'])
# Restrict the medians to numeric, non-key columns: calling median() on a whole
# mixed-dtype group (as groupby.apply + x.median() did) fails on pandas >= 2.0,
# and apply there would also prepend the group keys to the index.
numeric_cols = (
    rdf.select_dtypes(include='number')
    .columns.difference(['pclass', 'sibsp'])
    .tolist()
)
# transform() returns the per-group median aligned to rdf's own index, so the
# row order and index of the result are unchanged.
group_medians = rdf_grouped[numeric_cols].transform('median')
# Fill column by column; a group whose values are all NaN has a NaN median,
# so those cells stay missing.
rdf_imputed = rdf.assign(
    **{name: rdf[name].fillna(group_medians[name]) for name in numeric_cols}
)

In [97]:
# Some rows still have a NaN age after imputation; simply drop those rows.
rdf_imputed = rdf_imputed.loc[rdf_imputed['age'].notna()]
rdf_imputed.info()
# Continue with an independent copy under the name rdf.
rdf = rdf_imputed.copy()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 884 entries, 0 to 890
Data columns (total 13 columns):
survived      884 non-null int64
pclass        884 non-null int64
sex           884 non-null object
age           884 non-null float64
sibsp         884 non-null int64
parch         884 non-null int64
fare          884 non-null float64
embarked      882 non-null object
class         884 non-null category
who           884 non-null object
adult_male    884 non-null bool
alive         884 non-null object
alone         884 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 78.7+ KB


In [98]:
# Replace NaN in the 'embarked' column with the most frequent embarkation value
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
print(most_freq)
print('\n')

print(rdf.describe(include='all'))
print('\n')

# Assign the filled column back rather than calling fillna(inplace=True) on
# rdf['embarked']: the in-place call acts on an intermediate Series and no
# longer updates rdf when pandas Copy-on-Write is active.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)
# Remaining missing values per column (shown as the cell output)
rdf.isnull().sum()

S


          survived      pclass   sex         age       sibsp       parch  \
count   884.000000  884.000000   884  884.000000  884.000000  884.000000   
unique         NaN         NaN     2         NaN         NaN         NaN   
top            NaN         NaN  male         NaN         NaN         NaN   
freq           NaN         NaN   573         NaN         NaN         NaN   
mean      0.386878    2.303167   NaN   29.282432    0.463801    0.368778   
std       0.487311    0.837101   NaN   13.308469    0.882600    0.796210   
min       0.000000    1.000000   NaN    0.420000    0.000000    0.000000   
25%       0.000000    2.000000   NaN   22.000000    0.000000    0.000000   
50%       0.000000    3.000000   NaN   26.000000    0.000000    0.000000   
75%       1.000000    3.000000   NaN   37.000000    1.000000    0.000000   
max       1.000000    3.000000   NaN   80.000000    5.000000    6.000000   

              fare embarked  class  who adult_male alive alone  
count   884.000000

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

In [99]:
# 3. Compute the discrete age feature again, now with the imputed ages.
# Bins are right-closed: (0, 16], (16, 24], (24, 65], (65, 1e6] -> labels 0..3.
# `right` is passed by keyword; the original passed a stray positional 4 that
# pd.cut bound to `right` and that only worked because 4 is truthy.
rdf['age_range'] = pd.cut(
    rdf['age'],
    bins=[0, 16, 24, 65, 1e6],
    right=True,
    labels=[0, 1, 2, 3],
)
rdf['age_range']

0      1
1      2
2      2
3      2
4      2
      ..
886    2
887    1
888    2
889    2
890    2
Name: age_range, Length: 884, dtype: category
Categories (4, int64): [0 < 1 < 2 < 3]

In [100]:
# 분석에 활용할 열(속성)을 선택 
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked','fare','age_range']]
print(ndf.head())   
print('\n')

   survived  pclass     sex   age  sibsp  parch embarked     fare age_range
0         0       3    male  22.0      1      0        S   7.2500         1
1         1       1  female  38.0      1      0        C  71.2833         2
2         1       3  female  26.0      0      0        S   7.9250         2
3         1       1  female  35.0      1      0        S  53.1000         2
4         0       3    male  35.0      0      0        S   8.0500         2




Now let's look more closely at the categorical columns. Note that the age range has already been saved as an ordinal, so only the `sex` and `embarked` columns still need to be examined.

In [101]:
# Summary statistics for 'sex' and 'embarked', transposed so that each column becomes a row
ndf[['sex', 'embarked']].describe().transpose()

Unnamed: 0,count,unique,top,freq
sex,884,2,male,573
embarked,884,3,S,639


In [102]:
# One-hot encoding of 'embarked'. The dtype is pinned so the dummy columns stay
# 0/1 integers (newer pandas versions return booleans by default).
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='Embarked', dtype=np.uint8)
ndf = pd.concat([ndf, onehot_embarked], axis=1)

# Encode sex as a single binary indicator: 1 when sex == 'male', otherwise 0.
# The builtin int is used because the np.int alias was removed in NumPy 1.24.
ndf['isMale'] = (ndf['sex'] == 'male').astype(int)
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,fare,age_range,Embarked_C,Embarked_Q,Embarked_S,isMale
0,0,3,male,22.0,1,0,S,7.25,1,0,0,1,1
1,1,1,female,38.0,1,0,C,71.2833,2,1,0,0,0
2,1,3,female,26.0,0,0,S,7.925,2,0,0,1,0
3,1,1,female,35.0,1,0,S,53.1,2,0,0,1,0
4,0,3,male,35.0,0,0,S,8.05,2,0,0,1,1


In [103]:
# Remove the raw 'sex' and 'embarked' columns in place; columns that are
# already gone are skipped rather than raising.
ndf.drop(columns=['sex', 'embarked'], errors='ignore', inplace=True)

ndf.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,age_range,Embarked_C,Embarked_Q,Embarked_S,isMale
0,0,3,22.0,1,0,7.25,1,0,0,1,1
1,1,1,38.0,1,0,71.2833,2,1,0,0,0
2,1,3,26.0,0,0,7.925,2,0,0,1,0
3,1,1,35.0,1,0,53.1,2,0,0,1,0
4,0,3,35.0,0,0,8.05,2,0,0,1,1


In [104]:
# Print the dtype and non-null count of every column in ndf.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 884 entries, 0 to 890
Data columns (total 11 columns):
survived      884 non-null int64
pclass        884 non-null int64
age           884 non-null float64
sibsp         884 non-null int64
parch         884 non-null int64
fare          884 non-null float64
age_range     884 non-null category
Embarked_C    884 non-null uint8
Embarked_Q    884 non-null uint8
Embarked_S    884 non-null uint8
isMale        884 non-null int32
dtypes: category(1), float64(2), int32(1), int64(4), uint8(3)
memory usage: 55.4 KB


In [106]:
# Add one more variable: familySize = parch + sibsp
ndf['familySize'] = ndf['parch'] + ndf['sibsp']
# Only the last expression of a cell is displayed, so the discarded
# ndf.head() call that used to sit here was removed.
ndf.values

array([[0, 3, 22.0, ..., 1, 1, 1],
       [1, 1, 38.0, ..., 0, 0, 1],
       [1, 3, 26.0, ..., 1, 0, 0],
       ...,
       [0, 3, 25.0, ..., 1, 0, 3],
       [1, 1, 26.0, ..., 0, 1, 0],
       [0, 3, 32.0, ..., 0, 1, 0]], dtype=object)

### Training and Testing Split

For training and testing purposes, let's gather the data we have and use 80% of the instances for training and the remaining 20% for testing. Moreover, let's repeat this random split three times. We will use the repeated hold-out (shuffle-split) cross-validation method built into scikit-learn, `ShuffleSplit`.

In [None]:
from sklearn.model_selection import ShuffleSplit

# Build the label vector y and the feature matrix X from the encoded frame.
if 'survived' in ndf:
    y = ndf['survived'].values  # the labels we want to predict
    del ndf['survived']  # remove the class label from the features
    # Take the features from ndf (cleaned and encoded), not from the raw df:
    # df still contains string columns, missing values and the label itself,
    # and it has more rows than ndf, so it would not line up with y.
    X = ndf.values  # use everything else to predict

    # X and y are now NumPy arrays: calling .values on the pandas objects
    # converts them into plain arrays that scikit-learn can consume.


# To use the cross-validation object in scikit-learn, create an instance and
# configure it. This object splits the data into training and testing sets.
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size=0.2)

print(cv_object)