In [38]:
# 데이터 확인

import numpy as np
import pandas as pd

train = pd.read_csv('./Data/titanic/train.csv') # train 데이터
test = pd.read_csv('./Data/titanic/test.csv')   # test 데이터
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [39]:
# null값이 많은 데이터 확인

train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [40]:
# Null 값 많은 Ticket, Cabin 열 삭제

train = train.drop(['Ticket','Cabin'],axis = 1) # Ticket, Cabin 열 삭제
test = test.drop(['Ticket','Cabin'],axis = 1)   # Ticket, Cabin 열 삭제
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [41]:
# Embarked 열의 구성 요소 확인

s = train[train["Embarked"] == "S"].shape[0]
print("S : ", s)
c = train[train["Embarked"] == "C"].shape[0]
print("C : ", c)
q = train[train["Embarked"] == "Q"].shape[0]
print("Q : ", q)

S :  644
C :  168
Q :  77


In [42]:
# Embarked에서 s가 가장 많으므로 NaN값을 S로 대체

train = train.fillna({"Embarked" : "S"})

# Embarked의 S,Q,C를 1,2,3으로 대체
embarked_mapping = {"S":1,"C":2,"Q":3}
train["Embarked"] = train["Embarked"].map(embarked_mapping)
test["Embarked"] = test["Embarked"].map(embarked_mapping)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1


In [43]:
# Name열 문자열 파싱

combine = [train, test]

for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.',expand= False)

# Name열 파싱을 Sex열과 함께 나열
pd.crosstab(train['Title'],train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [44]:
# Name열 구성요소를 6개로 대체

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady','Capt','Col','Don','Dr','Major','Rev','Jonkheer','Dona'],'Rare')
    
    dataset['Title'] = dataset['Title'].replace(['Countess','Lady','Sir'],'Royal')
    dataset['Title'] = dataset['Title'].replace('Mile','Miss')
    dataset['Title'] = dataset['Title'].replace('Ms','Miss')
    dataset['Title'] = dataset['Title'].replace('Mme','Mrs')

train[['Title','Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.699454
2,Mlle,1.0
3,Mr,0.156673
4,Mrs,0.793651
5,Rare,0.285714
6,Royal,1.0


In [45]:
# Name열 구성요소를 6개로 대체

title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Royal":5,"Rare":6}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,1,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,2,3.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,1,3.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1,1.0


In [46]:
# Name, PassenerId 열 삭제

train = train.drop(['Name','PassengerId'],axis =1)
test = test.drop(['Name','PassengerId'],axis =1)
combine = [train,test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,1,1.0
1,1,1,female,38.0,1,0,71.2833,2,3.0
2,1,3,female,26.0,0,0,7.925,1,2.0
3,1,1,female,35.0,1,0,53.1,1,3.0
4,0,3,male,35.0,0,0,8.05,1,1.0


In [47]:
# Sex열의 구성요소를 0,1로 대체

sex_mapping = {"male":0,"female":1}
for dataset in combine :
    dataset['Sex'] = dataset["Sex"].map(sex_mapping)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,1,1.0
1,1,1,1,38.0,1,0,71.2833,2,3.0
2,1,3,1,26.0,0,0,7.925,1,2.0
3,1,1,1,35.0,1,0,53.1,1,3.0
4,0,3,0,35.0,0,0,8.05,1,1.0


In [49]:
# Age 값 정리
train['Age'] = train['Age'].fillna(-0.5)
test['Age'] = test['Age'].fillna(-0.5)
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown','Baby','Child','Teenager','Student','Young Adult','Adult','Senior']

# Age 값 정리 : Cut 함수로 각 구간을 득정 값으로 정의
train['AgeGroup'] = pd.cut(train["Age"],bins, labels= labels)
test['AgeGroup'] = pd.cut(test["Age"],bins, labels=labels)
print(train.head())

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Title  \
0         0       3    0  22.0      1      0   7.2500         1    1.0   
1         1       1    1  38.0      1      0  71.2833         2    3.0   
2         1       3    1  26.0      0      0   7.9250         1    2.0   
3         1       1    1  35.0      1      0  53.1000         1    3.0   
4         0       3    0  35.0      0      0   8.0500         1    1.0   

      AgeGroup  
0      Student  
1        Adult  
2  Young Adult  
3  Young Adult  
4  Young Adult  


In [50]:
age_title_mapping = {1:"Young Adult",2:"Student",3:"Adult",4:"Baby",5:"Adult",6:"Adult",}
for x in range(len(train["AgeGroup"])):
    if train["AgeGroup"][x] == "Unknown" :
        train["AgeGroup"][x] = age_title_mapping[train["Title"][x]]

for x in range(len(test["AgeGroup"])):
    if test["AgeGroup"][x] == "Unknown" :
        test["AgeGroup"][x] = age_title_mapping[test["Title"][x]]

In [51]:
age_mappaing = {"Baby":1,"Child":2,"Teenager":3,"Student":4,"Young Adult":5,"Adult":6,"Senior":7}
train['AgeGroup'] = train['AgeGroup'].map(age_mappaing)
test['AgeGroup'] = test['AgeGroup'].map(age_mappaing)

train = train.drop(['Age'],axis =1)
test = test.drop(['Age'],axis =1)
print(train.head())

   Survived  Pclass  Sex  SibSp  Parch     Fare  Embarked  Title  AgeGroup
0         0       3    0      1      0   7.2500         1    1.0         4
1         1       1    1      1      0  71.2833         2    3.0         6
2         1       3    1      0      0   7.9250         1    2.0         5
3         1       1    1      1      0  53.1000         1    3.0         5
4         0       3    0      0      0   8.0500         1    1.0         5


In [52]:
# Fare 4개의 범위로 구분

train['FareBand'] = pd.qcut(train['Fare'], 4, labels= [1,2,3,4])
test['FareBand'] = pd.qcut(train['Fare'],4, labels=[1,2,3,4])

train = train.drop(['Fare'],axis = 1)
test = test.drop(['Fare'],axis=1)

train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeGroup,FareBand
0,0,3,0,1,0,1,1.0,4,1
1,1,1,1,1,0,2,3.0,6,4
2,1,3,1,0,0,1,2.0,5,2
3,1,1,1,1,0,1,3.0,5,4
4,0,3,0,0,0,1,1.0,5,2
