In [5]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# 5개 베이스 모델 사용
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier)

from sklearn.svm import SVC
from sklearn.model_selection import KFold

In [3]:
! pip install plotly



# Feature Exploration, Engineering and Cleaning

In [20]:
# Load the Titanic training and test sets from CSV
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/Titanic/train.csv", "Dataset/Titanic/test.csv")
)

# Keep the test-set PassengerId column in its own variable
PassengerId = test['PassengerId']

# Preview the first three training rows
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Feature Engineering

In [21]:
# Hold both frames in one list so the feature-engineering cells below can loop
# over them; the list stores the same DataFrame objects, so columns assigned
# inside those loops land on `train` and `test` themselves.
full_data = [train,test]

In [22]:
# Extra engineered features, built the same way for both frames
for dataset in (train, test):
    # Length of the passenger's name, in characters
    dataset['Name_length'] = dataset['Name'].apply(len)

    # Has_Cabin: 1 when a Cabin value is present, 0 when it is missing.
    # notna() tests for missing values directly, instead of inferring
    # missingness from the Python type of each cell (type(x) == float),
    # which only works when every missing value happens to be a float NaN.
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype('int64')

In [23]:
# Family-related features and Embarked imputation, one pass per frame
for dataset in full_data:
    # FamilySize = siblings/spouses + parents/children + the passenger
    family_size = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['FamilySize'] = family_size

    # IsAlone is 1 when the passenger is the only family member, else 0
    dataset['IsAlone'] = (family_size == 1).astype('int64')

    # Missing Embarked values are filled with 'S'
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [24]:
# Fill missing Fare values in both frames with the median fare of the
# training set. The median is computed once, before the loop, so both frames
# are filled with the same value.
train_fare_median = train['Fare'].median()
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

# Exploratory fare category on the training set, split into four
# equal-frequency (quantile) bins. pd.cut(..., 4) would create four
# equal-WIDTH bins instead, which does not correspond to the unevenly spaced
# Fare edges (7.91 / 14.45 / 31) used when Fare is encoded later in this
# notebook.
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

In [25]:
# Impute missing Age values with random integers drawn from
# [mean - std, mean + std) of each frame, then convert Age to int.
# A dedicated, seeded generator keeps the imputed ages identical across
# Restart & Run All without touching NumPy's global random state.
age_rng = np.random.RandomState(0)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_is_null = dataset['Age'].isnull()
    age_null_count = age_is_null.sum()
    # Nothing to draw or assign when no ages are missing
    if age_null_count > 0:
        # randint works on integer bounds; truncate the float bounds explicitly
        age_null_random_list = age_rng.randint(
            int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
        )
        # One .loc assignment on the frame itself. The chained form
        # dataset['Age'][mask] = ... may write to a temporary object and leave
        # the NaNs in place, which then makes astype(int) fail.
        dataset.loc[age_is_null, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Exploratory age category on the training set: five equal-width bins
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# Helper that extracts the title (honorific) from a passenger name
def get_title(name):
    """Return the title contained in ``name``, or "" when none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".
    """
    # Raw string literal: in a normal string "\." is an invalid escape
    # sequence, which Python reports with a warning at compile time.
    title_search = re.search(r" ([A-Za-z]+)\.", name)
    # Return the captured title when the pattern matched
    if title_search:
        return title_search.group(1)
    return ""

# Build the Title feature from Name and normalise it in one pass per frame:
# uncommon titles are grouped as 'Rare', Mlle/Ms become Miss, Mme becomes Mrs.
for dataset in full_data:
    dataset['Title'] = (
        dataset['Name']
        .apply(get_title)
        .replace(
            ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
             'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
            'Rare',
        )
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace('Mme', 'Mrs')
    )

# Integer codes for the known titles; any other title ends up as 0 below
title_mapping = {"Mr": 1, "Miss": 2, 'Mrs': 3, 'Master': 4, "Rare": 5}

for dataset in full_data:
    # Sex: female -> 0, male -> 1
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Title: mapped through title_mapping, unmapped titles filled with 0
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

    # Embarked: S -> 0, C -> 1, Q -> 2
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Fare -> 0..3 using right-closed bins:
    # (-inf, 7.91], (7.91, 14.45], (14.45, 31], (31, inf)
    fare_codes = pd.cut(
        dataset['Fare'],
        bins=[-np.inf, 7.91, 14.45, 31, np.inf],
        labels=False,
    )
    dataset['Fare'] = fare_codes.astype(int)

    # Age -> 0..4 using right-closed bins:
    # (-inf, 16], (16, 32], (32, 48], (48, 64], (64, inf)
    # The column keeps the dtype it had before the encoding.
    age_codes = pd.cut(
        dataset['Age'],
        bins=[-np.inf, 16, 32, 48, 64, np.inf],
        labels=False,
    )
    dataset['Age'] = age_codes.astype(dataset['Age'].dtype)
    
    

In [None]:
# Widen the notebook container to 95% of the browser window.
# display/HTML come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:95% !important;}</style>"))