## 데이터 전처리 연습 문제
타이타닉 데이터를 전처리하라. (titanic_train.csv)

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the Titanic training set from the CSV file in the working directory
# and preview its first five rows to inspect the available columns.
titanic_df = pd.read_csv("titanic_train.csv")
titanic_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 결측치(미싱 데이터) 처리

In [3]:
# Count the missing (NaN) entries in every column to see which features
# need imputation.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Replace the missing Age entries with the mean of the observed ages.
# The imputer is fed a 2-D input, so the 1-D Age array is viewed as a
# single-column matrix before fitting and transforming.
_age = titanic_df['Age'].values
age_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
titanic_df['Age'] = age_imputer.fit_transform(_age[:, np.newaxis])
titanic_df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [5]:
# Replace the missing Embarked entries with the most frequent port label.
# As with Age, the 1-D column is viewed as a single-column matrix for the
# imputer.
_embarked = titanic_df['Embarked'].values
embarked_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
titanic_df['Embarked'] = embarked_imputer.fit_transform(_embarked[:, np.newaxis])
titanic_df['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [6]:
# Re-count the missing entries per column to confirm that the Age and
# Embarked imputations left no NaN values in those columns.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [12]:
# Target vector: the survival label of every passenger.
y = titanic_df['Survived'].values

# Numeric feature matrix: whatever remains after removing the identifier,
# the target, the Cabin/Ticket/Name columns, and the two categorical
# columns (Embarked, Sex) that are encoded separately.
x_0 = titanic_df.drop(
    columns=['PassengerId', 'Survived', 'Cabin',
             'Ticket', 'Name', 'Embarked', 'Sex'],
).values
x_0.shape

(891, 5)

In [None]:
# Standardize the Pclass, Age, SibSp, Parch and Fare values with
# StandardScaler: fit the scaler on x_0 and replace x_0 with its scaled
# version in a single step.
titanic_sc = StandardScaler()
x_0 = titanic_sc.fit_transform(x_0)
x_0

In [None]:
# 'Sex': turn the string labels into integer codes with LabelEncoder.
# The encoder first learns the set of labels, then maps each passenger's
# label to its integer code.
sex_le = LabelEncoder()
sex_le.fit(titanic_df['Sex'].values)
x_sex = sex_le.transform(titanic_df['Sex'].values)
x_sex

In [None]:
# 'Embarked': one-hot encode the port label into one indicator column per
# category, in the fixed order C, Q, S. The encoder output is converted to
# a dense array so it can be joined with the other feature arrays.
embarked_list = ['C', 'Q', 'S']
embarked_ohe = OneHotEncoder(
    handle_unknown='ignore',
    categories=[embarked_list],
)
x_embarked = embarked_ohe.fit_transform(
    titanic_df['Embarked'].values[:, np.newaxis]
).toarray()
x_embarked

In [23]:
# Column labels for the assembled feature matrix, in the same order as the
# arrays are joined below.
columns = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',  # scaled numeric features
    'Sex',                                       # label-encoded
    'Embarked_C', 'Embarked_Q', 'Embarked_S',    # one-hot indicators
]
# column_stack turns the 1-D x_sex array into a single column before joining
# it side by side with the 2-D feature blocks.
x = np.column_stack([x_0, x_sex, x_embarked])
preprocced_titanic_df = pd.DataFrame(data=x, columns=columns)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
0,0.827377,-5.924806e-01,0.432793,-0.473674,-0.502445,1.0,0.0,0.0,1.0
1,-1.566107,6.387890e-01,0.432793,-0.473674,0.786845,0.0,1.0,0.0,0.0
2,0.827377,-2.846632e-01,-0.474545,-0.473674,-0.488854,0.0,0.0,0.0,1.0
3,-1.566107,4.079260e-01,0.432793,-0.473674,0.420730,0.0,0.0,0.0,1.0
4,0.827377,4.079260e-01,-0.474545,-0.473674,-0.486337,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,-0.369365,-2.077088e-01,-0.474545,-0.473674,-0.386671,1.0,0.0,0.0,1.0
887,-1.566107,-8.233437e-01,-0.474545,-0.473674,-0.044381,0.0,0.0,0.0,1.0
888,0.827377,-2.232906e-16,0.432793,2.008933,-0.176263,0.0,0.0,0.0,1.0
889,-1.566107,-2.846632e-01,-0.474545,-0.473674,-0.044381,1.0,1.0,0.0,0.0
