# 머신러닝 프로젝트 

## 타이타닉 생존자 예측모델 개발

- 데이터 : 
  - 제공 데이터 파일 : titanic3.csv
  - 훈련/검증용 데이터 : 평가 데이터 = 8 : 2
  - 훈련/검증용 데이터로 모델 학습 및 검증하고 평가 데이터는 최종 평가에만 사용


- 모델 개발 방법 :
  - 데이터 전처리 및 탐색적 분석을 통하여 파생변수 최소 2개 이상 개발
  - 알고리즘은 최소한 3개 이상 적용(Decision Tree, Random Forest, Logistic Regression은 필수)


- 훈련 및 평가 방법 :
  - GridSearchCV API를 활용하여 교차검증으로 최적 하이퍼파라미터를 찾아 학습 및 검증 수행


- 결과물 제출 : 이메일
 


In [0]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [0]:
# Load the provided dataset (titanic3.csv, resolved relative to the working
# directory) into a DataFrame for the exploratory cells below.
titanic = pd.read_csv('titanic3.csv')

In [4]:
# First look at the raw data: random rows, schema, summary statistics and
# per-column missing-value counts.
display(titanic.sample(15))
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
titanic.info()
print()
print(titanic.describe(), '\n')
# .sum must be *called*; passing the bound method would display the method
# object rather than the null counts.
display(titanic.isnull().sum())

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S,9.0,,"Los Angeles, CA"
1073,3,0,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q,,,
489,2,1,"Louch, Mrs. Charles Alexander (Alice Adelaide ...",female,42.0,1,0,SC/AH 3085,26.0,,S,,,"Weston-Super-Mare, Somerset"
183,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,3.0,,
1119,3,0,"Perkin, Mr. John Henry",male,22.0,0,0,A/5 21174,7.25,,S,,,
822,3,0,"Goldsmith, Mr. Nathan",male,41.0,0,0,SOTON/O.Q. 3101263,7.85,,S,,,"Philadelphia, PA"
1127,3,0,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S,,,
48,1,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53.0,0,0,PC 17606,27.4458,,C,6.0,,"Washington, DC"
920,3,0,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q,,,
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB
None 

            pclass     survived  ...         fare        body
count  1309.000000  1309.000000  ...  1308.000000  121.000000
mean      2.294882     0.381971  ...    33.295479  160.809917
std       0.837836     0.486055  ...    51.758668   97.696922
min       1.000000     0.000000  ...     0.000000    1.000000
25%       2.000000     0.000000  ...     

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

# Preprocessing

In [55]:
# Reload the raw CSV before preprocessing. The helpers below modify the frame
# in place, so this presumably lets the cell be re-run from unmodified data.
titanic = pd.read_csv('titanic3.csv')

def nulls(df):
    """Fill missing ``age`` values and turn ``boat`` into a 0/1 indicator.

    The frame is modified in place.

    Args:
        df: DataFrame containing at least the ``age`` and ``boat`` columns.

    Returns:
        The same ``df`` object, after modification.
    """
    # Missing ages are replaced by the column mean, rounded to 2 decimals.
    mean_age = round(df['age'].mean(), 2)
    # Assign the filled column back rather than calling
    # df['age'].fillna(..., inplace=True): that chained form operates on an
    # intermediate Series and does not update df under pandas copy-on-write.
    df['age'] = df['age'].fillna(mean_age)
    # 1 where ``boat`` held a value, 0 where it was missing.
    df['boat'] = df['boat'].notnull().astype(int)
    return df

def drops(df):
    """Drop a fixed set of columns from ``df`` in place.

    Removes ``name``, ``cabin``, ``body``, ``home.dest``, ``ticket`` and
    ``embarked``.

    Args:
        df: DataFrame that contains all of the columns listed above.

    Returns:
        The same ``df`` object, without those columns.
    """
    unwanted = ['name', 'cabin', 'body', 'home.dest', 'ticket', 'embarked']
    df.drop(columns=unwanted, inplace=True)
    return df

def formatting(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each selected column is replaced by the output of a fresh
    ``LabelEncoder().fit_transform`` applied to that column.

    Args:
        df: DataFrame to encode.

    Returns:
        The same ``df`` object, with its object-dtype columns encoded.
    """
    # Collect the target columns first; they are reassigned in the loop below.
    object_columns = []
    for column in df.columns.values:
        if df[column].dtype == 'object':
            object_columns.append(column)

    for column in object_columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

def prepro(df):
    """Run the preprocessing steps in order: ``nulls``, ``drops``, ``formatting``.

    Args:
        df: Raw DataFrame to preprocess.

    Returns:
        The value returned by ``formatting`` after all three steps.
    """
    filled = nulls(df)
    trimmed = drops(filled)
    return formatting(trimmed)


# Apply the preprocessing pipeline to the freshly loaded frame.
titanic = prepro(titanic)
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
titanic.info()
display(titanic.sample(10))

# Sum of `survived` within each (pclass, boat) group.
display(titanic.groupby(['pclass', 'boat'])['survived'].sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null int64
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
fare        1308 non-null float64
boat        1309 non-null int64
dtypes: float64(2), int64(6)
memory usage: 81.9 KB
None


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,boat
989,3,0,1,29.88,0,0,8.05,0
481,2,1,0,22.0,1,2,41.5792,1
44,1,1,0,41.0,0,0,134.5,1
564,2,1,0,40.0,0,0,13.0,1
428,2,1,0,24.0,0,2,14.5,1
1154,3,0,1,29.88,0,0,8.05,0
998,3,0,1,29.88,0,0,7.8958,0
806,3,0,0,21.0,2,2,34.375,0
107,1,1,0,29.88,0,0,31.6833,1
195,1,1,0,16.0,0,0,86.5,1


pclass  boat
1       0         1
        1       199
2       0         8
        1       111
3       0        14
        1       167
Name: survived, dtype: int64