## 생존여부 예측모델 만들기
### 학습용 데이터 (X_train, y_train)을 이용하여 생존 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 accuracy 평가지표에 따라 채점)

(가) 제공 데이터 목록
- y_train: 생존여부(학습용)
- X_train, X_test : 승객 정보 (학습용 및 평가용)

(나) 데이터 형식 및 내용
- y_train (712명 데이터)

**시험환경 세팅은 예시문제와 동일한 형태의 X_train, y_train, X_test 데이터를 만들기 위함임**

### 유의사항
- 성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링, 분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.
- 수험번호.csv파일이 만들어지도록 코드를 제출한다.
- 제출한 모델의 성능은 accuracy로 평가함

csv 출력형태

![image.png](attachment:de1920de-121e-47c3-a61f-e905386713bf.png)

## [참고]작업형2 문구
- 출력을 원하실 경우 print() 함수 활용
- 예시) print(df.head())
- getcwd(), chdir() 등 작업 폴더 설정 불필요
- 파일 경로 상 내부 드라이브 경로(C: 등) 접근 불가

### 데이터 파일 읽기 예제
- import pandas as pd
- X_test = pd.read_csv("data/X_test.csv")
- X_train = pd.read_csv("data/X_train.csv")
- y_train = pd.read_csv("data/y_train.csv")

### 사용자 코딩

### 답안 제출 참고
- 아래 코드 예측변수와 수험번호를 개인별로 변경하여 활용
- pd.DataFrame({'cust_id': X_test.cust_id, 'gender': pred}).to_csv('003000000.csv', index=False)

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test feature and label frames.

    Args:
        df: Source DataFrame holding the features and the ``target`` column.
        target: Name of the label column.
        id_name: Name of an existing identifier column. When empty, the
            index is reset and exposed as a new ``"id"`` column.
        null_name: Placeholder value that marks missing data. When non-empty,
            every cell equal to it is replaced with NaN.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames hold
        every column except ``target``; the ``y_*`` frames hold the
        identifier column and ``target``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # mask() returns a new frame, so the caller's DataFrame is never
        # modified in place.
        df = df.mask(df == null_name)

    # Fixed 80/20 split with a fixed seed so every run yields the same rows.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the raw Titanic training file and carve it into exam-style frames.
df = pd.read_csv("../input/titanic/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(
    df,
    target='Survived',
    id_name='PassengerId',
)

# Show the shape of each of the four frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 11), (179, 11), (712, 2), (179, 2))

In [2]:
# Render the training features first, then the training labels.
for frame in (X_train, y_train):
    display(frame)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
90,91,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.0500,,S
103,104,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
577,578,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39.0,1,0,13507,55.9000,E44,S
215,216,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.2750,D36,C
191,192,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...
621,622,1,"Kimball, Mr. Edwin Nelson Jr",male,42.0,1,0,11753,52.5542,D19,S
128,129,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
57,58,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
341,342,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0000,C23 C25 C27,S


Unnamed: 0,PassengerId,Survived
90,91,0
103,104,0
577,578,1
215,216,1
191,192,0
...,...,...
621,622,1
128,129,1
57,58,0
341,342,1


In [3]:
# Count missing values per column in the training features.
X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            137
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          542
Embarked         1
dtype: int64

In [4]:
# Count missing values per column in the evaluation features.
X_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             40
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          145
Embarked         1
dtype: int64

In [5]:
# Age preprocessing: make sure the column is float64 in both frames.
# astype() returns a new Series, so the result is assigned back to the column
# instead of being discarded.
X_train['Age'] = X_train['Age'].astype('float64')
X_test['Age'] = X_test['Age'].astype('float64')
X_test['Age']

210    24.0
876    20.0
666    25.0
819    10.0
736    48.0
       ... 
494    21.0
871    47.0
530     2.0
157    30.0
45      NaN
Name: Age, Length: 179, dtype: float64

In [6]:
# Fill missing Age values with the training-set mean. The same value is used
# for X_test, so no statistic is computed from the evaluation data.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on an attribute-accessed column operates on a
# temporary under pandas copy-on-write and would leave the frame unchanged.
age_mean = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(age_mean)
X_test['Age'] = X_test['Age'].fillna(age_mean)

In [7]:
# Embarked preprocessing: see how often each category occurs in training.
X_train['Embarked'].value_counts()

S    514
C    141
Q     56
Name: Embarked, dtype: int64

In [8]:
# Fill missing Embarked values with 'S', the most frequent training category.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on an attribute-accessed column operates on a
# temporary under pandas copy-on-write and would leave the frame unchanged.
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')

In [9]:
# Label-encode the Embarked column using one mapping shared by both frames.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
X_train['Embarked'] = X_train['Embarked'].map(embarked_codes)
X_test['Embarked'] = X_test['Embarked'].map(embarked_codes)

In [10]:
# Remove columns that are unlikely to help training.
cols = ['PassengerId', 'Cabin', 'Name', 'Ticket']
X_train_drop, X_test_drop = (
    frame.drop(columns=cols) for frame in (X_train, X_test)
)

In [11]:
# Sex preprocessing: label-encode with one mapping shared by both frames.
sex_codes = {'male': 0, 'female': 1}
X_train_drop['Sex'] = X_train_drop['Sex'].map(sex_codes)
X_test_drop['Sex'] = X_test_drop['Sex'].map(sex_codes)

In [12]:
# Confirm that every remaining column is numeric and has no missing values.
X_train_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 90 to 116
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    int64  
 2   Age       712 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  712 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 44.5 KB


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Target vector: the Survived labels that accompany the training features.
y = y_train['Survived']

# Hold out 20% of the training rows for validation.
x_train, x_val, Y_train, Y_val = train_test_split(
    X_train_drop,
    y,
    test_size=0.2,
    random_state=43,
)

# fit() returns the estimator, so construction and training can be chained.
rf = RandomForestClassifier(random_state=99).fit(x_train, Y_train)

xgb = XGBClassifier()
xgb.fit(x_train, Y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# Evaluate both models: print training accuracy, then validation accuracy.
for label, clf in (('랜덤 포레스트', rf), ('XGBoost', xgb)):
    print(label)
    print(accuracy_score(Y_train, clf.predict(x_train)))
    print(accuracy_score(Y_val, clf.predict(x_val)))

랜덤 포레스트
0.9912126537785588
0.8251748251748252
XGBoost
0.984182776801406
0.8391608391608392


In [15]:
# Pair each test passenger id with the XGBoost prediction and save it.
submission = pd.DataFrame(
    {
        'PassengerId': X_test['PassengerId'],
        'Survived': xgb.predict(X_test_drop),
    }
)
submission.to_csv('000000000.csv', index=False)

In [16]:
# Read the submission back through the same relative file name it was
# written with, so the check does not depend on an absolute directory.
result = pd.read_csv('000000000.csv')
result

Unnamed: 0,PassengerId,Survived
0,211,0
1,877,0
2,667,0
3,820,0
4,737,1
...,...,...
174,495,0
175,872,1
176,531,1
177,158,0


In [17]:
# Score the random forest against the held-out y_test labels.
rf.score(X_test_drop, y_test['Survived'])

0.7541899441340782

In [18]:
# Score the XGBoost model against the held-out y_test labels.
xgb.score(X_test_drop, y_test['Survived'])

0.770949720670391

# 풀이
---

## Start

## 라이브러리 및 데이터 불러오기

In [19]:
# 라이브러리 불러오기
import pandas as pd

In [20]:
# Data loading is omitted here; the frames created earlier are reused.
X_train.shape, y_train.shape, X_test.shape

((712, 11), (712, 2), (179, 11))

## EDA

In [21]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
90,91,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,,0
103,104,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,0
577,578,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39.0,1,0,13507,55.9,E44,0
215,216,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,1
191,192,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0,,0


In [22]:
# dtypes at this point in the notebook: float64(2), int64(5), object(4)
# (Embarked is already integer-encoded by an earlier cell).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 90 to 116
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        170 non-null    object 
 10  Embarked     712 non-null    int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 66.8+ KB


In [23]:
# Preview the first rows of the training labels.
y_train.head()

Unnamed: 0,PassengerId,Survived
90,91,0
103,104,0
577,578,1
215,216,1
191,192,0


In [24]:
# Number of training rows per Survived class (counts, not proportions).
y_train['Survived'].value_counts()

0    441
1    271
Name: Survived, dtype: int64

## 데이터 전처리

In [25]:
# Target labels for training.
y = y_train["Survived"]

# Of the selected features only Sex is non-numeric, so it is the only one
# that get_dummies expands into indicator columns.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(X_train[features])
# Encode the test set separately, then align it to the training columns:
# a category absent from X_test becomes an all-zero column and one unseen in
# training is dropped, so the model always receives the layout it was fit on.
test = pd.get_dummies(X_test[features]).reindex(columns=X.columns, fill_value=0)

In [26]:
# Both frames must have the same number of columns for predict() to work.
X.shape, test.shape

((712, 5), (179, 5))

## 모델 및 평가

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 200 trees capped at depth 7 and a fixed seed.
# fit() returns the estimator, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    random_state=2021,
).fit(X, y)
predictions = model.predict(test)

In [28]:
# Accuracy on the same rows the model was fit on (training accuracy only).
model.score(X, y)

0.8356741573033708

In [29]:
# Pair each test passenger id with its predicted label.
output = pd.DataFrame(
    {
        'PassengerId': X_test['PassengerId'],
        'Survived': predictions,
    }
)
output.head()

Unnamed: 0,PassengerId,Survived
210,211,0
876,877,0
666,667,0
819,820,0
736,737,0


In [30]:
# Write the predictions to <exam number>.csv without the index column.
output.to_csv('1234567.csv', index=False)

## 결과 채점 (수험자는 알 수 없는 부분임)

In [31]:
# Accuracy of the solution model against the held-out y_test labels.
model.score(test, y_test['Survived'])

0.7318435754189944