In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# directory by directory, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

%matplotlib inline 
# The %matplotlib inline magic above makes figures render directly in the notebook output.

titanic_df = pd.read_csv('/kaggle/input/titanic/train.csv') # load the training CSV from the Kaggle input directory into a DataFrame
titanic_df.head(3) # preview only the first 3 rows

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the summary.
titanic_df.info()

In [None]:
# Summary statistics of the numeric columns, one row per column for readability.
titanic_df.describe().T

In [None]:
# Count the missing values in each column before any imputation is applied.
null_counts_before = titanic_df.isna().sum()
print('결측지 처리전 데이터 세트 Null 값 갯수 \n', null_counts_before)

In [None]:
# Impute missing values in the training frame.
# Each result is assigned back to its column on purpose: calling
# fillna(..., inplace=True) on the Series returned by titanic_df[col] is
# chained assignment, which recent pandas versions warn about and which does
# not update the parent frame once Copy-on-Write is in effect.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())  # mean age
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')  # 'N' is the sentinel for a missing value
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')
print('결측지 처리후 데이터 세트 Null 값 갯수 ',titanic_df.isnull().sum())

In [None]:
# Remove features that are not used for training or prediction.
def drop_features(df):
    """Return ``df`` without the ``Name`` and ``Ticket`` columns.

    A new DataFrame is returned and the caller's frame is left untouched, so
    re-running a cell that calls this on the same frame gives the same result
    (the previous ``inplace=True`` version altered the argument as a side
    effect).

    Args:
        df: DataFrame containing ``Name`` and ``Ticket`` columns.

    Returns:
        A new DataFrame with every other column of ``df``, in the same order.

    Raises:
        KeyError: if ``Name`` or ``Ticket`` is not a column of ``df``.
    """
    return df.drop(columns=['Name', 'Ticket'])

# Label-encode the categorical features.
def format_features(df):
    """Shorten ``Cabin`` to its first character and integer-encode the categoricals.

    ``Cabin``, ``Sex`` and ``Embarked`` are each replaced by integer codes from
    a ``LabelEncoder`` fit on that column. Missing values are mapped to the
    string ``'N'`` (the sentinel this notebook uses when imputing) before
    encoding, because NaN cannot be ordered against strings and its handling
    by ``LabelEncoder`` is not something to rely on.

    The work is done on a copy; the caller's frame is not modified.

    NOTE: a fresh encoder is fit on every call, so the integer codes are only
    comparable between two frames when both contain exactly the same set of
    values in each encoded column.

    Args:
        df: DataFrame with string-valued ``Cabin``, ``Sex`` and ``Embarked``
            columns.

    Returns:
        A new DataFrame with those three columns replaced by integer codes.
    """
    df = df.copy()
    df['Cabin'] = df['Cabin'].str[:1]
    for feature in ('Cabin', 'Sex', 'Embarked'):
        # A missing entry stays NaN after .str[:1]; give it the sentinel label.
        values = df[feature].fillna('N')
        df[feature] = LabelEncoder().fit_transform(values)
    return df

# Preprocessing entry point.
def transform_features(df):
    """Apply the preprocessing steps in order: drop_features, then format_features.

    Args:
        df: raw feature DataFrame accepted by drop_features.

    Returns:
        The frame returned by format_features for the output of drop_features.
    """
    return format_features(drop_features(df))

In [None]:
# Separate the target column from the features, preprocess the features, and
# hold out 20% of the rows for evaluation (fixed seed for reproducibility).
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Display the hold-out feature rows returned by train_test_split.
X_test

In [None]:
# Load and preprocess the Kaggle test set so that it has the same columns and
# the same integer encoding as the training features.
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data = drop_features(test_data)

# Impute BEFORE encoding. Previously the frame was encoded first, which made
# the later Cabin/Embarked fillna('N') calls no-ops and let NaN reach the encoder.
test_data['Cabin'] = test_data['Cabin'].fillna('N').str[:1]
test_data['Embarked'] = test_data['Embarked'].fillna('N')
# Fill every remaining numeric gap (Age and any other numeric column) with that
# column's mean. This replaces dropna() followed by a hand-written extra row:
# that approach removed real passengers and put a fabricated PassengerId into
# the submission just to restore the row count.
test_data = test_data.fillna(test_data.mean(numeric_only=True))

# Encode with encoders fit on the TRAINING values so that a given category gets
# the same integer in both frames. Fitting on the test frame alone shifts the
# codes whenever the two frames do not contain the same set of categories.
# transform() raises ValueError for a category that never occurs in training.
train_categories = {
    'Cabin': titanic_df['Cabin'].fillna('N').str[:1],
    'Sex': titanic_df['Sex'],
    'Embarked': titanic_df['Embarked'].fillna('N'),
}
for feature, train_values in train_categories.items():
    encoder = LabelEncoder().fit(train_values)
    test_data[feature] = encoder.transform(test_data[feature])

# Every row is kept, so nothing may be left missing at this point.
assert not test_data.isnull().any().any(), 'test_data still contains missing values'
test_data

In [None]:
# Create the three scikit-learn classifiers: decision tree, random forest and
# logistic regression.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

# Decision tree: fit on the training split, score on the hold-out split.
dt_pred = dt_clf.fit(X_train, y_train).predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print('DecisionTreeClassifier 정확도: {0:.4f}'.format(dt_accuracy))

# Random forest: same procedure.
rf_pred = rf_clf.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print('RandomForestClassifier 정확도:{0:.4f}'.format(rf_accuracy))

# Logistic regression: same procedure.
lr_pred = lr_clf.fit(X_train, y_train).predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print('LogisticRegression 정확도: {0:.4f}'.format(lr_accuracy))


In [None]:
# 5-fold cross-validation of the decision tree on the full preprocessed
# training frame; report each fold's accuracy and then the average.
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)

for fold_index, fold_accuracy in enumerate(scores):
    print("교차 검증 {0} 정확도: {1:.4f}".format(fold_index, fold_accuracy))

mean_accuracy = scores.mean()
print("평균 정확도: {0:.4f}".format(mean_accuracy))

In [None]:
# Hyper-parameter grid to search for the decision tree.
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Grid search over the decision tree with 5-fold CV, scored by accuracy.
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

print('GridSearchCV 최적 하이퍼 파라미터 :', grid_dclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# Predict the Kaggle test set with the best estimator and write the submission.
dpredictions = best_dclf.predict(test_data)
my_first_submission = pd.DataFrame({
    # Earlier preprocessing of test_data may have upcast PassengerId to float;
    # cast so the ids are written as integers (1, not 1.0).
    "PassengerId": test_data["PassengerId"].astype(int),
    "Survived": dpredictions,
})
my_first_submission.to_csv("my_first_submission.csv", index=False)

In [None]:
# Number of predictions made for the submission (one per row of test_data).
len(dpredictions)