In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# Load the training features; the first CSV column is used as the row index.
df = pd.read_csv('titanic_train_feature.csv', index_col=0)

In [7]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Family,Age_low,Age_middle,Age_high,Fare_log
0,1,0,3,0,22.0,7.25,0,0,1,False,True,False,2.110213
1,2,1,1,1,38.0,71.2833,1,2,1,False,True,False,4.280593
2,3,1,3,1,26.0,7.925,0,1,0,False,True,False,2.188856
3,4,1,1,1,35.0,53.1,0,2,1,False,True,False,3.990834
4,5,0,3,0,35.0,8.05,0,0,0,False,True,False,2.202765


In [8]:
# (rows, columns) of the training frame.
df.shape

(891, 13)

## 학습과 예측에 사용할 데이터셋 만들기

In [9]:
# List the available column names before choosing the model inputs.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
       'Title', 'Family', 'Age_low', 'Age_middle', 'Age_high', 'Fare_log'],
      dtype='object')

In [10]:
# Columns used as model inputs. PassengerId, the Survived target and the raw
# Age / Fare columns are left out; Age_low / Age_middle / Age_high and
# Fare_log are included.
feature_names = ['Pclass', 'Sex', 'Embarked', 'Title', 'Family',
                 'Age_low', 'Age_middle', 'Age_high', 'Fare_log']
X = df[feature_names]
X.shape

(891, 9)

In [11]:
# Target vector: the Survived column of the training frame.
y = df.loc[:, 'Survived']
y.shape

(891,)

In [14]:
# Train on the full labelled set: X / y are used as-is, no hold-out split is taken here.
X_train, y_train = X, y

In [15]:
# Check that the train-set features and labels have the same number of rows.
# NOTE(review): the original comment said a different dataset is fetched on each
# run, but X_train / y_train are assigned straight from X / y, so nothing is resampled here.

X_train.shape, y_train.shape

((891, 9), (891,))

In [17]:
# Load the test-set features and keep the same nine input columns as the training matrix.
# NOTE(review): unlike the training CSV, this file is read without index_col=0;
# harmless here because the columns are selected by name - confirm the file layout.
feature_names = ['Pclass', 'Sex', 'Embarked', 'Title', 'Family',
                 'Age_low', 'Age_middle', 'Age_high', 'Fare_log']
test_df = pd.read_csv('titanic_test_feature.csv')
X_test = test_df.loc[:, feature_names]
print(X_test.shape)

(418, 9)


In [18]:
# Labels that the test-set predictions are scored against.
# NOTE(review): on Kaggle, gender_submission.csv is normally the *sample submission*
# (a gender-based baseline prediction), not ground-truth survival - confirm this
# file really holds true labels before reading test scores as real accuracy.
y_answer = pd.read_csv('gender_submission.csv')
y_test = y_answer.loc[:, 'Survived']
y_test

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [19]:
# Check that the test-set features and labels have the same number of rows.

X_test.shape, y_test.shape

((418, 9), (418,))

## 여러 개의 알고리즘을 사용해서 비교하기 

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate tree-based models to compare; each is built with the same fixed
# random_state so the estimators themselves are deterministic.
estimators = [
    model_cls(random_state=42)
    for model_cls in (DecisionTreeClassifier,
                      RandomForestClassifier,
                      GradientBoostingClassifier)
]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [23]:
# Ten candidate tree depths, integers sampled from [2, 20).
# The search cell further down draws this array again before using it.
max_depth = np.random.randint(low=2, high=20, size=10)

In [24]:
# Ten candidate feature fractions, floats sampled uniformly from [0.3, 1.0).
# The search cell further down draws this array again before using it.
max_features = np.random.uniform(low=0.3, high=1.0, size=10)

In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed so that the sampled candidate values, the candidates picked by
# RandomizedSearchCV, and therefore the reported scores are identical on every
# "Restart Kernel -> Run All".
SEED = 42
rng = np.random.RandomState(SEED)

# Candidate values shared by every estimator in the comparison.
max_depth = rng.randint(2, 20, 10)         # 10 integers drawn from [2, 20)
max_features = rng.uniform(0.3, 1.0, 10)   # 10 floats drawn from [0.3, 1.0)
base_param_distributions = {'max_depth': max_depth,
                            'max_features': max_features}

# One row per estimator: [name, best params, CV score, test score, cv_results_].
results = []
for estimator in estimators:
    # Build a fresh search space for each estimator instead of mutating one
    # shared dict: otherwise 'n_estimators' added for one model leaks into the
    # next one and fails for any estimator that does not accept it, making the
    # loop depend on the order of `estimators`.
    param_distributions = dict(base_param_distributions)
    if 'n_estimators' in estimator.get_params():
        # DecisionTreeClassifier has no n_estimators parameter; the two
        # ensemble models do, so only they get this extra dimension.
        param_distributions['n_estimators'] = rng.randint(100, 200, 10)

    clf = RandomizedSearchCV(estimator,
                             param_distributions=param_distributions,
                             n_iter=100,
                             scoring='accuracy',
                             n_jobs=-1, cv=5, verbose=2,
                             random_state=SEED)
    clf.fit(X_train, y_train)

    results.append([
        estimator.__class__.__name__,
        clf.best_params_,
        clf.best_score_,            # mean cross-validated accuracy of the best candidate
        clf.score(X_test, y_test),  # accuracy of the refit best model on the test set
        clf.cv_results_,
    ])
                         

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   15.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished


In [26]:
# One row per estimator. The 'train_score' column is filled from
# RandomizedSearchCV.best_score_, i.e. the mean cross-validation score of the
# best candidate, not accuracy measured on the full training set.
result_columns = ['estimator', 'best_params', 'train_score', 'test_score', 'cv_result']
pd.DataFrame(results, columns=result_columns)

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.5971479266207025, 'max_dept...",0.828297,0.861244,"{'mean_fit_time': [0.014359521865844726, 0.012..."
1,RandomForestClassifier,"{'n_estimators': 106, 'max_features': 0.702220...",0.837267,0.870813,"{'mean_fit_time': [0.9097659111022949, 0.88901..."
2,GradientBoostingClassifier,"{'n_estimators': 192, 'max_features': 0.432007...",0.844015,0.868421,"{'mean_fit_time': [1.092477560043335, 1.302914..."
