In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

In [3]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [30]:
data.Age.fillna(data['Age'].mean(), inplace=True)
data['Cabin'].fillna('N', inplace=True)
data['Embarked'].fillna('N', inplace=True)

print('# of null', data.isnull().sum().sum())

# of null 0


In [2]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [12]:
def encode_feature(df) :
    features = ['Cabin','Sex','Embarked']
    
    for feature in features :
        le = preprocessing.LabelEncoder()
        le = le.fit(df.loc[:,feature])
        df.loc[:,feature] = le.transform(df.loc[:,feature])
        
    return(df)

In [21]:
data = encode_feature(data)
data.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0


In [13]:
def titanic_fillna(df) :
    df.Age.fillna(data['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return(df)

In [14]:
def drop_feature(df) :
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

In [15]:
# 전처리 기본 2 : lambda 이용해서 Cabin의 값 1글자만 사용
def format_features(df) :
    df.loc[:,'Cabin'] = df.loc[:,'Cabin'].apply(lambda x : str(x)[:1])
    
    features = ['Cabin','Sex','Embarked']
    for feature in features :
        le = preprocessing.LabelEncoder()
        le = le.fit(df.loc[:,feature])
        df.loc[:,feature] = le.transform(df.loc[:,feature])
        
    return(df)
    

In [16]:
def titanic_transform(df) :
    df = titanic_fillna(df)
    df = drop_feature(df)
    df = format_features(df)
    return(df)

In [10]:
data_train = titanic_transform(data)

In [11]:
y_titanic_train = data_train.loc[:,'Survived']
X_titanic_train = data_train.drop('Survived',axis=1)

In [49]:
X_titanic_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,7,3
1,1,0,38.0,1,0,71.2833,2,0


In [3]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, y_titanic_train, test_size=0.2,
                                                    random_state=11)

In [52]:
y_titanic_train.value_counts() / len(y_titanic_train)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [53]:
y_train.value_counts()/len(y_train)

0    0.605337
1    0.394663
Name: Survived, dtype: float64

In [54]:
y_test.value_counts()/len(y_test)

0    0.659218
1    0.340782
Name: Survived, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, y_titanic_train,
                                                    stratify = y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

In [None]:
# 정답 비율 유지하게 나눠서 해줌
# 타겟뿐만 아니라 피쳐에 대해서만 가능함

In [56]:
y_train.value_counts()/len(y_train) # 엇비슷하게 짤라줌

0    0.616573
1    0.383427
Name: Survived, dtype: float64

In [57]:
y_test.value_counts()/len(y_test)

0    0.614525
1    0.385475
Name: Survived, dtype: float64

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score

In [7]:
from sklearn.svm import SVC

In [69]:
knn = KNeighborsClassifier()

In [70]:
scores = cross_val_score(knn, X_train, y_train, cv=5)
for iter_count, accuracy in enumerate(scores) :
    print('knn 교차검증 {0} & 정확도 {1:.4f}'.format(iter_count, accuracy))
print('---------------------------------------------------------')
print('knn 교차검증 평균 정확도 : {0:.4f}'.format(scores.mean()))

knn 교차검증 0 & 정확도 0.7413
knn 교차검증 1 & 정확도 0.6853
knn 교차검증 2 & 정확도 0.7483
knn 교차검증 3 & 정확도 0.7676
knn 교차검증 4 & 정확도 0.7376
---------------------------------------------------------
knn 교차검증 평균 정확도 : 0.7360


In [None]:
# base 라인 확인

In [71]:
parameters = {
    'n_neighbors':[1,3,5,7],
    'weights':['uniform']
}

In [72]:
n_iter_search=10
knn_rgs = RandomizedSearchCV(knn, param_distributions=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
knn_rgs.fit(X_train, y_train)

# verbos = 로그를 어느단계까지 찍나

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 7 folds for each of 4 candidates, totalling 28 fits


[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:    2.0s finished


RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'n_neighbors': [1, 3, 5, 7],
                                        'weights': ['uniform']},
                   pre_dispatch='2*n_jobs', random_state=1234, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=1)

In [73]:
# best 모델 기준으로 가장 베스트 파라미터값에 대해서 확인해야함

knn_rgs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [74]:
parameters = {
    'n_neighbors':[3,5,7],
    'weights':['uniform']
}

In [75]:
n_iter_search=10
grid_knn_clf = GridSearchCV(knn, param_grid=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1)

grid_knn_clf.fit(X_train, y_train)

# verbos = 로그를 어느단계까지 찍나

GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [3, 5, 7], 'weights': ['uniform']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [76]:
grid_knn_clf.best_params_
# 지정했던 것에 대한 정보만 줌

{'n_neighbors': 5, 'weights': 'uniform'}

In [77]:
grid_knn_clf.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [78]:
grid_knn_clf.best_score_

0.7289325842696629

In [79]:
# 학습과정이 끝난 애를 기준으로 knn_best에다가 저장함
knn_best = grid_knn_clf.best_estimator_

knn_predictions = knn_best.predict(X_test)
accuracy = accuracy_score(y_test,knn_predictions)
print('knn에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

knn에서 GridSearchCV로 찾은 결과 : 0.7318


In [None]:
# 최적화를 했기때문에 hyper parameter tunning을 완료했음.

In [45]:
X_train.shape

(712, 8)

In [None]:
# RandomForest 했을때 

In [26]:
rfc = RandomForestClassifier()

In [84]:
scores = cross_val_score(rfc, X_train, y_train, cv=5)
for iter_count, accuracy in enumerate(scores) :
    print('rfc 교차검증 {0} & 정확도 {1:.4f}'.format(iter_count, accuracy))
print('---------------------------------------------------------')
print('rfc 교차검증 평균 정확도 : {0:.4f}'.format(scores.mean()))

rfc 교차검증 0 & 정확도 0.7972
rfc 교차검증 1 & 정확도 0.7483
rfc 교차검증 2 & 정확도 0.8252
rfc 교차검증 3 & 정확도 0.8028
rfc 교차검증 4 & 정확도 0.7872
---------------------------------------------------------
rfc 교차검증 평균 정확도 : 0.7921




In [60]:
parameters = {
    'n_estimators':np.arange(10,110,10),
    'max_features':np.arange(1,8),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(2,15)
}

In [61]:
n_iter_search=10
rfc_rgs = RandomizedSearchCV(rfc, param_distributions=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
rfc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.5s finished


RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [62]:
rfc_rgs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=4, max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [63]:
parameters = {
    'n_estimators':np.arange(40,60),
    'max_features':np.arange(1,3),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(10,15)
}

In [64]:
n_iter_search=10
grid_rfc_clf = GridSearchCV(rfc, param_grid=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1)

grid_rfc_clf.fit(X_train, y_train)

GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=

In [65]:
grid_rfc_clf.best_params_

{'max_features': 2,
 'max_leaf_nodes': 14,
 'min_samples_split': 2,
 'n_estimators': 43}

In [66]:
grid_rfc_clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=14,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=43,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
rfc_best = grid_rfc_clf.best_estimator_

rfc_predictions = rfc_best.predict(X_test)
accuracy = accuracy_score(y_test,rfc_predictions)
print('rfc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

rfc에서 GridSearchCV로 찾은 결과 : 0.8380


In [19]:
svc = SVC()

In [20]:
scores = cross_val_score(svc, X_train, y_train, cv=5)
for iter_count, accuracy in enumerate(scores) :
    print('svc 교차검증 {0} & 정확도 {1:.4f}'.format(iter_count, accuracy))
print('---------------------------------------------------------')
print('svc 교차검증 평균 정확도 : {0:.4f}'.format(scores.mean()))

svc 교차검증 0 & 정확도 0.7133
svc 교차검증 1 & 정확도 0.6573
svc 교차검증 2 & 정확도 0.6713
svc 교차검증 3 & 정확도 0.7183
svc 교차검증 4 & 정확도 0.7234
---------------------------------------------------------
svc 교차검증 평균 정확도 : 0.6967




In [34]:
parameters = {
    'C':np.arange(1,5),
    'kernel':['rbf','poly','linear'],
    'degree':np.arange(2,5),
    'gamma': [0.01,0.1,1,10]
}

In [35]:
n_iter_search=10
svc_rgs = RandomizedSearchCV(svc, param_distributions=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
svc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed: 48.3min finished


RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'C': array([1, 2, 3, 4]),
                                        'degree': array([2, 3, 4]),
                                        'gamma': [0.01, 0.1, 1, 10],
                                        'kernel': ['rbf', 'poly', 'linear']},
                   pre_dispatch='2*n_jobs', random_state=1234, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=1)

In [36]:
svc_rgs.best_estimator_

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma=0.1, kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
parameters = {
    'C':np.arange(2,3),
    'kernel':['poly'],
    'degree':np.arange(2,3),
    'gamma': np.arange(0.09,0.12,0.01)
}
# 가짓수 20개 -> 느려서 조정 3개

In [22]:
n_iter_search=10
grid_svc_clf = GridSearchCV(svc, param_grid=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1)

grid_svc_clf.fit(X_train, y_train)


GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([2]), 'degree': array([2]),
                         'gamma': array([0.09, 0.1 , 0.11]),
                         'kernel': ['poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [23]:
svc_best = grid_svc_clf.best_estimator_

svc_predictions = svc_best.predict(X_test)
accuracy = accuracy_score(y_test,svc_predictions)
print('svc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

svc에서 GridSearchCV로 찾은 결과 : 0.8212


In [None]:
# model 저장 및 로드

In [68]:
from sklearn.externals import joblib



In [80]:
# model 저장하는 것
# 확장자는 다른걸로 있음.
joblib.dump(rfc_best, 'rfc_best_model.pkl')

['rfc_best_model.pkl']

In [90]:
rf_best_p = joblib.load('rfc_best_model.pkl')

In [91]:
rf_best_p

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=14,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=43,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [84]:
from joblib import dump, load

In [85]:
dump(rfc_best, 'rfc_best.pkl')

['rfc_best.pkl']

In [87]:
rf_best_p2 = load('rfc_best.pkl')

In [92]:
rf_best_p == rf_best_p2

False

In [95]:
dump(svc_best,'svm_best.pkl')
dump(knn_best,'knn_best.pkl')

['knn_best.pkl']

In [None]:
# 다른걸로 표시되나 다른오브젝트느낌인가봄, 파라미터가 똑같음

In [93]:
print(X_test.shape)

print((rf_best_p.predict(X_test)==rf_best_p2.predict(X_test)).sum())

(179, 8)
179


In [None]:
# voting - 

In [100]:
from sklearn.ensemble import VotingClassifier

In [None]:

svc_predictions = svc_best.predict(X_test)
accuracy_svm = accuracy_score(y_test,svc_predictions)

print('svc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

In [96]:
accuracy_rf = accuracy_score(y_test, rfc_best.predict(X_test))
accuracy_sv = accuracy_score(y_test, svc_best.predict(X_test))
accuracy_kn = accuracy_score(y_test, knn_best.predict(X_test))

In [97]:
print(accuracy_rf,accuracy_sv,accuracy_kn)

0.8379888268156425 0.8212290502793296 0.7318435754189944


In [None]:
# 결과를 보는게 문제가 생겨서

In [102]:
eclf = VotingClassifier(estimators= [('knn',knn_best), ('rfc',rfc_best), ('svm',svc_best)],
                       voting='hard')
# voting = 'hard' (voting별로 같은 가중치)

eclf.fit(X_train, y_train)
eclf_pred = eclf.predict(X_test)
accuracy = accuracy_score(y_test, eclf_pred)
print('Hard Voting 에서로 찾은 결과 : {0:.4f}'.format(accuracy))

Hard Voting 에서로 찾은 결과 : 0.8547


In [138]:
rfc_best.feature_importances_

array([0.13773857, 0.41805803, 0.09565523, 0.04538915, 0.03591015,
       0.14470896, 0.088098  , 0.03444192])

In [140]:
feature_importance = pd.DataFrame({'feature':X_train.columns, 'importance':rfc_best.feature_importances_})

feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,Sex,0.418058
5,Fare,0.144709
0,Pclass,0.137739
2,Age,0.095655
6,Cabin,0.088098
3,SibSp,0.045389
4,Parch,0.03591
7,Embarked,0.034442


In [104]:
x_test_model_total = X_test.copy(deep=True)
x_test_model_total['knnc'] = knn_best.predict(X_test)
x_test_model_total['rfc'] = rfc_best.predict(X_test)
x_test_model_total['svm'] = svc_best.predict(X_test)

In [105]:
x_test_model_total.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,knnc,rfc,svm
212,3,1,22.0,0,0,7.25,7,3,0,0,0
456,1,1,65.0,0,0,26.55,4,3,0,0,0


In [106]:
x_test_model_total.index

Int64Index([212, 456, 557, 763, 682,  44, 605, 774, 632, 645,
            ...
            788, 823, 535, 830, 693, 119,   4, 352, 499,  37],
           dtype='int64', length=179)

In [107]:
x_test_model_total['voting'] = x_test_model_total.apply(lambda x : 1 if x['knnc']+x['rfc']+x['svm']>=2 else 0)

KeyError: ('knnc', 'occurred at index Pclass')

In [119]:
def voting(df) :
    voting = np.array([])
    for idx in df.index :
        if df.at[idx,'knnc'] + df.at[idx,'rfc'] + df.at[idx,'svm'] >= 2:
            voting = np.append(voting, 1)
        else :
            voting = np.append(voting, 0)
    return(voting.astype('int'))

In [120]:
x_test_model_total['voting'] = voting(x_test_model_total)

In [8]:
import collections

In [126]:
collections.Counter([1,0,0,0]).most_common()[0]
# 카운트 수가 적은거부터 나옴

(0, 3)

In [None]:
# 아래는 어플라이용으로 만드는 함수

In [127]:
def vote_func(x, best_model_idx=0) :
    p = collectionsCounter(x)
    
    if p.most_common()[0][1] >=2 :
        return(p.most_common()[0][0])
    else :
        print('Check!! Voting Result!!')
        return(x[best_model_idx])

In [None]:
x_test_model_total['ensemble'] = x_test_model_total.loc[:,['knnc','rfc','svm']].apply(lambda x : vote_func(x),axis=1)

In [128]:
x_test_model_total['y_real'] = y_test

In [132]:
x_test_model_total.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,knnc,rfc,svm,voting,y_real
212,3,1,22.0,0,0,7.25,7,3,0,0,0,0,0
456,1,1,65.0,0,0,26.55,4,3,0,0,0,0,0
557,1,1,29.699118,0,0,227.525,7,0,1,0,1,1,0
763,1,0,36.0,1,2,120.0,1,3,1,1,1,1,1
682,3,1,20.0,0,0,9.225,7,3,0,0,0,0,0


In [131]:
show_col_list = ['knnc','rfc','svm','voting','y_real']

x_test_model_total.loc[:,show_col_list]

Unnamed: 0,knnc,rfc,svm,voting,y_real
212,0,0,0,0,0
456,0,0,0,0,0
557,1,0,1,1,0
763,1,1,1,1,1
682,0,0,0,0,0
...,...,...,...,...,...
119,0,0,0,0,0
4,0,0,0,0,0
352,1,0,0,0,0
499,0,0,0,0,0


In [133]:
accuracy_score(x_test_model_total['y_real'], x_test_model_total['voting'])

0.8435754189944135

In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [10]:
data = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')



In [17]:
data_train = titanic_transform(data)

In [18]:
y_titanic_train = data_train.loc[:,'Survived']
X_titanic_train = data_train.drop('Survived',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, y_titanic_train,
                                                    stratify = y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

In [147]:
data.shape

(891, 9)

In [141]:
feature_importance = pd.DataFrame({'feature':X_train.columns, 'importance':rfc_best.feature_importances_})

feature_importance.sort_values(by='importance', ascending=False)

# 상위 6개로 중요도가 90가넘어감

Unnamed: 0,feature,importance
1,Sex,0.418058
5,Fare,0.144709
0,Pclass,0.137739
2,Age,0.095655
6,Cabin,0.088098
3,SibSp,0.045389
4,Parch,0.03591
7,Embarked,0.034442


In [19]:
# 칼럼이 너무 적을땐 할 필요 없음
# normalize 하면 특징이 사라져서 오히려 안될 수 있음

num_comp = 6 #여러 칼럼중에 최대 몇개까지 쓸 칼럼수
pca = PCA(n_components = num_comp)

train_ext = pca.fit_transform(X_train) 

test_ext = pca.transform(X_test)

min_max_scaler = MinMaxScaler()

train_norm = min_max_scaler.fit_transform(train_ext)
test_norm = min_max_scaler.fit_transform(test_ext)




In [20]:
# knn 모델선언
knn = KNeighborsClassifier()


In [37]:
parameters = {
    'n_neighbors':[1,3,5,7],
    'weights':['uniform','distance'],
    'p':[1,2,3,4]
}
n_iter_search=10
knn_rgs = RandomizedSearchCV(knn, param_distributions=parameters, cv=10,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
knn_rgs.fit(X_train, y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'n_neighbors': [1, 3, 5, 7],
                                        'p': [1, 2, 3, 4],
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=1234, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=1)

In [38]:
knn_rgs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [39]:
parameters = {
    'n_neighbors':[3,5,7],
    'weights':['uniform','distance'],
    'p':[1,2,3]
}
n_iter_search=10
grid_knn_clf = GridSearchCV(knn, param_grid=parameters, cv=10,
                            scoring='accuracy', n_jobs=-1)

grid_knn_clf.fit(X_train, y_train)





GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [3, 5, 7], 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [40]:
# 학습과정이 끝난 애를 기준으로 knn_best에다가 저장함
knn_best = grid_knn_clf.best_estimator_

knn_predictions = knn_best.predict(X_test)
accuracy = accuracy_score(y_test,knn_predictions)
print('knn에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

knn에서 GridSearchCV로 찾은 결과 : 0.7207


In [184]:
dump(knn_best,'knn_best_pca.pkl')

['knn_best_pca.pkl']

In [27]:
# random forest 모델선언
rfc = RandomForestClassifier()


In [28]:
parameters = {
    'n_estimators':np.arange(10,110,10),
    'max_features':np.arange(1,6),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(2,15)
}
n_iter_search=10
rfc_rgs = RandomizedSearchCV(rfc, param_distributions=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
rfc_rgs.fit(X_train, y_train)


Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.6s finished


RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [29]:
rfc_rgs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=11,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [30]:
parameters = {
    'n_estimators':np.arange(90,101),
    'max_features':np.arange(1,6),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(10,16)
}
n_iter_search=10
grid_rfc_clf = GridSearchCV(rfc, param_grid=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1)

grid_rfc_clf.fit(X_train, y_train)


GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [31]:
grid_rfc_clf.best_params_

{'max_features': 3,
 'max_leaf_nodes': 15,
 'min_samples_split': 3,
 'n_estimators': 90}

In [32]:
rfc_best = grid_rfc_clf.best_estimator_

rfc_predictions = rfc_best.predict(X_test)
accuracy = accuracy_score(y_test,rfc_predictions)
print('rfc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

rfc에서 GridSearchCV로 찾은 결과 : 0.7821


In [194]:
dump(rfc_best,'rfc_best_pca.pkl')

['rfc_best_pca.pkl']

In [None]:
svc = SCV()

In [None]:
parameters = {
    'C':np.arange(1,5),
    'kernel':['rbf','poly','linear'],
    'degree':np.arange(2,5),
    'gamma': [0.01,0.1,1,10]
}

n_iter_search=10
svc_rgs = RandomizedSearchCV(svc, param_distributions=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
svc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
svc_rgs.best_estimator_

In [None]:
parameters = {
    'C':np.arange(2,3),
    'kernel':['poly'],
    'degree':np.arange(2,3),
    'gamma': np.arange(0.09,0.12,0.01)
}
# 가짓수 20개 -> 느려서 조정 3개

n_iter_search=10
grid_svc_clf = GridSearchCV(svc, param_grid=parameters, cv=7,
                            scoring='accuracy', n_jobs=-1)

grid_svc_clf.fit(X_train, y_train)
