In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
data_train = pd.read_csv('input/train.csv')

print(data_train.shape)
data_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


去除“无用”的列，标称型数据转换。

In [3]:
m = {'Sex': {'male': 0, 'female': 1}, 'Embarked': {'S': 0, 'C': 1, 'Q': 2}}

data_train_basic = data_train.drop(['Name', 'Ticket', 'Cabin'], axis=1)
data_train_basic = data_train_basic.replace(m)

data_train_basic = data_train_basic.dropna(axis=0, how='any')

print(data_train_basic.shape)
data_train_basic.head()

(712, 9)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.0,1,0,7.25,0.0
1,2,1,1,1,38.0,1,0,71.2833,1.0
2,3,1,3,1,26.0,0,0,7.925,0.0
3,4,1,1,1,35.0,1,0,53.1,0.0
4,5,0,3,0,35.0,0,0,8.05,0.0


In [4]:
X = data_train_basic.drop('Survived', axis=1).values
y = data_train_basic.loc[:, 'Survived'].values

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (712, 8)
y shape:  (712,)


逻辑回归，通过GridSearchCV选择参数

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

logistic = LogisticRegression()
estimator = GridSearchCV(logistic, {'C':[1, 10, 15, 20, 50]})
estimator.fit(X, y)

print('GridSearchCV result:')
for key in estimator.cv_results_:
    print(key, ': ', estimator.cv_results_[key], sep='')

print()
print('best estimator:')
print(estimator.best_estimator_)

GridSearchCV result:
split1_test_score: [ 0.78481013  0.78902954  0.78902954  0.78902954  0.78902954]
split0_train_score: [ 0.80590717  0.80801688  0.80590717  0.80801688  0.80590717]
param_C: [1 10 15 20 50]
split0_test_score: [ 0.79411765  0.79831933  0.80252101  0.79831933  0.79831933]
std_test_score: [ 0.00565429  0.00564632  0.00755824  0.00564632  0.00724322]
split2_test_score: [ 0.78059072  0.78481013  0.78481013  0.78481013  0.78059072]
std_fit_time: [  3.29838804e-03   3.67819421e-05   3.82919009e-05   1.92017054e-04
   8.63203311e-05]
mean_train_score: [ 0.79565327  0.8040758   0.80267081  0.8040758   0.80337257]
mean_test_score: [ 0.78651685  0.79073034  0.79213483  0.79073034  0.78932584]
split1_train_score: [ 0.78947368  0.80210526  0.80210526  0.80421053  0.80421053]
std_train_score: [ 0.00730137  0.00278676  0.00244453  0.00327426  0.00248332]
rank_test_score: [5 2 1 2 4]
mean_score_time: [ 0.00048963  0.00027061  0.00032036  0.00024597  0.00021799]
params: ({'C': 1}, {'

交叉验证cross_val_score

In [6]:
from sklearn.model_selection import cross_val_score

logistic = LogisticRegression(C=15)
score = cross_val_score(logistic, X, y, cv=5)
score

array([ 0.74825175,  0.82517483,  0.76923077,  0.76056338,  0.79432624])

决策树

In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
estimator = GridSearchCV(dt, {'criterion':['gini', 'entropy'], 'max_features':['sqrt', 'log2'], 'random_state':[None]})
estimator.fit(X, y)

print('GridSearchCV result:')
for key in estimator.cv_results_:
    print(key, ': ', estimator.cv_results_[key], sep='')

print()
print('best estimator:')
print(estimator.best_estimator_)

GridSearchCV result:
split1_test_score: [ 0.73417722  0.57383966  0.60337553  0.69198312]
split0_train_score: [ 1.  1.  1.  1.]
params: ({'criterion': 'gini', 'random_state': None, 'max_features': 'sqrt'}, {'criterion': 'gini', 'random_state': None, 'max_features': 'log2'}, {'criterion': 'entropy', 'random_state': None, 'max_features': 'sqrt'}, {'criterion': 'entropy', 'random_state': None, 'max_features': 'log2'})
split0_test_score: [ 0.70168067  0.61764706  0.67226891  0.7394958 ]
std_test_score: [ 0.03657508  0.04323985  0.02836692  0.05033406]
split2_test_score: [ 0.64556962  0.67932489  0.64556962  0.81434599]
std_fit_time: [  3.80327249e-05   1.63417574e-04   7.60866203e-05   4.11289410e-04]
mean_train_score: [ 1.  1.  1.  1.]
param_criterion: ['gini' 'gini' 'entropy' 'entropy']
mean_test_score: [ 0.69382022  0.62359551  0.64044944  0.74859551]
split1_train_score: [ 1.  1.  1.  1.]
std_train_score: [ 0.  0.  0.  0.]
rank_test_score: [2 4 3 1]
mean_score_time: [ 0.00028094  0.0005

In [25]:
from sklearn.model_selection import cross_val_score

dt = DecisionTreeClassifier(criterion='entropy', max_features='log2')
score = cross_val_score(dt, X, y, cv=5)
score

array([ 0.6013986 ,  0.76223776,  0.69230769,  0.79577465,  0.81560284])

SVM

In [27]:
from sklearn.svm import LinearSVC

svc = LinearSVC()
estimator = GridSearchCV(svc, {'C':[0.1, 1, 10]})
estimator.fit(X, y)

print('GridSearchCV result:')
for key in estimator.cv_results_:
    print(key, ': ', estimator.cv_results_[key], sep='')

print()
print('best estimator:')
print(estimator.best_estimator_)

GridSearchCV result:
split1_test_score: [ 0.64135021  0.7257384   0.77637131]
split0_train_score: [ 0.40506329  0.66244726  0.66666667]
param_C: [0.1 1 10]
split0_test_score: [ 0.6302521   0.68907563  0.68487395]
std_test_score: [ 0.01977774  0.04690109  0.07401925]
split2_test_score: [ 0.59493671  0.80168776  0.59493671]
std_fit_time: [ 0.00245065  0.00146884  0.00016433]
mean_train_score: [ 0.60379303  0.72537716  0.70152047]
mean_test_score: [ 0.62219101  0.73876404  0.68539326]
split1_train_score: [ 0.68631579  0.72631579  0.8       ]
std_train_score: [ 0.1411944   0.05100317  0.07061926]
rank_test_score: [3 1 2]
mean_score_time: [ 0.0007844   0.00031964  0.00028841]
params: ({'C': 0.1}, {'C': 1}, {'C': 10})
split2_train_score: [ 0.72        0.78736842  0.63789474]
std_score_time: [  3.56974163e-04   3.33975178e-06   1.88097334e-05]
mean_fit_time: [ 0.02749801  0.02214766  0.01925858]

best estimator:
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_s

In [28]:
from sklearn.model_selection import cross_val_score

svc = LinearSVC(C=1)
score = cross_val_score(svc, X, y, cv=5)
score

array([ 0.63636364,  0.74825175,  0.67132867,  0.59859155,  0.72340426])

In [30]:
from sklearn.svm import SVC

svc = SVC()
estimator = GridSearchCV(svc, {'C':[0.1, 1, 10]})
estimator.fit(X, y)

print('GridSearchCV result:')
for key in estimator.cv_results_:
    print(key, ': ', estimator.cv_results_[key], sep='')

print()
print('best estimator:')
print(estimator.best_estimator_)

GridSearchCV result:
split1_test_score: [ 0.59493671  0.58649789  0.57805907]
split0_train_score: [ 0.59493671  1.          1.        ]
param_C: [0.1 1 10]
split0_test_score: [ 0.59663866  0.59663866  0.59663866]
std_test_score: [ 0.00080287  0.00443338  0.00838411]
split2_test_score: [ 0.59493671  0.59493671  0.59493671]
std_fit_time: [ 0.00398653  0.00125181  0.00037104]
mean_train_score: [ 0.59550522  1.          1.        ]
mean_test_score: [ 0.59550562  0.59269663  0.58988764]
split1_train_score: [ 0.59578947  1.          1.        ]
std_train_score: [ 0.000402  0.        0.      ]
rank_test_score: [1 2 3]
mean_score_time: [ 0.00313775  0.00228906  0.00201567]
params: ({'C': 0.1}, {'C': 1}, {'C': 10})
split2_train_score: [ 0.59578947  1.          1.        ]
std_score_time: [  6.27365057e-04   3.31466369e-04   1.15828796e-05]
mean_fit_time: [ 0.01238529  0.0093116   0.00955502]

best estimator:
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None

In [33]:
from sklearn.model_selection import cross_val_score

svc = SVC(C=0.1)
score = cross_val_score(svc, X, y, cv=5)
score

array([ 0.59440559,  0.59440559,  0.59440559,  0.59859155,  0.59574468])

In [37]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
estimator = GridSearchCV(rfc, {'n_estimators':[5, 10, 20], 'criterion':['gini', 'entropy']})
estimator.fit(X, y)

print('GridSearchCV result:')
for key in estimator.cv_results_:
    print(key, ': ', estimator.cv_results_[key], sep='')

print()
print('best estimator:')
print(estimator.best_estimator_)

GridSearchCV result:
split1_test_score: [ 0.75949367  0.78902954  0.76371308  0.70886076  0.76793249  0.76371308]
split0_train_score: [ 0.97257384  0.98523207  0.9978903   0.97046414  1.          0.99578059]
params: ({'criterion': 'gini', 'n_estimators': 5}, {'criterion': 'gini', 'n_estimators': 10}, {'criterion': 'gini', 'n_estimators': 20}, {'criterion': 'entropy', 'n_estimators': 5}, {'criterion': 'entropy', 'n_estimators': 10}, {'criterion': 'entropy', 'n_estimators': 20})
param_n_estimators: [5 10 20 5 10 20]
split0_test_score: [ 0.74789916  0.62605042  0.65546218  0.71848739  0.66806723  0.72268908]
std_test_score: [ 0.00990747  0.08003531  0.07086535  0.03374707  0.05540263  0.03398631]
split2_test_score: [ 0.7721519   0.80168776  0.82700422  0.78481013  0.79746835  0.80590717]
std_fit_time: [ 0.0005407   0.00567891  0.0013495   0.00109993  0.00036423  0.00020458]
mean_train_score: [ 0.96770005  0.98314753  0.99438448  0.97261085  0.99298246  0.99578651]
param_criterion: ['gini'

In [47]:
from sklearn.model_selection import cross_val_score

rfc = RandomForestClassifier(n_estimators=10, criterion='gini')
score = cross_val_score(rfc, X, y, cv=5)
score

array([ 0.6013986 ,  0.8041958 ,  0.79020979,  0.8028169 ,  0.82978723])