# To improve Titanic accuracy
### --By Jiancheng
Work on [Kaggle Titanic](https://www.kaggle.com/c/titanic)

Start on 2016/03/25

# Summary:
1. Data processing module changes
1. Testing separate model on SVC, GBC, etc
1. Voting test

In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [2]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')

In [25]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [4]:
raw_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Add new features into data processing module:
1. Delete 'Family' features, replace with the original features 'SibSp' and 'Parch'
1. Normalizing the features on the training and testing data set
1. Delete 'Title feature'

In [5]:
def process_data(in_df, mean_train = None, std_train = None, training = False):
    df = in_df.copy()

    age_average = {' Major': 48.5, ' the Countess': 33.0, ' Don': 40.0, ' Sir': 49.0, ' Miss': 21.773972602739725, 
                   ' Mlle': 24.0, ' Mrs': 35.898148148148145, ' Capt': 70.0, ' Rev': 43.166666666666664, ' Dr': 42.0, 
                   ' Master': 4.5741666666666667, ' Mr': 32.368090452261306, ' Ms': 28.0, ' Jonkheer': 38.0, 
                   ' Col': 58.0, ' Lady': 48.0, ' Mme': 24.0, ' Dona': 39}
    title_convert = {' Major': 'Army', ' the Countess': 'Upper', ' Don': 'Mr', ' Miss': 'Miss', ' Sir': 'Upper', ' Mlle': 'Upper', 
                        ' Mrs': 'Mrs', ' Capt': 'Upper', ' Rev': 'Rev', ' Dr': 'Dr', ' Master': 'Master', ' Mr': 'Mr', ' Ms': 'Miss', 
                        ' Jonkheer': 'Upper', ' Col': 'Army', ' Lady': 'Upper', ' Mme': 'Upper', ' Dona': 'Upper'}
    
    # feature transformation
#     df['Family'] = df['SibSp'] + df['Parch']
    df['orgTitle'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0]) # extract "Title" from "Name"
    df['Title'] = df['orgTitle'].map(lambda x: title_convert[x]) # then also merge some rare Title into commom ones 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    df['Cabin'] = df['Cabin'].map(lambda x: x if x != 'T' else 'n')
    df['Sex'] = df['Sex'].map(lambda x: 0 if x == 'male' else 1) # male: 0 female: 1
    
    # deal with NaN and 0
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.fillna(g.mean())) # the average Pclass fare
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.replace(0, g.mean())) # the average Pclass fare
    df['Embarked'] = df['Embarked'].fillna('n')  # the most frequent item
    df['Age'] = df['Age'].groupby(df['orgTitle']).apply(lambda g: g.fillna(age_average[g.name])) # average age of Title
    
    # normalization
    if training:
        mean_train = df[['Age','SibSp','Parch','Fare']].mean()
        std_train = df[['Age','SibSp','Parch','Fare']].std()
        
    df[['Age','SibSp','Parch','Fare']]= (df[['Age','SibSp','Parch','Fare']]- mean_train) / std_train
    
    
    # transfer category feature into dummy feature   
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Pclass'], prefix='Pclass')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)
    
    # drop features we don't need 
    df = df.drop(['orgTitle'], axis = 1)
#     df = df.drop(['Embarked', 'Name', 'SibSp', 'Parch', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1)  
    df = df.drop(['Embarked', 'Name', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1) 
    return df, mean_train, std_train

In [6]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')
dtrain, mean_train, std_train = process_data(raw_dtrain, training = True)
print dtrain.isnull().sum()
dtrain.head()

Survived        0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Embarked_n      0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Cabin_A         0
Cabin_B         0
Cabin_C         0
Cabin_D         0
Cabin_E         0
Cabin_F         0
Cabin_G         0
Cabin_n         0
Title_Army      0
Title_Dr        0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
Title_Rev       0
Title_Upper     0
dtype: int64


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_n,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0,-0.584059,0.43255,-0.473408,-0.515736,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1,0.621016,0.43255,-0.473408,0.772917,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,-0.28279,-0.474279,-0.473408,-0.502152,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,1,1,0.395064,0.43255,-0.473408,0.406983,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0.395064,-0.474279,-0.473408,-0.499636,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [7]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [8]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [9]:
dtest, mean_train, std_train = process_data(raw_dtest, mean_train, std_train)

In [10]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [11]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [12]:
dtest.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0.357406,-0.474279,-0.473408,-0.50408,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1.298871,0.43255,-0.473408,-0.520767,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
2,0,2.428629,-0.474279,-0.473408,-0.466682,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
3,0,-0.207473,-0.474279,-0.473408,-0.48731,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
4,1,-0.584059,0.43255,0.767199,-0.414358,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0


In [13]:
def test_model(model, data, cv = 10):
    X = data.drop(['Survived'], axis = 1)
    y = data['Survived']
    model.fit(X, y)
    training = model.score(X, y)
    validation = cross_validation.cross_val_score(model, X, y, cv=cv).mean()
    print 'Training accuracy:\t\t\t', training
    print '%s-fold cross-validation accuracy:\t' % cv, validation
    print 'Delta(training - validation): \t\t', training - validation
    return model

In [14]:
test_model(RandomForestClassifier(n_estimators=300, max_depth=3), dtrain)

Training accuracy:			0.822671156004
10-fold cross-validation accuracy:	0.815950800136
Delta(training - validation): 		0.0067203558683


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
for depth in np.arange(1,25,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.721661054994
10-fold cross-validation accuracy:	0.724038701623
Delta(training - validation): 		-0.00237764662858
Training accuracy:			0.799102132435
10-fold cross-validation accuracy:	0.777997957099
Delta(training - validation): 		0.0211041753364
Training accuracy:			0.802469135802
10-fold cross-validation accuracy:	0.794764215186
Delta(training - validation): 		0.00770492061691
Training accuracy:			0.848484848485
10-fold cross-validation accuracy:	0.816138917263
Delta(training - validation): 		0.0323459312223
Training accuracy:			0.854096520763
10-fold cross-validation accuracy:	0.825002837362
Delta(training - validation): 		0.0290936834008
Training accuracy:			0.867564534231
10-fold cross-validation accuracy:	0.831756894791
Delta(training - validation): 		0.0358076394406
Training accuracy:			0.882154882155
10-fold cross-validation accuracy:	0.828423845194
Delta(training - validation): 		0.0537310369614
Training accuracy:			0.911335578002
10-fold cross-validatio

In [16]:
for depth in np.arange(5,17,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(ExtraTreesClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.841750841751
10-fold cross-validation accuracy:	0.811581545795
Delta(training - validation): 		0.0301692959558
Training accuracy:			0.848484848485
10-fold cross-validation accuracy:	0.819396776756
Delta(training - validation): 		0.0290880717285
Training accuracy:			0.859708193042
10-fold cross-validation accuracy:	0.820482635342
Delta(training - validation): 		0.0392255576993
Training accuracy:			0.878787878788
10-fold cross-validation accuracy:	0.821656452162
Delta(training - validation): 		0.0571314266258
Training accuracy:			0.893378226712
10-fold cross-validation accuracy:	0.823916411304
Delta(training - validation): 		0.0694618154075
Training accuracy:			0.905723905724
10-fold cross-validation accuracy:	0.822805300193
Delta(training - validation): 		0.082918605531
Training accuracy:			0.92480359147
10-fold cross-validation accuracy:	0.819434513676
Delta(training - validation): 		0.105369077794
Training accuracy:			0.937149270483
10-fold cross-validation accu

In [17]:
for depth in np.arange(5,17,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth, criterion='entropy'), dtrain)

Training accuracy:			0.859708193042
10-fold cross-validation accuracy:	0.821657019635
Delta(training - validation): 		0.038051173407
Training accuracy:			0.86531986532
10-fold cross-validation accuracy:	0.826176370446
Delta(training - validation): 		0.0391434948738
Training accuracy:			0.879910213244
10-fold cross-validation accuracy:	0.837400124844
Delta(training - validation): 		0.0425100883996
Training accuracy:			0.902356902357
10-fold cross-validation accuracy:	0.827312450346
Delta(training - validation): 		0.0750444520107
Training accuracy:			0.915824915825
10-fold cross-validation accuracy:	0.825039723073
Delta(training - validation): 		0.0907851927515
Training accuracy:			0.93265993266
10-fold cross-validation accuracy:	0.829534105096
Delta(training - validation): 		0.103125827564
Training accuracy:			0.948372615039
10-fold cross-validation accuracy:	0.821681704687
Delta(training - validation): 		0.126690910352
Training accuracy:			0.95847362514
10-fold cross-validation accurac

In [18]:
for n in np.arange(100,700,50): 
    print '=========Test on n_estimators = %s=========' % n
    test_model(GradientBoostingClassifier(n_estimators=n), dtrain)

Training accuracy:			0.890011223345
10-fold cross-validation accuracy:	0.823853705595
Delta(training - validation): 		0.0661575177493
Training accuracy:			0.907968574635
10-fold cross-validation accuracy:	0.833979117013
Delta(training - validation): 		0.0739894576224
Training accuracy:			0.927048260382
10-fold cross-validation accuracy:	0.82948473499
Delta(training - validation): 		0.0975635253912
Training accuracy:			0.943883277217
10-fold cross-validation accuracy:	0.829497219385
Delta(training - validation): 		0.114386057832
Training accuracy:			0.951739618406
10-fold cross-validation accuracy:	0.827212291454
Delta(training - validation): 		0.124527326952
Training accuracy:			0.955106621773
10-fold cross-validation accuracy:	0.821594313926
Delta(training - validation): 		0.133512307848
Training accuracy:			0.95847362514
10-fold cross-validation accuracy:	0.820508171604
Delta(training - validation): 		0.137965453537
Training accuracy:			0.962962962963
10-fold cross-validation accurac

In [80]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
Xtrain = dtrain.drop(['Survived'], axis = 1)
ytrain = dtrain['Survived']

In [22]:
dtrain

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_n,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0,-0.584059,0.432550,-0.473408,-0.515736,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1,0.621016,0.432550,-0.473408,0.772917,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,-0.282790,-0.474279,-0.473408,-0.502152,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,1,1,0.395064,0.432550,-0.473408,0.406983,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0.395064,-0.474279,-0.473408,-0.499636,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
5,0,0,0.196836,-0.474279,-0.473408,-0.491419,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
6,0,0,1.826091,-0.474279,-0.473408,0.382078,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,-2.090403,2.246209,0.767199,-0.237512,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
8,1,1,-0.207473,-0.474279,2.007806,-0.437586,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
9,1,1,-1.186597,0.432550,-0.473408,-0.056474,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [None]:
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(Xtrain, ytrain)

In [18]:
for c in [0.0001,0.001,0.01,0.1,1,3,10,30,100, 300]: 
    print '=========Test on c = %s=========' % c
    test_model(SVC(C=c), dtrain)

Training accuracy:			0.616161616162
10-fold cross-validation accuracy:	0.616170128249
Delta(training - validation): 		-8.51208716379e-06
Training accuracy:			0.616161616162
10-fold cross-validation accuracy:	0.616170128249
Delta(training - validation): 		-8.51208716379e-06
Training accuracy:			0.616161616162
10-fold cross-validation accuracy:	0.616170128249
Delta(training - validation): 		-8.51208716379e-06
Training accuracy:			0.821548821549
10-fold cross-validation accuracy:	0.815976052661
Delta(training - validation): 		0.00557276888738
Training accuracy:			0.837261503928
10-fold cross-validation accuracy:	0.833878674384
Delta(training - validation): 		0.00338282954388
Training accuracy:			0.840628507295
10-fold cross-validation accuracy:	0.835015038021
Delta(training - validation): 		0.00561346927452
Training accuracy:			0.855218855219
10-fold cross-validation accuracy:	0.839547156963
Delta(training - validation): 		0.015671698256
Training accuracy:			0.870931537598
10-fold cross-v

In [19]:
test_model(SVC(), dtrain)

Training accuracy:			0.837261503928
10-fold cross-validation accuracy:	0.833878674384
Delta(training - validation): 		0.00338282954388


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
test_model(SVC(kernel='rbf', C=0.1), dtrain)

Training accuracy:			0.821548821549
10-fold cross-validation accuracy:	0.815976052661
Delta(training - validation): 		0.00557276888738


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
for c in [0.0001,0.001,0.01,0.1,1,3,10,30]: 
    print '=========Test on c = %s=========' % c
    test_model(SVC(C=c, kernel='linear'), dtrain)

Training accuracy:			0.616161616162
10-fold cross-validation accuracy:	0.616170128249
Delta(training - validation): 		-8.51208716379e-06
Training accuracy:			0.631874298541
10-fold cross-validation accuracy:	0.620652025877
Delta(training - validation): 		0.0112222726642
Training accuracy:			0.805836139169
10-fold cross-validation accuracy:	0.804701793213
Delta(training - validation): 		0.00113434595644
Training accuracy:			0.83164983165
10-fold cross-validation accuracy:	0.822667404381
Delta(training - validation): 		0.00898242726894
Training accuracy:			0.838383838384
10-fold cross-validation accuracy:	0.820445182159
Delta(training - validation): 		0.0179386562252
Training accuracy:			0.836139169473
10-fold cross-validation accuracy:	0.821581545795
Delta(training - validation): 		0.0145576236775
Training accuracy:			

In [23]:
for gamma in [0.0001,0.001,0.01,0.1,1,3,10,30,100, 300]: 
    print '=========Test on c = %s=========' % c
    test_model(SVC(gamma=gamma, C = 10), dtrain)

Training accuracy:			0.717171717172
10-fold cross-validation accuracy:	0.701553172171
Delta(training - validation): 		0.0156185450006
Training accuracy:			0.812570145903
10-fold cross-validation accuracy:	0.802442117807
Delta(training - validation): 		0.0101280280962
Training accuracy:			0.835016835017
10-fold cross-validation accuracy:	0.835015038021
Delta(training - validation): 		1.79699617897e-06
Training accuracy:			0.883277216611
10-fold cross-validation accuracy:	0.821682555896
Delta(training - validation): 		0.0615946607145
Training accuracy:			0.914702581369
10-fold cross-validation accuracy:	0.796948984224
Delta(training - validation): 		0.117753597145
Training accuracy:			0.925925925926
10-fold cross-validation accuracy:	0.74873340143
Delta(training - validation): 		0.177192524496
Training accuracy:			0.945005611672
10-fold cross-validation accuracy:	0.727472193849
Delta(training - validation): 		0.217533417824
Training accuracy:			0.95847362514
10-fold cross-validation accu

In [24]:
for gamma in np.arange(0.001, 0.5, 0.002): 
    print '=========Test on gamma = %s=========' % gamma
    test_model(SVC(gamma=gamma, C = 10), dtrain)

Training accuracy:			0.812570145903
10-fold cross-validation accuracy:	0.802442117807
Delta(training - validation): 		0.0101280280962
Training accuracy:			0.833894500561
10-fold cross-validation accuracy:	0.826025990239
Delta(training - validation): 		0.00786851032169
Training accuracy:			0.835016835017
10-fold cross-validation accuracy:	0.830507887867
Delta(training - validation): 		0.0045089471494
Training accuracy:			0.835016835017
10-fold cross-validation accuracy:	0.832755078879
Delta(training - validation): 		0.00226175613816
Training accuracy:			0.835016835017
10-fold cross-validation accuracy:	0.835015038021
Delta(training - validation): 		1.79699617897e-06
Training accuracy:			0.836139169473
10-fold cross-validation accuracy:	0.836126149132
Delta(training - validation): 		1.30203407356e-05
Training accuracy:			0.837261503928
10-fold cross-validation accuracy:	0.833891442515
Delta(training - validation): 		0.00337006141313
Training accuracy:			0.837261503928
10-fold cross-valid

In [32]:
dataset = pd.concat([raw_dtrain, raw_dtest])

In [33]:
dataset.shape

(1309, 12)

In [34]:
raw_dtrain.mean()

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

In [35]:
raw_dtrain.std()

PassengerId    257.353842
Survived         0.486592
Pclass           0.836071
Age             14.526497
SibSp            1.102743
Parch            0.806057
Fare            49.693429
dtype: float64

In [38]:
raw_dtest.mean()

PassengerId    1100.500000
Pclass            2.265550
Age              30.272590
SibSp             0.447368
Parch             0.392344
Fare             35.627188
dtype: float64

In [37]:
raw_dtest.std()

PassengerId    120.810458
Pclass           0.841838
Age             14.181209
SibSp            0.896760
Parch            0.981429
Fare            55.907576
dtype: float64

In [44]:
set(dataset['Ticket'].map(lambda x: x.split()[0].strip('.')))

{'110152',
 '110413',
 '110465',
 '110469',
 '110489',
 '110564',
 '110813',
 '111163',
 '111240',
 '111320',
 '111361',
 '111369',
 '111426',
 '111427',
 '111428',
 '112050',
 '112051',
 '112052',
 '112053',
 '112058',
 '112059',
 '112277',
 '112377',
 '112378',
 '112379',
 '112901',
 '113028',
 '113038',
 '113043',
 '113044',
 '113050',
 '113051',
 '113054',
 '113055',
 '113056',
 '113059',
 '113501',
 '113503',
 '113505',
 '113509',
 '113510',
 '113514',
 '113572',
 '113760',
 '113767',
 '113773',
 '113776',
 '113778',
 '113780',
 '113781',
 '113783',
 '113784',
 '113786',
 '113787',
 '113788',
 '113789',
 '113790',
 '113791',
 '113792',
 '113794',
 '113795',
 '113796',
 '113798',
 '113800',
 '113801',
 '113803',
 '113804',
 '113806',
 '113807',
 '11668',
 '11751',
 '11752',
 '11753',
 '11755',
 '11765',
 '11767',
 '11769',
 '11770',
 '11771',
 '11774',
 '11778',
 '11813',
 '11967',
 '1222',
 '12233',
 '12460',
 '12749',
 '13049',
 '13050',
 '13213',
 '13214',
 '13236',
 '13502',
 '

In [80]:
>>> from sklearn import svm, grid_search, datasets
>>> iris = datasets.load_iris()
>>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
>>> svr = svm.SVC()
>>> clf = grid_search.GridSearchCV(svr, parameters)


In [73]:
param =  [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [82]:
>>> clf = grid_search.GridSearchCV(svr, param, n_jobs=-1)
>>> clf.fit(iris.data, iris.target)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [75]:
clf.get_params()

{'cv': None,
 'error_score': 'raise',
 'estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': None,
 'estimator__degree': 3,
 'estimator__gamma': 'auto',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'fit_params': {},
 'iid': True,
 'n_jobs': -1,
 'param_grid': [{'C': [1, 10, 100, 1000],
   'gamma': [0.001, 0.0001],
   'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'scoring': None,
 'verbose': 0}

In [83]:
report(clf.grid_scores_)

Model with rank: 1
Mean validation score: 0.987 (std: 0.018)
Parameters: {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}

Model with rank: 2
Mean validation score: 0.980 (std: 0.016)
Parameters: {'kernel': 'linear', 'C': 1}

Model with rank: 3
Mean validation score: 0.973 (std: 0.009)
Parameters: {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}



In [77]:
clf.best_params_

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

In [61]:
clf.scorer_

['__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__hash__',
 '__init__',
 '__module__',
 '__name__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'func_closure',
 'func_code',
 'func_defaults',
 'func_dict',
 'func_doc',
 'func_globals',
 'func_name']

In [65]:
clf.scorer_.__doc__

'Function that wraps estimator.score'

In [66]:
print(__doc__)

import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

Automatically created module for IPython interactive environment
RandomizedSearchCV took 4.03 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.922 (std: 0.014)
Parameters: {'bootstrap': False, 'min_samples_leaf': 5, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 7, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.920 (std: 0.010)
Parameters: {'bootstrap': False, 'min_samples_leaf': 6, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 8, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.912 (std: 0.015)
Parameters: {'bootstrap': True, 'min_samples_leaf': 4, 'min_samples_split': 1, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

GridSearchCV took 36.10 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.937 (std: 0.004)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 1, 'criterion': 'gini', 'max_features': 10, 'max_d

In [85]:
grid_search.score(X,y)

AttributeError: 'module' object has no attribute 'score'