## Advanced tuning of parameters

In this tutorial, we will apply skills from previous tutorials and build the classifier using Pipelines and FeatureUnion

In [13]:
# IMPORT PACKAGES
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier

### Data
We will use data about diabetes. We will build a classifier that predicts whether person has a diabetes or no using information about his health. The dataset can be found [here](https://drive.google.com/file/d/1TvCKlmH3Z32XAKk-VUcZyYu95Ccyw3PO/view?usp=sharing).

In [3]:
col_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv('pima-indians-diabetes.csv',sep=";")
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
X, y = df[col_names[0:-1]], df[col_names[-1]]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

### Task

Build classifier which predicts target variable `class` using rest of the attributes. The model should be fitted using pipeline that contains:
- PCA method
- SelectKBest method
- FeatureUnion
- Random Forest

Choose the best set of parameters using `Pipeline` and grid_search.

In [15]:
pca = PCA()
selection = SelectKBest()

combined_features = FeatureUnion([("pca",pca), ('univ_select', selection)])
rndmF = RandomForestClassifier()

In [16]:
pipeline = Pipeline([('features', combined_features), ('rndmF', rndmF)])

param_grid = {"features__pca__n_components":[1,2,3],
                'features__univ_select__k': [1,2,3],
                'rndmF__n_estimators':[25,50,100],
                'rndmF__max_depth':[None, 5, 10, 25],
                'rndmF__n_estimators':[25,50,100]
                }

gridsearch = GridSearchCV(pipeline,param_grid,verbose=5, refit=True)

gridsearch.fit(X_train,y_train)

 rndmF__n_estimators=50 
[CV]  features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=50, score=0.748, total=   0.1s
[CV] features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100 
[CV]  features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100, score=0.731, total=   0.1s
[CV] features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100 
[CV]  features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100, score=0.722, total=   0.2s
[CV] features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100 
[CV]  features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_estimators=100, score=0.738, total=   0.2s
[CV] features__pca__n_components=3, features__univ_select__k=2, rndmF__max_depth=10, rndmF__n_

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('pca',
                                                                        PCA()),
                                                                       ('univ_select',
                                                                        SelectKBest())])),
                                       ('rndmF', RandomForestClassifier())]),
             param_grid={'features__pca__n_components': [1, 2, 3],
                         'features__univ_select__k': [1, 2, 3],
                         'rndmF__max_depth': [None, 5, 10, 25],
                         'rndmF__n_estimators': [25, 50, 100]},
             verbose=5)

In [17]:
print(gridsearch.best_params_)
print('Final score is :', gridsearch.score(X_test,y_test))

{'features__pca__n_components': 2, 'features__univ_select__k': 3, 'rndmF__max_depth': 5, 'rndmF__n_estimators': 50}
Final score is : 0.7575757575757576
