In [1]:
import pandas as pd 
from sklearn.datasets import load_wine

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.ensemble import RandomForestClassifier 

import pickle

In [2]:
data = load_wine()
df = pd.DataFrame(data['data'])
df.columns = data['feature_names']
y = data['target']
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## Filter own columns

Create class to keep only features we want in our pipeline. To include classes into pipeline, they must have
- .fit()
- .transform()
- .fit_transform()

In [3]:
class RawFeats:
    def __init__(self,feats):
        self.feats = feats

    def fit(self, X, y=None):
        pass

    def transform(self,X,y=None):
        return X[self.feats]

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# features we want to keep for PCA
feats = ['alcohol','malic_acid','ash','alcalinity_of_ash','magnesium',
         'total_phenols','flavanoids','nonflavanoid_phenols']
# creating class object with indexes we want to keep
raw_feats = RawFeats(feats)

## Scaling, PCA, SelectKBest, RandomForest

In [4]:
sc = StandardScaler()
pca = PCA(n_components=2)
selection = SelectKBest(k=4)
rf = RandomForestClassifier()

## Combining into one pipeline

In [6]:
PCA_pipeline = Pipeline([('raw_feats',raw_feats),('scaler',sc),('pca',pca)])
kbest_pipeline = Pipeline([('kBest', selection)])

all_features = FeatureUnion([('pcaPipeline',PCA_pipeline),('kBestPipeline',kbest_pipeline)])

main_pipeline = Pipeline([('features',all_features),('rf',rf)])

In [8]:
param_grid = {'features__pcaPipeline__pca__n_components':[1,2,3],
                'features__kBestPipeline__kBest__k':[1,2,3],
                'rf__n_estimators':[2,5,10],
                'rf__max_depth':[2,4,6]}

grid_search = GridSearchCV(main_pipeline,param_grid,n_jobs=-1,verbose=10,refit=True)

grid_search.fit(df,y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1971s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0870s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1360s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('pcaPipeline',
                                                                        Pipeline(steps=[('raw_feats',
                                                                                         <__main__.RawFeats object at 0x000001F503244550>),
                                                                                        ('scaler',
                                                                                         StandardScaler()),
                                                                                        ('pca',
                                                                                         PCA(n_components=2))])),
                                                                       ('kBestPipeline',
                                                                        Pipeline(steps=[('kBest',
               

In [9]:
print(grid_search.best_params_)

{'features__kBestPipeline__kBest__k': 3, 'features__pcaPipeline__pca__n_components': 2, 'rf__max_depth': 4, 'rf__n_estimators': 10}


In [10]:
pickle.dump(grid_search,open('model.p','wb'))