In [73]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
import pickle
import os

In [5]:
# Load the wine data set, putting the features into a DataFrame (X) with
# named columns and the targets into y.
wine_dataset = load_wine()
X = pd.DataFrame(wine_dataset['data'], columns=wine_dataset['feature_names'])
y = wine_dataset['target']

In [7]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


### Filtering Columns

In [8]:
class RawFeats:
    """Transformer-style step that keeps only a fixed subset of columns.

    It exposes ``fit`` / ``transform`` / ``fit_transform`` so it can be placed
    as a step inside a scikit-learn ``Pipeline``.

    NOTE(review): the class does not define ``get_params`` / ``set_params``,
    so ``feats`` presumably cannot be tuned through grid-search parameter
    names; inheriting from ``sklearn.base.BaseEstimator`` would add them.

    Parameters
    ----------
    feats : list
        Column labels to keep; ``X`` is indexed as ``X[feats]``.
    """

    def __init__(self, feats):
        # Stored as-is (no copy), so the caller's list object is referenced.
        self.feats = feats

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        makes ``RawFeats(...).fit(X).transform(X)`` work.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the configured columns.

        ``X`` must support label-list indexing (``X[self.feats]``), e.g. a
        pandas DataFrame; a missing label raises whatever ``X`` raises.
        """
        return X[self.feats]

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the column subset of ``X``."""
        return self.fit(X, y).transform(X)

In [9]:
# List the available column labels (used to choose the feature subset).
X.columns

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'],
      dtype='object')

In [10]:
# Columns that are kept and fed to the PCA branch.
feats = [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
]

In [11]:
# Column-subset step configured with the features chosen above.
raw_feats = RawFeats(feats=feats)

In [18]:
# Individual building blocks: scaler, PCA, univariate selector and classifier.
sc = StandardScaler()
pca = PCA(n_components=2)
selection = SelectKBest(k=4)
# Seed the forest so repeated runs give the same model and score
# (same seed value as the train/test split).
rf = RandomForestClassifier(random_state=4)

In [15]:
# Two feature branches, each wrapped in its own pipeline:
#   pca_pipeline   -- column subset -> standard scaling -> PCA
#   kbest_pipeline -- SelectKBest on its own
pca_pipeline = Pipeline([
    ('raw_feats', raw_feats),
    ('standard_scaler', sc),
    ('pca', pca),
])
kbest_pipeline = Pipeline([('kBest', selection)])

In [16]:
# Combine the outputs of both feature branches into one feature set.
all_features = FeatureUnion([
    ('pca_pipeline', pca_pipeline),
    ('kbest_pipeline', kbest_pipeline),
])

In [19]:
# Full model: combined features followed by the random-forest classifier.
pipeline = Pipeline([('features', all_features), ('rf', rf)])

In [64]:
# Hold out 30% of the samples for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [66]:
# Search space: one candidate value per hyper-parameter.
param_grid = [
    dict(
        rf__max_depth=[6],
        features__pca_pipeline__pca__n_components=[2],
        rf__n_estimators=[100],
        features__kbest_pipeline__kBest__k=[5],
    )
]
# best result from grid search
best_estimator = dict(
    features__kbest_pipeline__kBest__k=5,
    features__pca_pipeline__pca__n_components=2,
    rf__max_depth=6,
    rf__n_estimators=100,
)

In [67]:
# 3-fold cross-validated search over param_grid, using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    n_jobs=-1,
    cv=3,
    verbose=5,
)

In [None]:
# Run the search on the training split.
grid_search.fit(X=X_train, y=y_train)

In [74]:
# grid_search.best_params_

In [70]:
# grid_search.estimator.get_params().keys()

In [76]:
# Score the search object on the held-out test split.
grid_search.score(X=X_test, y=y_test)

0.9814814814814815

In [79]:
# Output location for the serialised model. The directory variable is called
# `data_dir` so the built-in `dir()` stays usable in the notebook namespace.
data_dir = './data/'
fname = os.path.join(data_dir, 'trained_model.pkl')
try:
    os.mkdir(data_dir)
except FileExistsError:
    # FileExistsError is also raised when a non-directory occupies the path;
    # surface that case instead of reporting it as an existing directory.
    if not os.path.isdir(data_dir):
        raise
    print(f'"{data_dir}" already exists...')

"./data/" already exists...


In [80]:
# Serialise the whole grid-search object to `fname`.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably needs RawFeats to be importable under the same module path.
with open(fname, mode='wb') as model_file:
    pickle.Pickler(model_file).dump(grid_search)