In [None]:
from collections import defaultdict, Counter
from itertools import product
import sys
try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; perf_counter is its documented
    # replacement, so keep the `clock` name usable on newer interpreters.
    from time import perf_counter as clock

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import scipy.sparse as sps
from scipy.linalg import pinv
from scipy.spatial import distance

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score as ami, accuracy_score as acc

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import pairwise_distances
# from sklearn.svm import SVC

In [None]:
# Helpers
# Hyper-parameter candidates shared by every MLP grid search in this notebook:
# hidden-layer layouts and L2 penalties (alpha) of 1e-1 .. 1e-4.
nn_arch = [
    (50, 50),
    (50,),
    (25,),
    (25, 25),
    (100, 25, 100),
]
nn_reg = [10 ** -exponent for exponent in (1, 2, 3, 4)]

def cluster_acc(Y, clusterLabels):
    """Score a clustering against true labels by majority vote.

    Each cluster is relabelled with the most frequent true label among its
    members; the returned value is the fraction of samples whose true label
    equals the majority label of the cluster they were assigned to.

    Parameters
    ----------
    Y : numpy array of true labels.
    clusterLabels : numpy array of cluster assignments, same shape as ``Y``.

    Returns
    -------
    float : accuracy of the majority-vote labelling.

    Raises
    ------
    AssertionError : if ``Y`` and ``clusterLabels`` differ in shape.
    """
    assert (Y.shape == clusterLabels.shape)
    pred = np.empty_like(Y)
    for label in set(clusterLabels):
        mask = clusterLabels == label
        # most_common(1) keeps the first-seen label when counts tie.
        pred[mask] = Counter(Y[mask]).most_common(1)[0][0]
    # Accuracy is computed directly instead of through the module-level `acc`
    # alias: the clustering cell rebinds `acc` to a results container, after
    # which calling it from here fails with "object is not callable".
    return float(np.mean(pred == Y))


class myGMM(GMM):
    """Gaussian mixture that can sit in a Pipeline as a transformer step.

    ``transform`` returns whatever ``predict_proba`` returns for ``X``, so a
    downstream estimator is fed the mixture's membership probabilities.
    """

    def transform(self, X):
        """Return ``self.predict_proba(X)``."""
        membership = self.predict_proba(X)
        return membership
        
        
def pairwiseDistCorr(X1, X2):
    """Correlate the pairwise-distance structure of two views of the same samples.

    Builds the full Euclidean distance matrix of each input and returns the
    Pearson correlation between the two flattened matrices.

    Parameters
    ----------
    X1, X2 : array-like or scipy sparse matrix with one row per sample; both
        must have the same number of rows.

    Returns
    -------
    float : off-diagonal entry of ``np.corrcoef`` for the two flattened
        distance matrices.

    Raises
    ------
    AssertionError : if the inputs have a different number of rows.
    """
    assert X1.shape[0] == X2.shape[0]

    def _as_dense(X):
        # cdist needs dense input; sparse matrices are densified first.
        return X.toarray() if sps.issparse(X) else np.asarray(X)

    X1, X2 = _as_dense(X1), _as_dense(X2)
    # scipy's cdist (default metric: euclidean) replaces sklearn's
    # pairwise_distances, whose import is commented out in this file and
    # therefore raised NameError here.
    d1 = distance.cdist(X1, X1)
    d2 = distance.cdist(X2, X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    
def aveMI(X, Y):
    """Average, ignoring NaNs, of the per-feature scores ``MIC(X, Y)`` returns."""
    per_feature_scores = MIC(X, Y)
    return np.nanmean(per_feature_scores)
    
  
def reconstructionError(projections, X):
    """Mean squared error after projecting ``X`` and mapping it back.

    ``projections`` must expose ``components_`` (dense or scipy-sparse). The
    data are pushed through ``pinv(W) @ W`` and the NaN-ignoring mean of the
    squared element-wise residual is returned.
    """
    components = projections.components_
    if sps.issparse(components):
        components = components.todense()
    # pinv(W) @ W maps a sample onto the row space of the projection matrix.
    round_trip = pinv(components) @ components
    residual = X - (round_trip @ (X.T)).T
    return np.nanmean(np.square(residual))
    
    
        
# http://datascience.stackexchange.com/questions/6683/feature-selection-using-feature-importances-in-random-forests-with-scikit-learn          
class ImportanceSelect(BaseEstimator, TransformerMixin):
    """Keep the ``n`` columns ranked highest by ``model.feature_importances_``.

    ``model`` is fitted inside ``fit`` with whatever arguments ``fit``
    receives; ``transform`` then slices the columns of ``X``.
    """

    def __init__(self, model, n=1):
        self.model = model
        self.n = n

    def fit(self, *args, **kwargs):
        """Fit the wrapped model and return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X):
        """Return the columns of ``X`` for the ``n`` largest importances, most important first."""
        ranked = self.model.feature_importances_.argsort()[::-1]
        return X[:, ranked[:self.n]]
                  
#http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans    
def compute_bic(kmeans, X):
    """
    BIC-style score of one fitted k-means clustering of ``X``.
    Parameters:
    -----------------------------------------
    kmeans:  fitted clustering object exposing ``cluster_centers_``,
             ``labels_`` and ``n_clusters``
    X     :  2-D np array of the data points the labels refer to
    Returns:
    -----------------------------------------
    BIC value
    """
    centroids = kmeans.cluster_centers_
    assignments = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # members per cluster
    sizes = np.bincount(assignments)
    # samples and dimensions
    N, d = X.shape

    # pooled variance: squared distances of every point to its own centroid
    within_ss = 0.0
    for idx in range(m):
        members = X[assignments == idx]
        within_ss = within_ss + (distance.cdist(members, [centroids[idx]], 'euclidean') ** 2).sum()
    cl_var = (1.0 / (N - m) / d) * within_ss

    const_term = 0.5 * m * np.log(N) * (d + 1)

    total = 0.0
    for idx in range(m):
        size = sizes[idx]
        total = total + (size * np.log(size)
                         - size * np.log(N)
                         - ((size * d) / 2) * np.log(2 * np.pi * cl_var)
                         - ((size - 1) * d / 2))

    return total - const_term

In [None]:
# Data load and preprocessing

# Mushroom dataset: the file has no header row, so column names are set by hand.
mushroom = pd.read_csv('mushroom.txt', header=None)
print('Dataset shape: {}'.format(mushroom.shape))
mushroom.columns = [
    'type', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# Keep the raw class as a categorical so its integer codes become the target.
mushroom['type_label'] = mushroom['type'].astype('category')
type_label = mushroom['type_label']
print('Mushroom types: ', type_label.cat.categories)
print('Labels balance: \n', type_label.value_counts() / type_label.size)
# Code 1 is for poisonous mushrooms
mushroom['type'] = type_label.cat.codes

# There are no null values, missing is represented as ? only for stalk root
missing_stalk_root = (mushroom['stalk-root'] == '?').sum()
print('Missing values for stalk root: ' + str(missing_stalk_root))

# stalk-root is dropped because of its many missing values; the remaining
# categorical columns are one-hot encoded and cast to float, target to int.
mushroom = pd.get_dummies(mushroom.drop(columns=['type_label', 'stalk-root'])).astype(float)
mushroom['type'] = mushroom['type'].astype(int)

In [None]:
# Wine dataset

wine = pd.read_csv('winequality-white.csv', sep=';')
print('Dataset shape: ', str(wine.shape))

# There are no null values in the dataset

# Binary target: 'good' when quality is 7 or more, 'bad' otherwise.
wine['quality_label'] = np.where(wine['quality'] >= 7, 'good', 'bad')
wine['quality_label'] = wine['quality_label'].astype('category')
quality_label = wine['quality_label']

print('Quality values: ', quality_label.cat.categories)
print('Labels balance: \n', quality_label.value_counts() / quality_label.size)

# Replace the raw score by the category code, then drop the helper column.
wine['quality'] = quality_label.cat.codes
wine = wine.drop(['quality_label'], axis=1).astype(float)
wine['quality'] = wine['quality'].astype(int)

In [None]:
# Load Data
# Split each frame into a feature matrix and a target vector.
mushroomX = mushroom.drop('type', axis=1).values
mushroomY = mushroom['type'].values

wineX = wine.drop('quality', axis=1).values
wineY = wine['quality'].values

# NOTE(review): the scalers are fitted on the full data before the split, so
# the held-out rows influence the scaling statistics — confirm this is intended.
mushroomX = StandardScaler().fit_transform(mushroomX)
wineX = StandardScaler().fit_transform(wineX)

# Stratified 70/30 split with a fixed seed. The original called
# `ms.train_test_split`, but no `ms` alias is ever imported, so the cell
# stopped with NameError; train_test_split is imported directly instead.
mushroom_trgX, mushroom_tstX, mushroom_trgY, mushroom_tstY = train_test_split(
    mushroomX, mushroomY, test_size=0.3, random_state=0, stratify=mushroomY)
wine_trgX, wine_tstX, wine_trgY, wine_tstY = train_test_split(
    wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

# Cluster counts and target dimensionalities swept by the cells below.
# NOTE(review): `dims` goes up to 60; confirm both datasets have that many features.
clusters =  [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

In [None]:
# PCA
out = './PCA/'
# plt.get_cmap works on old and new matplotlib; cm.get_cmap was removed in 3.9.
cmap = plt.get_cmap('Spectral')

np.random.seed(0)

#%% data for 1
# Scree data: explained variance of every component, indexed from 1. The index
# length follows the fitted result; the previous hard-coded lengths (500 and
# 64) made pd.Series raise ValueError unless the data had exactly that many
# components.
for ds_name, ds_X in (('mushroom', mushroomX), ('wine', wineX)):
    pca = PCA(random_state=5)
    pca.fit(ds_X)
    tmp = pd.Series(data=pca.explained_variance_,
                    index=range(1, len(pca.explained_variance_) + 1))
    tmp.to_csv(out + ds_name + ' scree.csv')


#%% Data for 2
# Grid search over PCA size and MLP hyper-parameters. PCA cannot return more
# components than the data has features, so oversized entries of `dims` are
# dropped per dataset instead of failing inside the search.
for ds_name, ds_X, ds_Y in (('mushroom', mushroomX, mushroomY), ('wine', wineX, wineY)):
    grid = {'pca__n_components': [d for d in dims if d <= ds_X.shape[1]],
            'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    pca = PCA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('pca', pca), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' dim red.csv')

# A bare `raise` used to sit here as a manual breakpoint. Outside an `except`
# block it raises RuntimeError, which aborted the cell (and any Run All) and
# left the export below unreachable, so it has been removed.

#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 5
pca = PCA(n_components=dim, random_state=10)

mushroomX2 = pca.fit_transform(mushroomX)
mushroom2 = pd.DataFrame(np.hstack((mushroomX2, np.atleast_2d(mushroomY).T)))
cols = list(range(mushroom2.shape[1]))
cols[-1] = 'Class'
mushroom2.columns = cols
mushroom2.to_hdf(out+'datasets.hdf', 'mushroom', complib='blosc', complevel=9)

# NOTE(review): 60 may exceed the number of wine features; it is capped so PCA
# does not reject it — pick the real value from chart 2.
dim = min(60, wineX.shape[1])
pca = PCA(n_components=dim, random_state=10)
wineX2 = pca.fit_transform(wineX)
wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY).T)))
cols = list(range(wine2.shape[1]))
cols[-1] = 'Class'
wine2.columns = cols
wine2.to_hdf(out+'datasets.hdf', 'wine', complib='blosc', complevel=9)

In [None]:
# ICA
out = './ICA/'

np.random.seed(0)


#%% data for 1
# For every candidate size, fit FastICA and record the mean absolute kurtosis
# of the extracted components.
for ds_name, ds_X in (('mushroom', mushroomX), ('wine', wineX)):
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(ds_X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + ds_name + ' scree.csv')

# Two bare `raise` statements (after step 1 and after step 2) used to act as
# manual breakpoints. Outside an `except` block they raise RuntimeError, which
# aborted the cell and left everything below unreachable; they are removed so
# the cell runs top to bottom like the RF cell.

#%% Data for 2
# Grid search over ICA size and MLP hyper-parameters.
# NOTE(review): the mushroom file name is capitalised here ('Mushroom') but
# lower-case in every other cell — confirm which one downstream code expects.
for file_stem, ds_X, ds_Y in (('Mushroom', mushroomX, mushroomY), ('wine', wineX, wineY)):
    grid = {'ica__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    ica = FastICA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('ica', ica), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + file_stem + ' dim red.csv')

#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 45
ica = FastICA(n_components=dim, random_state=10)

mushroomX2 = ica.fit_transform(mushroomX)
mushroom2 = pd.DataFrame(np.hstack((mushroomX2,np.atleast_2d(mushroomY).T)))
cols = list(range(mushroom2.shape[1]))
cols[-1] = 'Class'
mushroom2.columns = cols
mushroom2.to_hdf(out+'datasets.hdf', 'mushroom', complib='blosc', complevel=9)

# NOTE(review): 60 may exceed the number of wine features — confirm the value
# chosen from chart 2.
dim = 60
ica = FastICA(n_components=dim, random_state=10)
wineX2 = ica.fit_transform(wineX)
wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY).T)))
cols = list(range(wine2.shape[1]))
cols[-1] = 'Class'
wine2.columns = cols
wine2.to_hdf(out+'datasets.hdf', 'wine', complib='blosc', complevel=9)

In [None]:
# Random Forest
out = './RF/'

np.random.seed(0)

#%% data for 1
# Feature importances of one forest configuration, fitted on each dataset in turn.
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
fs_mushroom = rfc.fit(mushroomX, mushroomY).feature_importances_
fs_wine = rfc.fit(wineX, wineY).feature_importances_

# Scree data: importances sorted from largest to smallest.
for ds_name, importances in (('mushroom', fs_mushroom), ('wine', fs_wine)):
    tmp = pd.Series(np.sort(importances)[::-1])
    tmp.to_csv(out + ds_name + ' scree.csv')

#%% Data for 2
# Grid search over the number of kept features and the MLP hyper-parameters;
# the same selector instance is handed to both pipelines.
filtr = ImportanceSelect(rfc)
for ds_name, ds_X, ds_Y in (('mushroom', mushroomX, mushroomY), ('wine', wineX, wineY)):
    grid = {'filter__n': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' dim red.csv')

#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 20
filtr = ImportanceSelect(rfc, dim)

mushroomX2 = filtr.fit_transform(mushroomX, mushroomY)
# Selected features plus the target as the last column, named 'Class'.
mushroom2 = pd.DataFrame(np.column_stack((mushroomX2, mushroomY)))
cols = list(range(mushroom2.shape[1] - 1)) + ['Class']
mushroom2.columns = cols
mushroom2.to_hdf(out+'datasets.hdf', 'mushroom', complib='blosc', complevel=9)

dim = 40
filtr = ImportanceSelect(rfc, dim)
wineX2 = filtr.fit_transform(wineX, wineY)
wine2 = pd.DataFrame(np.column_stack((wineX2, wineY)))
cols = list(range(wine2.shape[1] - 1)) + ['Class']
wine2.columns = cols
wine2.to_hdf(out+'datasets.hdf', 'wine', complib='blosc', complevel=9)

In [None]:
# Random Projection

out = './RP/'
# plt.get_cmap works on old and new matplotlib; cm.get_cmap was removed in 3.9.
cmap = plt.get_cmap('Spectral')

np.random.seed(0)

#%% data for 1
# Ten seeds per candidate size. scree1: correlation between pairwise distances
# before and after projection.
for ds_name, ds_X in (('mushroom', mushroomX), ('wine', wineX)):
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(ds_X), ds_X)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + ds_name + ' scree1.csv')

# scree2: reconstruction error of the fitted projection.
for ds_name, ds_X in (('mushroom', mushroomX), ('wine', wineX)):
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(ds_X)
        tmp[dim][i] = reconstructionError(rp, ds_X)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + ds_name + ' scree2.csv')

#%% Data for 2
# Grid search over projection size and MLP hyper-parameters.
for ds_name, ds_X, ds_Y in (('mushroom', mushroomX, mushroomY), ('wine', wineX, wineY)):
    grid = {'rp__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' dim red.csv')

# A bare `raise` used to sit here as a manual breakpoint. Outside an `except`
# block it raises RuntimeError, which aborted the cell and left the export
# below unreachable, so it has been removed.

#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 10
rp = SparseRandomProjection(n_components=dim, random_state=5)

mushroomX2 = rp.fit_transform(mushroomX)
mushroom2 = pd.DataFrame(np.hstack((mushroomX2, np.atleast_2d(mushroomY).T)))
cols = list(range(mushroom2.shape[1]))
cols[-1] = 'Class'
mushroom2.columns = cols
mushroom2.to_hdf(out+'datasets.hdf', 'mushroom', complib='blosc', complevel=9)

# NOTE(review): 60 may exceed the number of wine features — confirm the value
# chosen from chart 2.
dim = 60
rp = SparseRandomProjection(n_components=dim, random_state=5)
wineX2 = rp.fit_transform(wineX)
wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY).T)))
cols = list(range(wine2.shape[1]))
cols[-1] = 'Class'
wine2.columns = cols
wine2.to_hdf(out+'datasets.hdf', 'wine', complib='blosc', complevel=9)

In [None]:
# Benchmark

out = './BASE/'
np.random.seed(0)

#%% benchmarking for chart type 2
# Baseline: the same MLP grid as the other cells, with no dimensionality
# reduction step in front of the network.
grid = {'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
for ds_name, ds_X, ds_Y in (('mushroom', mushroomX, mushroomY), ('wine', wineX, wineY)):
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' NN bmk.csv')

# The trailing bare `raise` was removed: outside an `except` block it always
# raised RuntimeError, so this cell never finished cleanly and a Run All
# stopped here.

In [None]:
# Clustering
# python clustering.py PCA
# python clustering.py BASE
# python clustering.py ICA
# python clustering.py RP
# python clustering.py RF

# As a script the first CLI argument names the output folder. When there is no
# such argument, or it is an option flag (a notebook kernel is typically
# launched with `-f <connection file>`), write to ./BASE/ instead.
_cli_target = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith('-') else 'BASE'
out = './{}/'.format(_cli_target)

np.random.seed(0)

_datasets = (('mushroom', mushroomX, mushroomY), ('wine', wineX, wineY))

#%% Data for 1-3
SSE = defaultdict(dict)
ll = defaultdict(dict)
# Results are keyed dataset -> k -> algorithm. The accuracy container is NOT
# called `acc`: that name is the accuracy_score alias and rebinding it breaks
# every later call that goes through the alias.
acc_scores = {'mushroom': defaultdict(dict), 'wine': defaultdict(dict)}
adjMI = {'mushroom': defaultdict(dict), 'wine': defaultdict(dict)}
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    for ds_name, ds_X, ds_Y in _datasets:
        km.fit(ds_X)
        gmm.fit(ds_X)
        SSE[k][ds_name] = km.score(ds_X)
        ll[k][ds_name] = gmm.score(ds_X)
        # Predict once per model and reuse the labels for both metrics.
        km_labels = km.predict(ds_X)
        gmm_labels = gmm.predict(ds_X)
        acc_scores[ds_name][k]['Kmeans'] = cluster_acc(ds_Y, km_labels)
        acc_scores[ds_name][k]['GMM'] = cluster_acc(ds_Y, gmm_labels)
        adjMI[ds_name][k]['Kmeans'] = ami(ds_Y, km_labels)
        adjMI[ds_name][k]['GMM'] = ami(ds_Y, gmm_labels)
    print(k, clock()-st)


# KMeans.score is negated before export, hence the sign flip.
SSE = (-pd.DataFrame(SSE)).T
SSE = SSE.rename(columns=lambda x: x+' SSE (left)')
ll = pd.DataFrame(ll).T
ll = ll.rename(columns=lambda x: x+' log-likelihood')

SSE.to_csv(out+'SSE.csv')
ll.to_csv(out+'logliklihood.csv')
# pd.Panel and the .ix indexer no longer exist in current pandas; plain
# DataFrames with one row per algorithm and one column per k replace them.
for ds_name in ('wine', 'mushroom'):
    pd.DataFrame(acc_scores[ds_name]).sort_index().to_csv(out + ds_name + ' acc.csv')
for ds_name in ('wine', 'mushroom'):
    pd.DataFrame(adjMI[ds_name]).sort_index().to_csv(out + ds_name + ' adjMI.csv')


#%% NN fit data (2,3)
# Use each clustering as a feature transformer in front of the MLP. All four
# searches use cv=5; the first one previously omitted it and fell back to the
# library default, unlike the other three.
for ds_name, ds_X, ds_Y in _datasets:
    grid = {'km__n_clusters': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' cluster Kmeans.csv')

    grid = {'gmm__n_components': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(ds_X, ds_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + ds_name + ' cluster GMM.csv')


# %% For chart 4/5
# 2-D t-SNE embedding of each dataset, stored next to its target.
mushroomX2D = TSNE(verbose=10, random_state=5).fit_transform(mushroomX)
wineX2D = TSNE(verbose=10, random_state=5).fit_transform(wineX)

mushroom2D = pd.DataFrame(np.hstack((mushroomX2D, np.atleast_2d(mushroomY).T)), columns=['x','y','target'])
wine2D = pd.DataFrame(np.hstack((wineX2D, np.atleast_2d(wineY).T)), columns=['x','y','target'])

mushroom2D.to_csv(out+'mushroom2D.csv')
wine2D.to_csv(out+'wine2D.csv')

In [None]:
# madelon tricks
out = './PCA/'
# plt.get_cmap works on old and new matplotlib; cm.get_cmap was removed in 3.9.
cmap = plt.get_cmap('Spectral')

np.random.seed(0)

# NOTE(review): nothing visible in this notebook writes './BASE/datasets.hdf'
# (the benchmark cell only writes CSV files) — confirm where this file comes from.
mushroom = pd.read_hdf('./BASE/datasets.hdf','mushroom')
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
mushroomX = mushroom.drop('Class', axis=1).copy().values
mushroomY = mushroom['Class'].copy().values
scaler =StandardScaler()

# NOTE(review): the "test" frame is read from the same file and key as the
# training frame above, so the test scores below are computed on the training
# data — confirm whether a separate test key was intended.
mushroom_test = pd.read_hdf('./BASE/datasets.hdf','mushroom')
mushroom_tstX = mushroom_test.drop('Class', axis=1).copy().values
mushroom_tstY = mushroom_test['Class'].copy().values

# Fit the scaler on the training matrix and reuse it for the test matrix.
mushroomX = scaler.fit_transform(mushroomX)
mushroom_tstX = scaler.transform(mushroom_tstX)


#Reproduce best estimator so far
#if __name__=='__main__':
#    rfc = RandomForestClassifier(n_estimators=100,class_weight='balanced',random_state=5,n_jobs=7)
#    filtr = ImportanceSelect(rfc)
#    grid ={'filter__n':[20],'NN__alpha':nn_reg,'NN__hidden_layer_sizes':nn_arch}
#    mlp = MLPClassifier(activation='relu',max_iter=2000,early_stopping=True,random_state=5)
#    pipe = Pipeline([('filter',filtr),('NN',mlp)])
#    gs = GridSearchCV(pipe,grid,verbose=10,cv=5)    
#    gs.fit(mushroomX,mushroomY)
#    print('Best CV Score {}'.format(gs.best_score_))
#    print('Test Score {}'.format(gs.score(mushroom_tstX,mushroom_tstY)))
#    rf_features = gs.best_estimator_.steps[0][1].model.feature_importances_.argsort()[::-1][:20]
    
    
# Use PCA to find true correct features
# PCA cannot return more components than min(n_samples, n_features); the
# previous fixed request for 500 components fails on smaller data, so it is capped.
n_keep = min(500, *mushroomX.shape)
pca = PCA(random_state=5, n_components=n_keep)
pca.fit(mushroomX)
ve = pd.Series(pca.explained_variance_)
ve.plot()
plt.xlabel('Component')
plt.ylabel('Variance Explained')
# Look at the last 15 components only and keep every original feature whose
# absolute loading on any of them exceeds 0.1.
tmp = pd.DataFrame(pca.components_)
tmp=tmp.iloc[-15:,:]
pca_features=tmp.columns[tmp.abs().max()>0.1]


xx= mushroomX[:, pca_features]
xx_tst = mushroom_tstX[:, pca_features]

## NN testing - standard param set
#grid ={'alpha':nn_reg,'hidden_layer_sizes':nn_arch}
#mlp = MLPClassifier(activation='relu',max_iter=3000,early_stopping=False,random_state=5)
#gs = GridSearchCV(mlp,param_grid=grid,verbose=10,cv=5)
#gs.fit(mushroomX[:,pca_features],mushroomY)
#print('NN - Standard params - Best CV Score {}'.format(gs.best_score_))
#print('NN - Standard params - Test Score {}'.format(gs.score(xx_tst,mushroom_tstY)))
#
#
#
## NN testing - standard param set
#grid ={'alpha':[1e-4,1e-5,1e-6],'hidden_layer_sizes':[(200,100,100,64,100,100,200)]}
#mlp = MLPClassifier(activation='relu',max_iter=3000,early_stopping=False,random_state=5)
#gs = GridSearchCV(mlp,param_grid=grid,verbose=10,cv=5)
#gs.fit(mushroomX[:,pca_features],mushroomY)
#print('NN - Big network- Best CV Score {}'.format(gs.best_score_))
#print('NN - Big network - Test Score {}'.format(gs.score(xx_tst,mushroom_tstY)))


#KNN
# Grid search over neighbour count (1-24), Minkowski power (1 or 2) and vote
# weighting on the PCA-selected features. KNeighborsClassifier comes from the
# notebook's import cell; it was previously referenced without being imported.
knn = KNeighborsClassifier()
grid = {'n_neighbors': range(1, 25), 'p': [1, 2], 'weights': ['uniform', 'distance']}
gs = GridSearchCV(knn, param_grid=grid, cv=5, verbose=10)
gs.fit(xx, mushroomY)
print('KNN - Best CV Score {}'.format(gs.best_score_))
print('KNN - Test Score {}'.format(gs.score(xx_tst, mushroom_tstY)))


# SVM
# Candidate gammas are multiples (0.1 .. 2.0) of 1 / median pairwise distance.
# scipy's cdist (euclidean by default) builds the full distance matrix; the
# previously used pairwise_distances was never imported in this notebook.
dis = distance.cdist(xx, xx)
m = np.median(dis)

# Two earlier `gammas` assignments were overwritten before use and are dropped.
gammas = [(1/m)*x for x in np.arange(0.1,2.1,0.1)]
param_grid={'gamma':gammas,'C':[10**x for x in [-1,0,1,2,3]]}
gs = GridSearchCV(SVC(kernel='rbf', C=1), param_grid=param_grid, cv=5, verbose=10, n_jobs=1)
gs.fit(xx, mushroomY)
print('SVM - Best CV Score {}'.format(gs.best_score_))
print('SVM - Test Score {}'.format(gs.score(xx_tst, mushroom_tstY)))