In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import operator
%matplotlib inline
from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

---
# The Iris Dataset Part 2
---

In [15]:
#Iris ships with scikit-learn; it is loaded through the `datasets` module imported in the first cell

iris = datasets.load_iris()
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


---
## Construct a pandas DataFrame
---

In [16]:
#Gather the label vector and the four measurement columns of iris.data, keyed by column name
df = {'class': iris.target}
df.update(zip(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
              iris.data.T))

In [17]:
#Turn the name -> column mapping into a dataframe (one column per key)
df = pd.DataFrame(data=df)

In [18]:
#We will need a list of features only
#Drop the label column by name rather than by position, so the result does not
#depend on where 'class' happens to sit in the dataframe's column order
features = df.columns.drop('class')
print(features)

Index([u'petal_length', u'petal_width', u'sepal_length', u'sepal_width'], dtype='object')


In [19]:
#Show the last five rows as a quick sanity check of the assembled dataframe
df.tail(n=5)

Unnamed: 0,class,petal_length,petal_width,sepal_length,sepal_width
145,2,5.2,2.3,6.7,3.0
146,2,5.0,1.9,6.3,2.5
147,2,5.2,2.0,6.5,3.0
148,2,5.4,2.3,6.2,3.4
149,2,5.1,1.8,5.9,3.0


---
## Put the scaled data into a new dataframe
---

In [20]:
#Standardize the feature columns (centering and scaling are both enabled).
#NOTE: the scaler is fit on every row of df, i.e. before any train/test split is made.
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(df[features])
df_scaled = pd.DataFrame(data=scaler.transform(df[features]), columns=features)

In [21]:
#Carry the (unscaled) class labels over; a plain array is assigned so no index alignment is involved
df_scaled['class'] = np.asarray(df['class'])

In [22]:
#Show the first five rows of the scaled dataframe
df_scaled.head(n=5)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,class
0,-1.341272,-1.312977,-0.900681,1.032057,0
1,-1.341272,-1.312977,-1.143017,-0.124958,0
2,-1.398138,-1.312977,-1.385353,0.337848,0
3,-1.284407,-1.312977,-1.506521,0.106445,0
4,-1.341272,-1.312977,-1.021849,1.26346,0


---
## Define a custom confusion matrix function
---

In [23]:
def my_confusion_matrix(predictions, y, names):
    '''Build a labelled confusion matrix with pd.crosstab.

    predictions are the class labels predicted by the model
    y are the known (actual) class labels
    names are the display names of the classes, one per distinct label,
          given in the sorted order of the label values

    Returns a square dataframe with actual classes as rows ('Actual') and
    predicted classes as columns ('Prediction').

    Raises ValueError if the number of distinct labels found in y and
    predictions together differs from len(names).
    '''

    y = np.asarray(y)
    predictions = np.asarray(predictions)

    #Every label that occurs on either side, sorted; names are matched to these by position
    labels = np.union1d(y, predictions)
    if len(labels) != len(names):
        raise ValueError(
            "Found {:d} distinct class labels but {:d} names were given".format(
                len(labels), len(names)))

    cf = pd.crosstab(y, predictions)

    #A class that is never predicted (or never occurs in y) has no column (or row)
    #in the raw crosstab; add it back with zero counts so the matrix is always
    #square and the names line up with the right labels
    cf = cf.reindex(index=labels, columns=labels, fill_value=0)

    cf.columns = pd.Index(names, name='Prediction')
    cf.index = pd.Index(names, name='Actual')
    return cf

---
# Full Training with Cross-Validation
---

In [24]:
def find_optimal_features(dfs, target, all_features, 
                          max_input_dimension = 5,
                          degree_list=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                          reg_list=(1.0,),
                          test_size=0.5,
                          random_state=32):
    '''Grid-search polynomial degree, PCA dimension and regularization strength.

    Each candidate model is PolynomialFeatures -> PCA -> LogisticRegression,
    fit on the training part of one stratified shuffle split and scored on
    the held-out part.

    dfs                 dataframe holding the feature columns and the target column
    target              name of the target (class label) column
    all_features        names of the feature columns
    max_input_dimension largest number of PCA components to try
    degree_list         polynomial degrees to try
    reg_list            values of the LogisticRegression C parameter to try
    test_size           passed to StratifiedShuffleSplit (size of the held-out part)
    random_state        passed to StratifiedShuffleSplit (seed of the split)

    Returns (results, results_vars), two dicts keyed by a running model number:
      results[k]      is the held-out score of model k
      results_vars[k] is the list [input_dimension, degree, reg_C, fitted classifier,
                      transformed training X, training y, transformed test X, test y,
                      fitted PolynomialFeatures, fitted PCA]
    '''

    #results dictionaries, both indexed by the running model number
    results = {}
    results_vars = {}
    count = 0

    #Convert the dataframe data to plain arrays
    X = dfs[all_features].values
    y = dfs[target].values

    #Stratified shuffle split (legacy sklearn.cross_validation API: the labels are
    #passed to the constructor and the object itself is iterated)
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=random_state)

    #split the data first; only the training part is used to fit any model
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for degree in degree_list:

            pf_model = PolynomialFeatures(degree).fit(X_train)
            pf = pf_model.transform(X_train)
            #The test-set polynomial features depend only on the degree, so they are
            #computed once here rather than once per regularization value
            pf_test = pf_model.transform(X_test)

            #PCA cannot return more components than there are polynomial features or
            #training samples. The cap is kept in a local variable: overwriting
            #max_input_dimension here would let a low degree permanently shrink the
            #search range of every higher degree that follows it.
            dimension_cap = min(max_input_dimension, pf.shape[0], pf.shape[1])

            #now use PCA to try models of varying dimensions
            for input_dimension in range(1, dimension_cap + 1):

                myPCA = PCA(n_components = input_dimension).fit(pf)

                #The final transformed datasets (the test one is shared by all C values)
                X_transform = myPCA.transform(pf)
                X_test_transform = myPCA.transform(pf_test)

                for reg_C in reg_list:

                    #Build and fit the logistic regression model, then score it on the held-out part
                    clfLR = LogisticRegression(C=reg_C)
                    clfLR.fit(X_transform, y_train)
                    my_score = clfLR.score(X_test_transform, y_test)

                    #We accumulate all results as we might like to look at more than just the 'best'
                    results[count] = my_score
                    results_vars[count] = [input_dimension, degree, reg_C,
                                           clfLR, X_transform, y_train, X_test_transform, y_test,
                                           pf_model, myPCA]
                    count += 1

    return (results, results_vars)

In [25]:
#Run the search over polynomial degree, PCA dimension and regularization strength
scores, input_vars = find_optimal_features(
    dfs=df_scaled,
    target='class',
    all_features=features,
    max_input_dimension=8,
    degree_list=[2, 3, 4, 5, 6],
    reg_list=np.logspace(-4, 6, 10)
)

In [26]:
#Preview of the regularizer grid: ten values spaced evenly on a log scale from 1e-4 to 1e6
np.logspace(start=-4, stop=6, num=10)

array([  1.00000000e-04,   1.29154967e-03,   1.66810054e-02,
         2.15443469e-01,   2.78255940e+00,   3.59381366e+01,
         4.64158883e+02,   5.99484250e+03,   7.74263683e+04,
         1.00000000e+06])

In [27]:
#Rank the (model number, score) pairs by score, best first, and keep the first ten
top_ten = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:10]

In [28]:
#Report each of the top models: its number, its score, the PCA input dimension,
#the polynomial degree and the regularizer C
for model_id, model_score in top_ten:
    in_dim, poly_degree, reg_c = input_vars[model_id][:3]
    print("model number {:d}".format(model_id))
    print(" ".join(["score = {:5.5}".format(model_score),
                    "In Dim = {:d}".format(in_dim),
                    "Deg = {:d}".format(poly_degree),
                    "C = {:7.4f}".format(reg_c)]))
    print("\n")

model number 144
score = 0.98667 In Dim = 7 Deg = 3 C =  2.7826


model number 53
score = 0.97333 In Dim = 6 Deg = 2 C =  0.2154


model number 63
score = 0.97333 In Dim = 7 Deg = 2 C =  0.2154


model number 73
score = 0.97333 In Dim = 8 Deg = 2 C =  0.2154


model number 125
score = 0.97333 In Dim = 5 Deg = 3 C = 35.9381


model number 126
score = 0.97333 In Dim = 5 Deg = 3 C = 464.1589


model number 143
score = 0.97333 In Dim = 7 Deg = 3 C =  0.2154


model number 145
score = 0.97333 In Dim = 7 Deg = 3 C = 35.9381


model number 153
score = 0.97333 In Dim = 8 Deg = 3 C =  0.2154


model number 215
score = 0.97333 In Dim = 6 Deg = 4 C = 35.9381




In [29]:
#But we are not done yet. We need to look at the confusion matrices for the top models

#Slicing (instead of indexing with a fixed range of five) means a search that
#produced fewer than five models does not raise an IndexError
for top_entry in top_ten[:5]:
    tn = top_entry[0]
    #Stored per model: index 3 is the fitted classifier, 6 the transformed test X, 7 the test y
    model = input_vars[tn][3]
    X_test = input_vars[tn][6]
    y_test = input_vars[tn][7]
    y_hat = model.predict(X_test)

    print("----------------------------------------")
    print("Model score = {:5.5f}".format(model.score(X_test, y_test)))
    cm = my_confusion_matrix(y_hat, y_test, iris.target_names)
    print(cm)
    print("----------------------------------------\n")


----------------------------------------
Model score = 0.98667
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           1         24
----------------------------------------

----------------------------------------
Model score = 0.97333
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           2         23
----------------------------------------

----------------------------------------
Model score = 0.97333
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           2         23
----------------------------------------

----------------------------------------
Model score = 

In [31]:
#Using the best model let's predict something new
tn = top_ten[0][0]

#The measurements are keyed by feature name and then arranged in the order of
#`features` (the column order the scaler was fit with), so a value cannot be
#silently fed in as the wrong feature
new_measurements = {'petal_length': 1.5, 'petal_width': 0.45,
                    'sepal_length': 8.2, 'sepal_width': 3.7}
new_input = [new_measurements[name] for name in features]

#input scaling first; the single sample is wrapped in an outer list because the
#transformers work on 2-D (n_samples, n_features) input
new_input_scaled = scaler.transform([new_input])

#now the polynomial features (index 8 holds the fitted PolynomialFeatures model)
pf = input_vars[tn][8]
p_input = pf.transform(new_input_scaled)

#and now the PCA (index 9 holds the fitted PCA model)
myPCA = input_vars[tn][9]
new_X_transform = myPCA.transform(p_input)

#and now predict using the LR model (index 3 holds the fitted classifier)
clfLR = input_vars[tn][3]
pred = clfLR.predict(new_X_transform)
print(" ".join(["This data is predicted to belong to class {:d}".format(pred[0]),
                str(iris.target_names[pred[0]])]))

#One probability per class, however many classes the model was trained on
prob = clfLR.predict_proba(new_X_transform)
print(" ".join(["...with the following probabilities"] +
               [str(p) for p in prob.ravel()]))

This data is predicted to belong to class 1 versicolor
...with the following probabilities 0.374272850942 0.62572714802 1.03830398805e-09
