In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import operator
%matplotlib inline
from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

---
# The Iris Dataset Part 2
---

In [2]:
#Iris is one of scikit-learn's built-in datasets; `datasets` comes from the import cell above

iris = datasets.load_iris()
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


---
## Construct a Pandas dataframe
---

In [3]:
#Map each column name to its values: the class labels plus the four columns of iris.data
df = {
    'class': iris.target,
    'sepal_length': iris.data[:, 0],
    'sepal_width': iris.data[:, 1],
    'petal_length': iris.data[:, 2],
    'petal_width': iris.data[:, 3],
}

In [4]:
#Turn the column dictionary into a DataFrame (one column per dictionary key)
df = pd.DataFrame(data=df)

In [5]:
#We will need a list of features only: every column except the target.
#Select by name rather than by position so the result does not depend on
#where 'class' happens to land in the DataFrame's column order.
features = df.columns.drop('class')
print(features)

Index([u'petal_length', u'petal_width', u'sepal_length', u'sepal_width'], dtype='object')


In [6]:
#Peek at the last five rows
df.tail(5)

Unnamed: 0,class,petal_length,petal_width,sepal_length,sepal_width
145,2,5.2,2.3,6.7,3.0
146,2,5.0,1.9,6.3,2.5
147,2,5.2,2.0,6.5,3.0
148,2,5.4,2.3,6.2,3.4
149,2,5.1,1.8,5.9,3.0


---
## Put the scaled data into a new dataframe
---

In [7]:
#Fit the scaler on the feature columns only; it is kept around so new inputs can be scaled the same way later
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(df[features])

#Scaled copy of the features, with the original column names
scaled_values = scaler.transform(df[features])
df_scaled = pd.DataFrame(scaled_values, columns=features)

In [8]:
#Carry the (unscaled) class labels over as a plain array, so they are assigned by position
df_scaled['class'] = np.asarray(df['class'])

In [9]:
#Peek at the first five scaled rows
df_scaled.head(5)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,class
0,-1.341272,-1.312977,-0.900681,1.032057,0
1,-1.341272,-1.312977,-1.143017,-0.124958,0
2,-1.398138,-1.312977,-1.385353,0.337848,0
3,-1.284407,-1.312977,-1.506521,0.106445,0
4,-1.341272,-1.312977,-1.021849,1.26346,0


---
## Define a custom confusion matrix function
---

In [10]:
def my_confusion_matrix(predictions, y, names):
    '''Build a labelled confusion matrix using the pd.crosstab function.

    predictions -- the class labels predicted by the model
    y           -- the known class labels, aligned with predictions
    names       -- display names of the classes (not the features), listed in
                   the sorted order of the class labels

    Returns a DataFrame whose rows ('Actual') and columns ('Prediction') are
    both labelled with names. A class that never occurs in y or in predictions
    still gets an all-zero row/column, so the result is always square.

    Raises ValueError if the labels found in the data cannot be matched to
    names (more distinct labels than names, or fewer labels that are not the
    integers 0..len(names)-1).
    '''
    actual = np.asarray(y)
    predicted = np.asarray(predictions)
    n_classes = len(names)

    #Every label seen on either side, in sorted order (the order crosstab uses)
    observed = sorted(set(actual.tolist()) | set(predicted.tolist()))

    if len(observed) == n_classes:
        labels = observed
    elif len(observed) < n_classes and set(observed) <= set(range(n_classes)):
        #Some class is absent from the data altogether; this is only
        #recoverable when the labels are the integer codes 0..n_classes-1
        labels = list(range(n_classes))
    else:
        raise ValueError(
            "cannot match observed labels {} to {} class names".format(observed, n_classes))

    cf = pd.crosstab(actual, predicted)

    #crosstab only creates rows/columns for labels it actually saw; pad the
    #missing ones with zeros so the positional renaming below cannot misalign
    cf = cf.reindex(index=labels, columns=labels, fill_value=0)

    cf.columns = names
    cf.index = names
    cf.columns.name = 'Prediction'
    cf.index.name = 'Actual'
    return cf

---
# Full Training with Cross-Validation
---

In [49]:
def find_optimal_features(dfs, target, all_features, 
                          max_input_dimension = 5,
                          degree_list=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                          reg_list=(1.0,)):
    '''Grid-search polynomial degree, PCA dimension and regularisation strength.

    For every combination of degree, number of PCA components and C, a pipeline
    of PolynomialFeatures -> PCA -> LogisticRegression is fitted on the training
    half of a single stratified 50/50 split (fixed random_state) and scored on
    the other half.

    dfs                 -- DataFrame holding the features and the target column
    target              -- name of the target column in dfs
    all_features        -- names of the feature columns in dfs
    max_input_dimension -- largest number of PCA components to try; capped at
                           the number of polynomial features for each degree
    degree_list         -- polynomial degrees to try
    reg_list            -- values of the LogisticRegression C parameter to try

    Returns a dict keyed by rank (0 = highest score); ties keep the order in
    which the models were fitted. Each value is a dict with the keys 'score',
    'indim', 'deg', 'reg', 'clf', 'xtrain', 'ytrain', 'xtest', 'ytest', 'poly'
    and 'pca'.

    NOTE(review): the models are ranked on the same held-out half that the
    reported score comes from, so the top score is an optimistic estimate.
    '''
    #Convert the dataframe data to user-friendly arrays
    X = dfs[all_features].values
    y = dfs[target].values

    #for classification I use the Stratified Shuffle Split for cross validation purposes
    #(legacy sklearn.cross_validation API: labels go to the constructor and the
    #object itself is iterated; n_iter=1 means the loop below runs once)
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.5, random_state=32) 

    #every fitted model is collected here, in fitting order
    candidates = []

    #split the data first and the training set is what is used for all model parameters!!
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #look for a good degree for the polynomial features
        for degree in degree_list:
            pf_model = PolynomialFeatures(degree).fit(X_train)
            pf = pf_model.transform(X_train)

            #the test-set polynomial features depend only on the degree,
            #so compute them once here rather than once per model
            pf_test = pf_model.transform(X_test)

            #if the polynomial features are not enough to satisfy the max input
            #dimension, as specified by the user, then reduce the max input dimension
            the_in_dim = min(max_input_dimension, pf.shape[1])

            #now use PCA to try models of varying dimensions
            for input_dimension in range(1, the_in_dim + 1):
                myPCA = PCA(n_components = input_dimension).fit(pf)

                #The final transformed datasets; both are independent of the
                #regulariser, so they are shared by all models in the loop below
                X_transform = myPCA.transform(pf)
                X_test_transform = myPCA.transform(pf_test)

                #Regularize the model
                for reg_C in reg_list:
                    #Finally build and fit the logistic regression model
                    clfLR = LogisticRegression(C=reg_C)
                    clfLR.fit(X_transform, y_train)

                    #We accumulate all results as we might like to look at more than just the 'best'
                    candidates.append({
                        'score': clfLR.score(X_test_transform, y_test),
                        'indim': input_dimension,
                        'deg': degree,
                        'reg': reg_C,
                        'clf': clfLR,
                        'xtrain': X_transform,
                        'ytrain': y_train,
                        'xtest': X_test_transform,
                        'ytest': y_test,
                        'poly': pf_model,
                        'pca': myPCA,
                    })

    #best score first; the sort is stable, so equal scores stay in fitting order
    candidates.sort(key=operator.itemgetter('score'), reverse=True)

    #now just return the sorted results, keyed by rank
    return dict(enumerate(candidates))

In [50]:
#Run the search: degrees 2-7, up to 10 PCA components, ten C values log-spaced from 1e-4 to 1e6
models = find_optimal_features(
    df_scaled, 'class', features,
    max_input_dimension=10,
    degree_list=[2, 3, 4, 5, 6, 7],
    reg_list=np.logspace(-4, 6, 10),
)

In [51]:
#Each model is accessed by an index into models, and the indices are sorted by score:
#models[0] always has the best score, models[1] the next best, and so on.
#Each result field is then accessed by name.
#As the forthcoming cells show, this keeps things simple and readable.
print(models[0]['score'])
print(models[0]['clf'])
print(models[0]['indim'])

0.986666666667
LogisticRegression(C=2.7825594022071258, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)
7


In [52]:
#Let's examine the top ten models: the input dimension, the polynomial degree and the regularizer
for i in range(10):
    print("model number {:1d}".format(i))
    #one summary line per model, fields separated by single spaces
    print(" ".join([
        "score = {:8.5}".format(models[i]['score']),
        "In Dim = {:3d}".format(models[i]['indim']),
        "Deg = {:2d}".format(models[i]['deg']),
        "C = {:7.4f}".format(models[i]['reg']),
    ]))
    print("\n")

model number 0
score =  0.98667 In Dim =   7 Deg =  3 C =  2.7826


model number 1
score =  0.97333 In Dim =   6 Deg =  2 C =  0.2154


model number 2
score =  0.97333 In Dim =   7 Deg =  2 C =  0.2154


model number 3
score =  0.97333 In Dim =   8 Deg =  2 C =  0.2154


model number 4
score =  0.97333 In Dim =   9 Deg =  2 C =  0.2154


model number 5
score =  0.97333 In Dim =   9 Deg =  2 C =  2.7826


model number 6
score =  0.97333 In Dim =  10 Deg =  2 C =  0.2154


model number 7
score =  0.97333 In Dim =  10 Deg =  2 C =  2.7826


model number 8
score =  0.97333 In Dim =   5 Deg =  3 C = 35.9381


model number 9
score =  0.97333 In Dim =   5 Deg =  3 C = 464.1589




In [53]:
#But we are not done yet: the score alone hides which classes get confused,
#so look at the confusion matrix of each of the five best models

for i in range(5):
    print("----------------------------------------")
    print("Model score = {:5.5f}".format(
        models[i]['clf'].score(models[i]['xtest'], models[i]['ytest'])))
    cm = my_confusion_matrix(
        models[i]['clf'].predict(models[i]['xtest']),
        models[i]['ytest'],
        iris.target_names)
    print(cm)
    print("----------------------------------------\n")


----------------------------------------
Model score = 0.98667
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           1         24
----------------------------------------

----------------------------------------
Model score = 0.97333
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           2         23
----------------------------------------

----------------------------------------
Model score = 0.97333
Prediction  setosa  versicolor  virginica
Actual                                   
setosa          25           0          0
versicolor       0          25          0
virginica        0           2         23
----------------------------------------

----------------------------------------
Model score = 

In [40]:
#Using the best model let's predict something new.
#The values must be given in the same order as `features`
#(petal_length, petal_width, sepal_length, sepal_width in the Index printed earlier).
new_input = [1.5, 0.45, 8.2, 3.7]

#scikit-learn transformers and estimators work on 2-D (n_samples, n_features)
#arrays, so present the single observation as one row
new_input_2d = np.asarray(new_input).reshape(1, -1)

#input scaling first
new_input_scaled = scaler.transform(new_input_2d)

#now the polynomial features
p_input = models[0]['poly'].transform(new_input_scaled)

#and now the PCA
new_X_transform = models[0]['pca'].transform(p_input)

#and now predict using the LR model
pred = models[0]['clf'].predict(new_X_transform)


print("This data is predicted to belong to class {:d} {}".format(
    pred[0], iris.target_names[pred[0]]))

prob = models[0]['clf'].predict_proba(new_X_transform)
print("...with the following probabilities:\n")
for i in range(3):
    print("Probability of belonging to the class {:11s} = {:5.2f}".format(iris.target_names[i],prob.ravel()[i]))

This data is predicted to belong to class 0 setosa
...with the following probabilities:

Probability of belonging to the class setosa      =  0.41
Probability of belonging to the class versicolor  =  0.29
Probability of belonging to the class virginica   =  0.30
