In [2]:
from sklearn import datasets, metrics
from sklearn.cross_validation import train_test_split 
import pandas as pd
import numpy as np
from scipy import stats 

# Loading the data
Load the data into a pandas dataframe and split it into three different classes based on the three different outcomes (types of flowers): 
- `setosa`: 0
- `versicolor`: 1,
- `virginica`: 2

In [4]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
outcomes = iris.target_names
features = iris.feature_names

# Construct a dataframe for managing the dataset
df = pd.DataFrame(np.c_[iris['data'], iris['target']],
                     columns=iris['feature_names'] + ['class'])

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
# Group training items by class
class_1 = df[df['class'] == 0]
class_2 = df[df['class'] == 1]
class_3 = df[df['class'] == 2]

# Print the shape
print(class_1.shape)
print(class_2.shape)
print(class_3.shape)

(50, 5)
(50, 5)
(50, 5)


In [6]:
# Helper function to split the data into train, test and validation sets
# with equal numbers of outcomes (0, 1, 2) in each
def split_data():
    test_size = 0.5
    columns = features.append('class')
    train = pd.DataFrame(columns=columns)
    test = pd.DataFrame(columns=columns)
    val = pd.DataFrame(columns=columns)
    
    for data in [class_1, class_2, class_3]:
        X = data.drop(['class'], axis=1)
        y = data['class']
        
        # Split the first time for train and (test + validation)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)

        # Split the test set into test and validation sets
        X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=test_size, random_state=1)
        
        train = pd.concat([train, pd.concat([X_train, y_train], axis=1)], ignore_index=True)
        test = pd.concat([test, pd.concat([X_test, y_test], axis=1)], ignore_index=True) 
        val = pd.concat([val, pd.concat([X_val, y_val], axis=1)], ignore_index=True) 
        
    return train, test, val

In [7]:
train, test, val = split_data()

print("Train:", train.shape)
print("Test:", test.shape)
print("Validation:", val.shape)

print("\nNumber of values per class in train set")
print(train['class'].value_counts())

print("\nNumber of values per class in test set")
print(test['class'].value_counts())

print("\nNumber of values per class in validation set")
print(val['class'].value_counts())

Train: (75, 5)
Test: (36, 5)
Validation: (39, 5)

Number of values per class in train set
2.0    25
1.0    25
0.0    25
Name: class, dtype: int64

Number of values per class in test set
2.0    12
1.0    12
0.0    12
Name: class, dtype: int64

Number of values per class in validation set
2.0    13
1.0    13
0.0    13
Name: class, dtype: int64


# Training the classifiers
- Train the classifiers 
- Get the accuracy of the predictions using the validation set
- Choose the best performing classifier based on the accuracy score and the chi squared effect size

First get the `train`, `test` and `validation` sets:

In [8]:
X_train = train.drop(['class'], axis=1).values
y_train = train['class'].values

X_test = test.drop(['class'], axis=1).values
y_test = test['class'].values

X_val = val.drop(['class'], axis=1).values
y_val = val['class'].values

In [9]:
def fit_predict(classifier, X):
    """
    Helper function for fitting the classifier and making
    prediction based on set X 
    """
    
    # Train the model using the training sets
    classifier.fit(X_train, y_train)  

    # Predict the response for test dataset
    return classifier.predict(X)  
    
def get_accuracy():
    """
    Helper function for getting accuracy metrics for 
    the model
    """
    
    chi = stats.chi2_contingency(pd.crosstab(index=y_val, columns=y_pred))
    effect = np.sqrt(chi[0]/(len(y_val)*2))

    print("Chi-squared effect:", effect)
    print("Sklearn accuracy:", metrics.accuracy_score(y_val, y_pred))
    print("\nConfusion matrix (predicted vs actual)")
    print(pd.DataFrame(np.c_[metrics.confusion_matrix(y_val, y_pred)]))  

## Support Vector Machine
- Create a Support Vector Machine classifier
- Train the SVM against the `training` set
- Get the accuracy of the model by comparing the predicted results to the actual results using the `validation` set

In [10]:
from sklearn.svm import SVC

# Create an svm Classifier
classifier = SVC(kernel='linear') # Linear Kernel

# Train the model and predict against the validation set
y_pred = fit_predict(classifier, X_val)  

# Get accuracy results
get_accuracy()

Chi-squared effect: 0.9309493362512629
Sklearn accuracy: 0.9487179487179487

Confusion matrix (predicted vs actual)
    0   1   2
0  13   0   0
1   0  13   0
2   0   2  11


## Decision Tree classifier
- Create a Decision Tree classifier classifier
- Train the decision tree against the `training` set
- Get the accuracy of the model by comparing the predicted results to the actual results using the `validation` set

In [11]:
from sklearn.tree import DecisionTreeClassifier  

classifier = DecisionTreeClassifier()  

# Train the model and predict against the validation set
y_pred = fit_predict(classifier, X_val)  

# Get accuracy results
get_accuracy()

Chi-squared effect: 0.9013878188659973
Sklearn accuracy: 0.9230769230769231

Confusion matrix (predicted vs actual)
    0   1   2
0  13   0   0
1   0  13   0
2   0   3  10


## Naive Bayes classifier
- Create a Multinomial Naive Bayes classifier and a Guassian Naive Bayes classifier
- Train the classifiers against the `training` set
- Get the accuracy of the models by comparing the predicted results to the actual results using the `validation` set

In [12]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

# Train the model and predict against the validation set
y_pred = fit_predict(classifier, X_val)  

# Get accuracy results
get_accuracy()

Chi-squared effect: 0.9636241116594315
Sklearn accuracy: 0.9743589743589743

Confusion matrix (predicted vs actual)
    0   1   2
0  13   0   0
1   0  13   0
2   0   1  12


In [13]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

# Train the model and predict against the validation set
y_pred = fit_predict(classifier, X_val)  

# Get accuracy results
get_accuracy()

Chi-squared effect: 0.9636241116594315
Sklearn accuracy: 0.9743589743589743

Confusion matrix (predicted vs actual)
    0   1   2
0  13   0   0
1   0  13   0
2   0   1  12


# Naive Bayes Classifier test results
- Create a dataframe from the actual `y_test` values and the predicted (`y_pred`) values. 
- A new column is added which contains `1` for correctly predicted results, and `0` for incorrectly predicted results.
- The mean and standard deviation of this column will be used to calculate the accuracy and confidence intervals of the classifier. 

In [14]:
classifier = GaussianNB()

# Predict the response for test dataset
y_pred = fit_predict(classifier, X_test)  

# Create a dataframe from predicted vs actual
df = pd.DataFrame(np.c_[y_pred, y_test],
                     columns=['predicted', 'actual'])

# Create a new column for (predicted == actual) ? 1 : 0
df['new'] = np.where(df['predicted'] == df['actual'], 1, 0)

df

Unnamed: 0,predicted,actual,new
0,0.0,0.0,1
1,0.0,0.0,1
2,0.0,0.0,1
3,0.0,0.0,1
4,0.0,0.0,1
5,0.0,0.0,1
6,0.0,0.0,1
7,0.0,0.0,1
8,0.0,0.0,1
9,0.0,0.0,1


## Confidence interval
The 95% confidence interval defines a range of values that we can be 95% certain the accuracy for new test sets will fall between. In this case, the confidence interval tells us that we can be 95% certain that the classifier will always be between 91.8% and 100% accurate, given any new data set.

The formula used is:

$confidence = \overline{X} \pm t * \frac{s}{\sqrt(n)}$

Where `t` = 2.021.

The value for `t` is taken from the `t-distribution` table, based on the sample size (the length of the test set, which is 39), and the confidence level (95%).

In [15]:
# Calculate confidence intervals
print("Mean:", df['new'].mean())
print("Standard deviation:", df['new'].std())

sample_size = len(y_val)
error_margin = 2.021 * df['new'].std() / np.sqrt(sample_size)

conf_pos = df['new'].mean() + error_margin
conf_neg = df['new'].mean() - error_margin

print("\nConfidence between", conf_neg, "and", conf_pos, "(1.0)")

Mean: 0.9722222222222222
Standard deviation: 0.16666666666666657

Confidence between 0.9182857224155421 and 1.0261587220289023 (1.0)
