In [1]:
# - Spot-checking is a way of discovering which algorithms perform well on your machine learning problem. 

# - You cannot know which algorithms are best suited to your problem beforehand.

In [2]:
# 11.1 Algorithm Spot Checking

In [3]:
# - You must use trial and error to discover a shortlist of algorithms that do well on your problem that you can 
# then double down on and tune further. This process is spot-checking.

# - Below are some suggestions when spot-checking algorithms on your dataset:
    # - Try a mixture of algorithm representations (e.g. instances and trees).
    # - Try a mixture of learning algorithms (e.g. different algorithms for learning the same type of representation).
    # - Try a mixture of modeling types (e.g. linear and nonlinear functions or parametric and nonparametric).

In [4]:
# 11.2 Algorithms Overview

In [5]:
# - Linear Machine Learning Algorithms (for classification)
    # - Logistic Regression.
    # - Linear Discriminant Analysis.
    
# - Non-Linear Machine Learning Algorithms (for classification)
    # - k-Nearest Neighbors.
    # - Naive Bayes.
    # - Classification and Regression Trees.
    # - Support Vector Machines.

In [6]:
from pandas import read_csv

In [7]:
import numpy

In [8]:
import sys

In [9]:
def print_data(_data, _rows=5):
    """Write the leading rows of an array to standard output.

    Every value is rendered with the ``%5.3f`` format; columns are separated
    by a single space (the ``numpy.savetxt`` default delimiter).

    Parameters
    ----------
    _data : array-like
        One- or two-dimensional numeric data. A 1-D input is printed one
        value per line.
    _rows : int, optional
        How many leading rows to show (default 5).

    Returns
    -------
    None
        The text goes to ``sys.stdout``; nothing is returned.
    """
    # Slice along the first axis only: ``[:n]`` selects the same rows as
    # ``[:n, :]`` for 2-D input but does not raise IndexError for 1-D input.
    _head = numpy.asarray(_data)[:_rows]
    numpy.savetxt(sys.stdout, _head, fmt='%5.3f')

In [10]:
# Remote location of the CSV that read_csv loads below.
# NOTE(review): hard-coded third-party URL — confirm it is still served, or
# point this at a vendored copy so the notebook survives Restart & Run All.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [11]:
# Labels for the nine CSV columns, handed to read_csv through `names=`.
# The first eight columns are later sliced off as the feature matrix (_X);
# the last one, 'class', becomes the target vector (_Y).
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [12]:
# Load the CSV into a DataFrame, labelling its columns explicitly.
_dataframe = read_csv(filepath_or_buffer=_uri, names=_col_names)

In [13]:
# Drop the pandas wrapper: `.values` exposes the underlying NumPy array so the
# feature/target columns can be selected by position.
_array = _dataframe.values

In [14]:
# Sanity check: preview the leading rows of the full array (features + label).
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [15]:
# Feature matrix: every row, first eight columns (everything except the label).
_X = _array[:, :8]

In [16]:
# Preview the leading rows of the feature matrix.
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [17]:
# Target column. The open-ended slice `8:` (rather than the index `8`) keeps
# the result two-dimensional, i.e. a single-column array; it is flattened to
# 1-D a few cells further down.
_Y = _array[:,8:]

In [18]:
# Preview the leading rows of the (still 2-D) target column.
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [19]:
# Flatten the single-column label array into a 1-D vector before it is
# handed to cross_val_score.
_Y = _Y.ravel()

In [20]:
# Confirm the labels are now a flat vector (first five entries).
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [21]:
from sklearn.model_selection import KFold

In [22]:
from sklearn.model_selection import cross_val_score

In [23]:
# Shared 10-fold splitter used to score every model below.
# No `random_state` is passed: without `shuffle=True` the folds are taken in
# file order, so a seed has no effect, and scikit-learn 0.24+ raises
# ValueError when `random_state` is set while shuffling is off.
_kfold = KFold(n_splits=10)

In [24]:
# Metric name passed to cross_val_score's `scoring=` for every model below.
_scoring = 'accuracy'

In [25]:
# 11.3 Linear Machine Learning Algorithms

In [26]:
# 11.3.1 Logistic Regression

In [27]:
# - Logistic regression assumes a Gaussian distribution for the numeric input variables 
# and can model binary classification problems.

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Pin the solver explicitly: scikit-learn 0.22 changed the default from
# 'liblinear' to 'lbfgs', so relying on the default makes the score below
# depend on the installed version. 'liblinear' was the default in use when
# the displayed output was recorded.
_model = LogisticRegression(solver='liblinear')

In [30]:
# Score the logistic-regression model with the shared splitter and metric;
# the result holds one value per fold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [31]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'76.951%'

In [32]:
# 11.3.2 Linear Discriminant Analysis

In [33]:
# - Linear Discriminant Analysis or LDA is a statistical technique for binary and multiclass classification. 

# - It too assumes a Gaussian distribution for the numerical input variables

In [34]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [35]:
# LDA estimator with library-default settings; rebinds `_model`, replacing
# the estimator used in the previous section.
_model = LinearDiscriminantAnalysis()

In [36]:
# Score the LDA model with the shared splitter and metric (one value per fold).
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [37]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'77.346%'

In [38]:
# 11.4 Nonlinear Machine Learning Algorithms

In [39]:
# 11.4.1 k-Nearest Neighbors

In [40]:
# - The k-Nearest Neighbors algorithm (or KNN) uses a distance metric to find the k most similar instances 
# in the training data for a new instance and takes the majority class of the neighbors as the prediction 
# (for regression problems, the mean outcome of the neighbors is used instead).

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
# k-NN classifier with library-default settings; rebinds `_model`.
_model = KNeighborsClassifier()

In [43]:
# Score the k-NN model with the shared splitter and metric (one value per fold).
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [44]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'72.656%'

In [45]:
# 11.4.2 Naive Bayes

In [46]:
# - Naive Bayes calculates the probability of each class and the conditional probability of each 
# input value given each class. 

# - These probabilities are estimated for new data and multiplied together, assuming that they are all 
# independent (a simple or naive assumption). 

# - When working with real-valued data, a Gaussian distribution is assumed to easily estimate the 
# probabilities for input variables using the Gaussian Probability Density Function. 

In [47]:
from sklearn.naive_bayes import GaussianNB

In [48]:
# Gaussian Naive Bayes estimator with library-default settings; rebinds `_model`.
_model = GaussianNB()

In [49]:
# Score the Naive Bayes model with the shared splitter and metric (one value per fold).
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [50]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'75.518%'

In [51]:
# 11.4.3 Classification and Regression Trees

In [52]:
# - Classification and Regression Trees (CART or just decision trees) construct a binary tree from the 
# training data. 

# - Split points are chosen greedily by evaluating each attribute and each value of each attribute in 
# the training data in order to minimize a cost function (like the Gini index).

In [53]:
from sklearn.tree import DecisionTreeClassifier

In [54]:
# Seed the tree so the cross-validated score is reproducible: scikit-learn
# randomly permutes the candidate features at each split, so an unseeded tree
# can give a different score on every run.
# NOTE(review): with a fixed seed the score may differ slightly from the
# value recorded from an earlier unseeded run — re-run to refresh the output.
_model = DecisionTreeClassifier(random_state=7)

In [55]:
# Score the decision-tree model with the shared splitter and metric (one value per fold).
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [56]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'69.000%'

In [57]:
# 11.4.4 Support Vector Machines

In [58]:
# - Support Vector Machines (or SVM) seek a line that best separates two classes. 

# - Those data instances that are closest to the line that best separates the classes 
# are called support vectors and influence where the line is placed. 

# - SVM has been extended to support multiple classes. 

# - Of particular importance is the use of different kernel functions via the kernel 
# parameter. 

# - A powerful Radial Basis Function is used by default.

In [59]:
from sklearn.svm import SVC

In [60]:
# Pin gamma explicitly: scikit-learn 0.22 changed the default from 'auto' to
# 'scale', so relying on the default makes the score below depend on the
# installed version. 'auto' was the default in use when the displayed output
# was recorded.
# NOTE(review): gamma='scale' and/or standardised inputs may score noticeably
# better on this data — confirm which configuration is intended.
_model = SVC(gamma='auto')

In [61]:
# Score the SVM model with the shared splitter and metric (one value per fold).
_score = cross_val_score(estimator=_model, X=_X, y=_Y, scoring=_scoring, cv=_kfold)

In [62]:
# Mean score across folds, shown as a percentage with three decimals.
format(_score.mean(), '.3%')

'65.103%'