In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data

In [2]:
# Column labels in file order; column 'class' is the one later used as the target.
names = "preg plas pres skin test mass pedi age class".split()
filename = "pima-indians-diabetes.data.csv"

# header=None: every line of the file is read as data and labelled with `names`.
data = pd.read_csv(filename, header=None, names=names)

data.head(20)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(768, 9)

In [4]:
# Per-column dtypes as inferred by read_csv (no explicit dtype was requested).
data.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object

### Separate dataset

In [5]:
# Replace the DataFrame with its underlying NumPy array so columns can be
# sliced by position. np.asarray is used rather than `.values` so that this
# cell is safe to re-run: once `data` is already an ndarray, `.values` raises
# AttributeError, whereas np.asarray returns the array unchanged.
data = np.asarray(data)

In [6]:
# Columns 0..7 are the input features; column 8 is the label.
X, Y = data[:, :8], data[:, 8]

### Split dataset

In [7]:
from sklearn.model_selection import KFold

# 10 contiguous, unshuffled folds shared by every model evaluated in this notebook.
# `random_state` is deliberately not passed: it only has an effect when
# shuffle=True, and recent scikit-learn releases raise ValueError if it is set
# while shuffle is left at its default (False). The folds produced here are the
# same ones the unshuffled splitter always produced.
# To randomise fold membership reproducibly, use
# KFold(n_splits=10, shuffle=True, random_state=7) instead.
kfold = KFold(n_splits=10)

## Linear Algorithms

### Logistic Regression
-  assumes a Gaussian distribution for the numeric input variables and can model binary classification problems

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with every hyperparameter left at the library default.
# Rebinds the notebook-wide name `model`, which the next cell evaluates.
model = LogisticRegression()

In [9]:
from sklearn.model_selection import cross_val_score

# One score per fold of `kfold` for the current `model`, reported as their mean.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.7695146958304853


### LDA - Linear Discriminant Analysis
-  a statistical technique for binary and multiclass classification
-  assumes a Gaussian distribution for the numerical input variables

In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with every hyperparameter left at the library default.
# Rebinds the notebook-wide name `model`, which the next cell evaluates.
model = LinearDiscriminantAnalysis()

In [11]:
from sklearn.model_selection import cross_val_score

# Evaluate the current `model` fold by fold with `kfold`; print the average score.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.773462064251538


## Non Linear Algorithms

### k-Nearest Neighbors
-  uses a distance metric to find the k most similar instances in the training data for a new instance and, for classification, takes the most common class among those neighbors as the prediction (for regression, the mean outcome of the neighbors is used instead)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with every hyperparameter (including k) left
# at the library default. Rebinds the notebook-wide name `model`.
# NOTE(review): X is passed to this distance-based model without any feature
# scaling in the visible cells — confirm that is intended.
model = KNeighborsClassifier()

In [13]:
from sklearn.model_selection import cross_val_score

# Per-fold scores of the current `model` over `kfold`, summarised by their mean.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.7265550239234451


### Naive Bayes
-  calculates the probability of each class and the conditional probability of each input value given each class
-  These probabilities are estimated for new data and multiplied together, assuming that they are all independent (a simple or naive assumption)
-  When working with real-valued data, a Gaussian distribution is assumed to easily estimate the probabilities for input variables using the Gaussian Probability Density Function

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with every hyperparameter left at the library default.
# Rebinds the notebook-wide name `model`, which the next cell evaluates.
model = GaussianNB()

In [15]:
from sklearn.model_selection import cross_val_score

# Score the current `model` on each fold of `kfold` and print the mean of those scores.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.7551777170198223


### Classification and Regression Trees
- construct a binary tree from the training data
-  Split points are chosen greedily by evaluating each attribute and each value of each attribute in the training data in order to minimize a cost function

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters, except for a fixed seed.
# Without `random_state` the fitted tree can differ from run to run (features
# are permuted randomly when searching for splits, so ties between equally good
# splits are broken differently), which makes the cross-validated score
# non-reproducible. Seeding it makes Restart & Run All give the same number
# every time; that number may differ slightly from scores of earlier unseeded runs.
# Rebinds the notebook-wide name `model`, which the next cell evaluates.
model = DecisionTreeClassifier(random_state=7)

In [17]:
from sklearn.model_selection import cross_val_score

# Run the current `model` through every split of `kfold`; show the mean fold score.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.6900205058099795


### Support Vector Machines
- seek a line that best separates two classes
-  data instances that are closest to the line that best separates the classes are called support vectors and influence where the line is placed

In [18]:
from sklearn.svm import SVC

# Support vector classifier with every hyperparameter (kernel included) left at
# the library default. Rebinds the notebook-wide name `model`.
# NOTE(review): X is passed in unscaled; the comparatively low score recorded
# for this model may be a consequence of that — confirm before drawing conclusions.
model = SVC()

In [19]:
from sklearn.model_selection import cross_val_score

# Fold-wise evaluation of the current `model` using `kfold`; the mean score is printed.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.6510252904989747
