In [1]:

#  Define and load everything we need so that this
#  tutorial is complete by itself.

import scipy as sp 
import numpy as np 
import matplotlib as mpl 
import pandas as pd 
import sklearn 

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#  Enable inline plotting from matplotlib

%matplotlib inline

In [2]:

#  Location of the raw Iris data file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

#  Column names to assign to the loaded data, in file order:
#  four measurements followed by the class label.
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'species',
]

#  Read the CSV straight from the URL into a DataFrame using the names above
iris = pd.read_csv(url, names=names)

In [3]:
#  Dimensions of the dataset as (rows, columns)
print(iris.shape)

In [3]:
#  Preview the first few rows to check that the data loaded as expected
print(iris.head())

In [3]:
#  Summary statistics for the numeric columns
print(iris.describe())
#  Next cell: use the groupby method to determine the class distribution

In [4]:
#  Group the rows by species and count each group to see the class distribution
print(iris.groupby('species').size())

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [5]:
#  scikit-learn estimators are fed numpy arrays below, so extract the
#  DataFrame's contents as a plain ndarray (all columns, species included).
array = iris.to_numpy()
print (type(array))

<class 'numpy.ndarray'>


In [None]:
#  From the charts, it appears that Iris-setosa can be separated by a straight line.
#  We will test machine learning of one-versus-others by defining the target to
#  be +1 for Iris-setosa, 0 for all others.
#  Use all rows, column 4 (the species label), to create vector raw_target.
raw_target = array[:,4]
#  Use all rows of raw_target to create the y array,
#  setting the rows that are Iris-setosa to +1, all others to 0.
#  The label compared here must be the class named above; it previously
#  tested for 'Iris-virginica', which contradicted the stated target.
y = np.where(raw_target == 'Iris-setosa', 1, 0)
print (y)

In [None]:

#  Split the data into feature variables, X, and target variable, y
#  Feature variables are in the first four columns -- indexes 0,1,2,3
#  Target variable is in column 4

#  Note the use of upper case X and lower case y.
#  This is the convention used to distinguish between matrices, X, 
#  and vectors, y, as the expression describing the model is AX = y.

#  Use all rows, columns 0, 1, 2, 3.
#  `array` also holds the species strings, so a plain slice of it keeps
#  dtype=object; cast to float so X is a genuine numeric matrix.
X = array[:,0:4].astype(float)

#  Expect X to be 150 rows, 4 columns
print (type(X))
print ('Dimensions of X: ', X.shape)

#  We converted the labels to 0/1 integers in the cell above
#  Expect y to be a 1-D vector of 150 elements, i.e. shape (150,)
print (type(y))
print ('Dimensions of y: ', y.shape)

In [None]:

#  Partition the rows at random into a training set and a held-out test set.
#  20% of the rows go to the test set; the fixed seed makes the split repeatable.

test_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=test_size, random_state=seed
)

#  Report the shape of every partition, then a preview of the first 10 rows of each
for label, part in (
    ('Dimensions of X_train: ', X_train),
    ('Dimensions of X_test:  ', X_test),
    ('Dimensions of y_train: ', y_train),
    ('Dimensions of y_test:  ', y_test),
):
    print (label, part.shape)

for label, part in (
    ('Head of X_train: \n', X_train),
    ('Head of X_test:  \n', X_test),
    ('Head of y_train: ', y_train),
    ('Head of y_test:  ', y_test),
):
    print (label, part[:10])

In [None]:

#  Fit a logistic regression classifier on the training rows and
#  predict the 0/1 target for the held-out test rows.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

#  confusion_matrix takes (y_true, y_pred): rows are the actual class and
#  columns the predicted class.  Passing them the other way round transposes
#  the matrix, which swaps false positives and false negatives in the table.
#  labels=[0, 1] pins the layout to 2x2 even if one class is missing from
#  the test split, so the cm[1, ...] lookups below cannot go out of range.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print ('                        Predicted')
print ('                   Positive','Negative')
#  cm[actual, predicted]: first column printed is predicted 1, second is predicted 0
print ('Actually Positive (1): ', cm[1,1], '    ', cm[1,0])
print ('Actually Negative (0): ', cm[0,1], '    ', cm[0,0])