# Daniel Lichter HW4: Judging Flowers

In [1]:
# Imports and pip installations (if needed)
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# Part 1: Load the dataset

In [2]:
# Load the dataset (load remotely, not locally)
iris_data = load_iris()
#converting the iris_data into a useable Pandas DataFrame
iris_df = pd.DataFrame(
  data=np.append(
    iris_data['data'], 
    np.array(iris_data['target']).reshape(len(iris_data['target']), 1), 
    axis=1),
  columns=np.append(iris_data['feature_names'], ['species']) 
)

In [3]:
# Output the first 15 rows of the data
iris_df.head(15)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,5.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0


In [4]:
# Display a summary of the table information (number of datapoints, etc.)
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## About the dataset

Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

For the iris flower dataset, our features are sepal length and width, as well as petal length and width. All of the features are measurements in centimeters. The label for the data is the species of each flower. In the dataset, we denote each of the labels by 0, 1, and 2. The 0,1,2 values of the species correspond to setosa, versicolor, and virginica flowers, respectively. We have 50 data points for each of the three species, giving a total of 150 rows of data.

In [5]:
print(iris_data['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [6]:
print(iris_data['feature_names'])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [7]:
print(iris_data['target_names'])

['setosa' 'versicolor' 'virginica']


In [8]:
print(iris_data['target'])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# Part 2: Split the dataset into train and test

In [9]:
#Showing columns of iris_df
iris_df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')

In [10]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [11]:
# Take the dataset and split it into our features (X) and label (y)
#Giving X and y the names of their columns
X = iris_data['feature_names']
y = 'species'

Here I am normalizing the data to make the weights of each class more even throughout the dataset.

In [12]:
minMax = MinMaxScaler()
x_scaled = pd.DataFrame(minMax.fit_transform(iris_df[X]), columns = iris_data['feature_names'])
x_scaled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [13]:
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, iris_df[y], test_size=0.1, stratify = iris_df[y], random_state = 0)
#stratified the data so that we get an equal amount of each label
print(y_train.groupby(y_train).count())
print(y_test.groupby(y_test).count())

species
0.0    45
1.0    45
2.0    45
Name: species, dtype: int64
species
0.0    5
1.0    5
2.0    5
Name: species, dtype: int64


In [14]:
#Showing the shape of the split sets to show 90/10 split
#Since there are 150 rows in our dataset, we expect the training size to have 135 rows and the test to have 15
print("X_train size:", X_train.shape, "X_test size:", X_test.shape)
print("y_train size:", y_train.shape, "y_test size:", y_test.shape)

X_train size: (135, 4) X_test size: (15, 4)
y_train size: (135,) y_test size: (15,)


# Part 3: Logisitic Regression

In [15]:
# i. Use sklearn to train a LogisticRegression model on the training set
logReg = LogisticRegression()
logReg.fit(X_train, y_train)

LogisticRegression()

In [16]:
# ii. For a sample datapoint, predict the probabilities for each possible class
sampleData = logReg.predict_proba(X_test.sample(1))
    #np.array([[5.02,2.96,3.5,1.7]]))
print(sampleData)
print("Probability to be ____:", sampleData[0][0])
print("Probability to be ____:", sampleData[0][1])
print("Probability to be ____:", sampleData[0][2])

[[0.87171323 0.12410653 0.00418024]]
Probability to be ____: 0.8717132294956871
Probability to be ____: 0.1241065342473825
Probability to be ____: 0.004180236256930342


In [17]:
# iii. Report on the score for Logistic regression model, what does the score measure?
logReg.score(X_test, y_test)

0.8

Score measures the accuracy of the predictions by $\dfrac{number\ correctly\ predicted\ values}{number\ actual\ values}$ 

In [18]:
# iv. Extract the coefficents and intercepts for the boundary line(s)
print("Coeffecients for the boundary lines:")
print(logReg.coef_,"\n")
print("Intercepts for the boundary lines:")
print(logReg.intercept_)

Coeffecients for the boundary lines:
[[-1.3208138   1.49799229 -2.72951132 -2.66867386]
 [ 0.16777707 -1.46233447  0.51703969 -0.47813947]
 [ 1.15303674 -0.03565782  2.21247162  3.14681333]] 

Intercepts for the boundary lines:
[ 2.08712233  1.18253889 -3.26966122]


# Part 4: Support Vector Machine

In [31]:
# i. Use sklearn to train a Support Vector Classifier on the training set
svClassifier = svm.SVC(probability=True)
svClassifier.fit(X_train, y_train)

SVC(probability=True)

In [32]:
# ii. For a sample datapoint, predict the probabilities for each possible class
svmSampleData = svClassifier.predict_proba(X_test.sample(1))
    #np.array([[5.02,2.96,3.5,1.7]]))
print(svmSampleData)
print("Probability to be ____:", svmSampleData[0][0])
print("Probability to be ____:", svmSampleData[0][1])
print("Probability to be ____:", svmSampleData[0][2])

[[0.00664944 0.98246343 0.01088713]]
Probability to be ____: 0.006649435797663025
Probability to be ____: 0.9824634304209143
Probability to be ____: 0.010887133781422658


In [33]:
# iii. Report on the score for the SVM, what does the score measure?
svClassifier.score(X_test,y_test)

0.9333333333333333

Score measures the accuracy of the predictions by $\dfrac{number\ correctly\ predicted\ values}{number\ actual\ values}$ 

# Part 5: Neural Network

In [22]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
nn = MLPClassifier()
nn.fit(X_train, y_train)



MLPClassifier()

In [23]:
# ii. For a sample datapoint, predict the probabilities for each possible class
nnSampleData = nn.predict_proba(X_test.sample(1))
print(nnSampleData)
print("Probability to be ____:", nnSampleData[0][0])
print("Probability to be ____:", nnSampleData[0][1])
print("Probability to be ____:", nnSampleData[0][2])

[[0.0082838  0.29456559 0.69715061]]
Probability to be ____: 0.008283803577664034
Probability to be ____: 0.29456558975404545
Probability to be ____: 0.6971506066682905


In [24]:
# iii. Report on the score for the Neural Network, what does the score measure?
nn.score(X_test,y_test)

0.8666666666666667

Score measures the accuracy of the predictions by $\dfrac{number\ correctly\ predicted\ values}{number\ actual\ values}$ 

In [30]:
# iv: Experiment with different options for the neural network, 
#report on your best configuration (the highest score I was able to achieve was 0.8666)
nn = MLPClassifier(solver = 'sgd', nesterovs_momentum=False)
nn.fit(X_train, y_train)
print('Stochastic Gradient Descent solver:', nn.score(X_test,y_test))

nn = MLPClassifier(learning_rate='adaptive', solver = 'sgd', max_iter= 500, early_stopping=True, 
                   hidden_layer_sizes= (1000,))
nn.fit(X_train, y_train)
print(nn.score(X_test,y_test))

nn = MLPClassifier(solver = 'adam', shuffle = False)
nn.fit(X_train, y_train)
print(nn.score(X_test,y_test))

nn = MLPClassifier(solver = 'lbfgs', max_fun=1000, max_iter=1000, tol=.001)
nn.fit(X_train, y_train)
print(nn.score(X_test,y_test))



Stochastic Gradient Descent solver: 0.6666666666666666
0.3333333333333333
0.8666666666666667
0.9333333333333333




# Part 6: K-Nearest Neighbors

In [26]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
knc = KNeighborsClassifier(weights='distance')
knc.fit(X_train, y_train)

KNeighborsClassifier(weights='distance')

In [27]:
# ii. For a sample datapoint, predict the probabilities for each possible class
kncSampleData = nn.predict_proba(X_test.sample(1))
print(nnSampleData)
print("Probability to be ____:", kncSampleData[0][0])
print("Probability to be ____:", kncSampleData[0][1])
print("Probability to be ____:", kncSampleData[0][2])

[[0.0082838  0.29456559 0.69715061]]
Probability to be ____: 1.4802329795920273e-53
Probability to be ____: 4.2121510554073656e-07
Probability to be ____: 0.9999995787848945


In [28]:
# iii. Report on the score for kNN, what does the score measure?
knc.score(X_test, y_test)

0.9333333333333333

Score measures the accuracy of the predictions by $\dfrac{number\ correctly\ predicted\ values}{number\ actual\ values}$ 

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?

After running the four models, we got accuracy scores ranging from 0.8 to 0.9333. The SVM and KNN models both had scores of 0.9333, the Neural Net model had an accuracy of 0.867, and the logistic regression model had an accuracy of 0.8 . Regardless of the scores that we got, I think that the outcome of these models are limited due to the small sample size of the iris dataset. I think if we used a larger dataset, the neural network would have performed the best. I think the KNN model performed the best because the features used to make the classification, generally had similar flowers next to each other due to the small range of the standard deviation. 