# Binary Classifier - the Iris Dataset

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    file = fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(file, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
import pandas as pd
# data from https://archive.ics.uci.edu/ml/machine-learning-databases/iris/
def load_iris_data():
    csv_file =  'iris.data'
    return pd.read_csv(csv_file, sep = ',', header = None)

In [3]:
iris = load_iris_data()
iris.head()

FileNotFoundError: ignored

In [None]:
iris[4].replace({'Iris-setosa': 0, 'Iris-virginica': 0, 'Iris-versicolor' : 1}, inplace = True)
iris

In [None]:
# to make this notebook's output identical at every run
np.random.seed(42)

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(iris, test_size=0.2, random_state=42)

In [None]:
test_set.head()

In [None]:
iris = train_set.drop(4, axis=1) # drop labels for training set
iris_labels = train_set[4].copy()

In [None]:
train_set.head()

In [None]:
iris.head()

In [None]:
iris_labels

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
num_attribs = list(iris)

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
    ])

iris_prepared = full_pipeline.fit_transform(iris)

In [None]:
iris_prepared

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42)
sgd_clf = SGDClassifier(random_state=42)

sgd_clf.fit(iris_prepared, iris_labels)
rf_clf.fit(iris_prepared, iris_labels)

In [None]:
some_data = iris.iloc[:10]
some_labels = iris_labels[:10]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", sgd_clf.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, iris_prepared, iris_labels, cv=3, scoring="accuracy")

In [None]:
from sklearn.model_selection import cross_val_predict

iris_train_predictions = cross_val_predict(sgd_clf, iris_prepared, iris_labels, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(iris_labels, iris_train_predictions)

In [None]:
iris_train_scores_sgd = cross_val_predict(sgd_clf, iris_prepared, iris_labels, cv=3, method="decision_function")
iris_train_scores_rf = cross_val_predict(rf_clf, iris_prepared, iris_labels, cv=3, method="predict_proba")
y_scores_forest = iris_train_scores_rf[:, 1]

iris_train_scores_sgd[:5]

In [None]:
from sklearn.metrics import roc_curve

fpr_sgd, tpr_sgd, thresholds_sgd = roc_curve(iris_labels, iris_train_scores_sgd)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(iris_labels, y_scores_forest)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plt.plot(fpr_sgd, tpr_sgd, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_rf, tpr_rf, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(iris_labels, iris_train_scores_sgd)

In [None]:
roc_auc_score(iris_labels, y_scores_forest)