# Demo - SciKit Learn
* popular Python library providing efficient implementations of a large number of machine learning algorithms
* purposely designed to be clean and uniform across tools
* consistent data representation and common interface

# SciKit Learn Data Representation
<img style="height: 400px;" src="https://github.com/davewadestein/sf-ml-five-day/blob/main/src/images/scikit-learn-data.png?raw=1">

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

# Seed NumPy's global random number generator.
np.random.seed(5)

# Load the iris dataset bundled with scikit-learn and assemble one DataFrame:
# the feature columns first, followed by a trailing 'target' column.
iris = datasets.load_iris()
data = pd.DataFrame(
    np.column_stack((iris.data, iris.target)),
    columns=[*iris.feature_names, 'target'],
)
data.head()

In [None]:
# Feature matrix: every column of `data` except the 'target' label column.
X_iris = data.drop(columns='target')
X_iris.shape

In [None]:
# Target vector: the class labels, taken straight from the loaded dataset.
y_iris = iris.target
y_iris.shape

# scikit-learn Objects
* "All objects within scikit-learn share a uniform common basic API consisting of three complementary interfaces: an estimator interface for building and fitting models, a predictor interface for making predictions and a transformer interface for converting data."
<br>__*API design for machine learning software:
experiences from the scikit-learn project*__ (https://arxiv.org/pdf/1309.0238.pdf)

# Estimator API
* Driven by a set of principles documented in the above paper:
  * Consistency
  * Allow Inspection
  * Limited object hierarchies
  * Composition
  * Sensible defaults

# General Workflow
* Choose an algorithm
* Choose hyperparameters
* Arrange data into a features matrix and target vector
* Fit the model to the data with the __`fit()`__ method
* Apply the model to test data (__`predict()`__ or __`transform()`__)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Let's split our dataset into training and test sets, in order to
# see how well our model actually does on data it was not fitted on.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris)

# A decision tree limited to a depth of 2, fitted on the training split only.
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)

In [None]:
# Number of rows held out in the test split.
len(X_test)

In [None]:
# Let's check the test data, one by one, and see how
# many we got right.
#
# Predict once on the whole test DataFrame instead of once per bare NumPy row:
# the classifier was fitted on a DataFrame, so passing it plain arrays makes
# scikit-learn versions that validate feature names warn on every call.
predictions = tree_clf.predict(X_test)
correct = 0

# Walk features, prediction and true label in lockstep; a trailing '*'
# marks each misclassified row.
for row, pred, actual in zip(X_test.to_numpy(), predictions, y_test):
    print('prediction for', row, 'is', pred, 'actual is', actual, end='')
    if pred == actual:
        correct += 1
    else:
        print('*', end='')
    print()

# Fraction of test rows predicted correctly.
print(correct / X_test.shape[0])

In [None]:
# Let the classifier score itself on the held-out test split.
tree_clf.score(X=X_test, y=y_test)

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Create 100 random data points in 3 isotropic blobs (same spread in every
# direction), then plot them colored by the blob each point was drawn from.
X, y = make_blobs(random_state=9)
plt.scatter(*X.T, c=y);

In [None]:
# Now let's do a K-means clustering...
# Since we made 3 blobs, the algorithm should do well with 3 clusters.
kmeans = KMeans(n_clusters=3).fit(X)

# Color every point by the cluster K-means assigned it to.
plt.scatter(*X.T, c=kmeans.labels_)

# Mark the center of each cluster, as determined by the K-means algorithm.
centers = kmeans.cluster_centers_
plt.scatter(*centers.T, c='blue', s=300, marker='x');

In [None]:
# Let's try it with 5 clusters, i.e. more clusters than blobs.
kmeans = KMeans(n_clusters=5).fit(X)

# Points colored by assigned cluster, with the fitted centers drawn on top.
plt.scatter(*X.T, c=kmeans.labels_)
centers = kmeans.cluster_centers_
plt.scatter(*centers.T, c='black', s=200, marker='x');