# Agenda

*    What is the <b>K-nearest neighbors</b> classification model?
*    What are the four steps for model training and prediction in scikit-learn?
*    How can I apply this pattern to other machine learning models?



In [1]:
from IPython.display import IFrame

# Embed the raw iris data file so its contents can be inspected inside the notebook.
# The URL uses HTTPS: a page served over HTTPS blocks a plain-HTTP iframe as mixed content.
IFrame('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', width=300, height=200)

In [2]:
# load_iris comes from scikit-learn's datasets module
from sklearn.datasets import load_iris

# the returned "bunch" object holds the iris dataset together with its attributes
iris = load_iris()

# feature matrix goes in "X", response vector goes in "y"
X, y = iris.data, iris.target


In [3]:
# show the shape of X, then the shape of y, one per line
print(X.shape, y.shape, sep='\n')

(150, 4)
(150,)



# scikit-learn 4-step modeling pattern

### Step 1: Import the class you plan to use


In [5]:
from sklearn.neighbors import KNeighborsClassifier

### Step 2: "Instantiate" the "estimator"

*    "Estimator" is scikit-learn's term for model
*    "Instantiate" means "make an instance of"



In [8]:
# create the estimator with n_neighbors=1 (the "K" in K-nearest neighbors)
knn = KNeighborsClassifier(n_neighbors=1)

*    Name of the object does not matter
*    Can specify tuning parameters (aka "hyperparameters") during this step
*    All parameters not specified are set to their defaults



In [9]:
# printing the estimator lists its parameters, including the ones left at their defaults
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


### Step 3: Fit the model with data (aka "model training")

*    Model is learning the relationship between X and y
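*    Occurs in-place: `fit` updates the `knn` object itself, so the result does not need to be assigned to another variable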



In [11]:
# train on the iris feature matrix (X) and response vector (y)
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

### Step 4: Predict the response for a new observation

*    New observations are called "out-of-sample" data
*    Uses the information it learned during the model training process



In [13]:
# pass the single observation as a list containing one row of four feature values
knn.predict([[3, 5, 4, 2]])

array([2])

*    Returns a NumPy array
*    Can predict for multiple observations at once



In [14]:
# two new observations, one row each
X_new = [
    [3, 5, 4, 2],
    [5, 4, 3, 2],
]
knn.predict(X_new)

array([2, 1])

# Using a different value for K

In [15]:
# instantiate the model (using the value K=5); rebinding "knn" replaces the K=1 model used so far
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with the same data (X, y) as before
knn.fit(X, y)

# predict the response for the same new observations (X_new), now with K=5
knn.predict(X_new)



array([1, 1])

# Using a different classification model

In [17]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
# NOTE(review): the repr recorded in this cell's output lists solver='liblinear' and
# multi_class='ovr'; defaults may differ between scikit-learn versions, so a re-run
# could print a different repr and give different predictions — confirm
logreg = LogisticRegression()

# show the estimator's parameters before fitting
print(logreg)

# fit the model with data (the same X and y used for the KNN models)
logreg.fit(X, y)

# predict the response for new observations (the same X_new used with KNN)
logreg.predict(X_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


array([2, 0])