In [1]:
import pandas as pd

## Scikit learn's requirements

Features and Response should be:

1. Separate
1. Numeric
1. Numpy Arrays
1. Of a specific shape: features as a 2-D array `(n_samples, n_features)`, response as a 1-D array `(n_samples,)`

In [2]:
# Load the iris measurements; `species` is parsed straight into a pandas
# categorical instead of plain strings.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# saved index -- consider `index_col=0` if it is not needed (confirm first).
iris_df = pd.read_csv(
    '../data/iris.csv',
    dtype={'species': 'category'},
)
iris_df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


## Conventions

1. Features should be stored in **X**
1. Target in **y**
1. **X** should be capitalized since it stores a matrix

In [3]:
# The four measurement columns form the feature matrix; the species label is
# the target vector.
feature_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

X = iris_df.loc[:, feature_cols]
y = iris_df['species']

## Scikit Learn 4-step modelling pattern
1. Import the Class we want to use
1. Instantiate the Estimator
1. Train the Model
1. Predict the Response

In [4]:
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# Step 2 of the pattern: instantiate the estimator. Only `n_neighbors` is
# set explicitly; everything else keeps its default value.
knn = KNeighborsClassifier(n_neighbors=1)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [6]:
# Step 3 of the pattern: train on the full data set. `fit` hands back the
# estimator itself, which is what the cell displays.
knn.fit(X=X, y=y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [7]:
# Step 4 of the pattern: predict the response for two unseen observations.
# The values are listed in the same column order the model was trained on.
# They are wrapped in a DataFrame keyed by `feature_cols` so the feature names
# stay attached: recent scikit-learn releases warn when an estimator fitted on
# named columns is asked to predict on unnamed input such as a nested list.
X_test = pd.DataFrame([[3, 5, 4, 2], [5, 4, 3, 2]], columns=feature_cols)

knn.predict(X_test)

array(['virginica', 'versicolor'], dtype=object)

## Model Tuning
**Hyperparameters** like **n_neighbors** are tuned in the hope of better generalization.

In [8]:
# Same estimator class, now voting over the 5 nearest neighbours. `fit`
# returns the fitted estimator, so instantiation and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
knn.predict(X_test)

array(['versicolor', 'versicolor'], dtype=object)

## Using a different Classification Model

In [9]:
from sklearn.linear_model import LogisticRegression
# The same instantiate / fit / predict pattern applies to a different
# classifier; only the estimator class changes.
logreg = LogisticRegression().fit(X, y)
logreg.predict(X_test)

array(['virginica', 'setosa'], dtype=object)