In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris, load_breast_cancer

In [None]:
# Load the breast-cancer dataset shipped with scikit-learn.
dataset = load_breast_cancer()

In [None]:
# Show the feature names first, then the class (target) names.
for names in (dataset.feature_names, dataset.target_names):
    print(names)

In [None]:
# X is the dataset's data array, y its target array.
X, y = dataset.data, dataset.target

In [None]:
# Shape of the data array.
X.shape

In [None]:
# Display the target array.
y

In [None]:
# Column 0 of X against column 1, with each point colored by its target value.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)

In [None]:
from sklearn.utils import check_random_state

# Seeded generator, so the same row indices are drawn on every run.
rs = check_random_state(0)

# Draw 10 row indices in [0, number of rows); draws are independent,
# so an index may appear more than once.
n_rows = X.shape[0]
missing = rs.randint(low=0, high=n_rows, size=10)
missing

In [None]:
# Show the column-0 values of the rows indexed by `missing`.
X[missing, 0]

In [None]:
# Simulate missing data: overwrite column 0 of the selected rows with NaN.
# This modifies X in place.
X[missing, 0]=np.nan

In [None]:
# Display X; the rows indexed by `missing` now hold NaN in column 0.
X

In [None]:
from collections import Counter

# Number of samples per target value.
class_counts = Counter(y)
class_counts

# K Nearest Neighbors Classifier
It has one of the simplest learning strategies: given a new, unknown observation, look up the observations in your reference database whose features are closest to it and assign their predominant class.

<img src="figures/knn.png">

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN
# Classifier configured with n_neighbors=3.
knn = KNN(n_neighbors=3)
# Display the estimator.
knn

In [None]:
# X still contains the NaN values written into column 0 earlier, and the
# classifier rejects non-finite input, so this fit is expected to fail.
# The error is caught and printed (it is the point of this cell) so that the
# notebook can still be run top to bottom.
try:
    knn.fit(X, y)
except ValueError as err:
    print("knn.fit failed on data with missing values: {}".format(err))

In [None]:
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its replacement
# is `sklearn.impute.SimpleImputer`. Fall back to the old class only on
# installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Fill each missing entry with the mean of its column.
imputer = SimpleImputer(strategy='mean')

In [None]:
# Fit the imputer on X, then show the transformed array (X itself is not reassigned here).
imputer.fit(X).transform(X)

In [None]:
# Replace X with its imputed version (fit and transform in a single call).
X = imputer.fit_transform(X)

In [None]:
# Fit the classifier on the imputed X.
knn.fit(X,y)

In [None]:
# Predictions for the same samples the classifier was fitted on.
knn.predict(X)

In [None]:
# Per-class probability estimates for the same samples.
knn.predict_proba(X)

In [None]:
# Keep the predictions on X so they can be compared with y.
y_hat = knn.predict(X)

In [None]:
# True only when every prediction matches its target.
(y_hat == y).all()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions y_hat against the targets y.
accuracy_score(y_true=y, y_pred=y_hat)

In [None]:
# The estimator's own score on (X, y), i.e. on the data it was fitted on.
knn.score(X,y)

##  The holdout method  

<img src="figures/train_test_split.svg">


<img src="figures/train_test_split_matrix.svg">

In [None]:
# `tts` is scikit-learn's train_test_split. It is imported here because no
# earlier cell brings the name into scope.
from sklearn.model_selection import train_test_split as tts

# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = tts(X, y,
                                       test_size=0.25,
                                       random_state=5)

In [None]:
# Fit on the training part only, then score on the held-out part.
knn.fit(X_train, y_train)
score = knn.score(X_test, y_test)
score

## K-Folds Cross-Validation
<img src="figures/cross_validation.svg">


In [None]:
# `cross_validate` is imported here because no earlier cell brings it into scope.
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the KNN classifier, keeping the training scores too.
result = cross_validate(knn, X, y, return_train_score=True, cv=5)
result

In [None]:
# Average the per-fold train and test scores and print both.
train_score, test_score = (
    result[key].mean() for key in ('train_score', 'test_score')
)
message = "Train score:{} \nTest score :{}".format(train_score, test_score)
print(message)

## Why use n_neighbors=3?

In [None]:
# Try n_neighbors = 1..29 and record the mean 5-fold test score for each value.
# Note: `knn` keeps the last value tried (29) once the loop finishes.
n_neighbors = list(range(1, 30))
scores = []
for k in n_neighbors:
    knn.set_params(n_neighbors=k)
    result = cross_validate(knn, X, y, return_train_score=True, cv=5)
    score = result['test_score'].mean()
    scores.append(score)
    print("n_neighbors={} score={}".format(k, score))

# One point per candidate value of n_neighbors.
plt.scatter(n_neighbors, scores)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')

## GridSearch
<img src="figures/grid_search_cross_validation.svg">

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidates: every n_neighbors from 1 to 29 combined with the 'l1' and 'l2' metrics.
param_grid = {
    'n_neighbors': range(1, 30),
    'metric': ['l1', 'l2'],
}
# Each candidate is evaluated with 3-fold cross-validation.
knn_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=3)

In [None]:
# Run the grid search on X and y.
knn_search.fit(X, y)

In [None]:
# Parameter combination selected by the search.
knn_search.best_params_

In [None]:
# Cross-validate the grid search itself: the whole search is passed as the
# estimator, so it is re-fitted inside each of the 5 outer folds.
result = cross_validate(
    knn_search, X, y,
    cv=5,
    return_train_score=True,
    verbose=3,
)
result

In [None]:
# Mean train/test score across the folds of the cross-validated grid search.
fold_means = {key: result[key].mean() for key in ('train_score', 'test_score')}
train_score = fold_means['train_score']
test_score = fold_means['test_score']
message = "Train score:{} \nTest score :{}".format(train_score, test_score)
print(message)

## Decision Trees
Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
<img src="figures/tree.svg">

In [None]:
from sklearn.tree import DecisionTreeClassifier as Tree
# Decision tree with a fixed random_state so repeated runs build the same tree.
tree = Tree(random_state=0)

In [None]:
# Fit on all of X and score on the very same data.
tree.fit(X, y).score(X, y)

In [None]:
# Fit on the training split, then score on the held-out split.
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

In [None]:
# 5-fold cross-validation of the decision tree, keeping the training scores too.
result = cross_validate(
    tree, X, y,
    cv=5,
    return_train_score=True,
)
result

In [None]:
# Mean train/test score across the folds for the decision tree.
train_score, test_score = [
    result[key].mean() for key in ('train_score', 'test_score')
]
message = "Train score:{} \nTest score :{}".format(train_score, test_score)
print(message)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF
# Random forest of 100 estimators with a fixed random_state for repeatable results.
rf = RF(n_estimators=100, random_state=0)

In [None]:
# 5-fold cross-validation of the random forest, keeping the training scores too.
result = cross_validate(
    rf, X, y,
    cv=5,
    return_train_score=True,
)
result

In [None]:
# Mean train/test score across the folds for the random forest.
train_score = np.mean(result['train_score'])
test_score = np.mean(result['test_score'])
message = "Train score:{} \nTest score :{}".format(train_score, test_score)
print(message)

## Clustering

In [None]:
from sklearn.datasets import make_blobs

# 500 points drawn around 7 centers. The fixed random_state makes the same
# blobs appear on every run; without it the data changes each time.
# Note: this rebinds X and y, which held the classification data until now.
X, y = make_blobs(n_samples=500, centers=7, random_state=0)

# Color each point by the center it was generated from.
plt.scatter(X[:, 0], X[:, 1], c=y)

In [None]:
from sklearn.cluster import KMeans

# k-means with 7 clusters and at most 300 iterations per run. The fixed
# random_state makes the centroid initialisation, and so the resulting
# labels, repeatable across runs.
kmeans = KMeans(n_clusters=7, max_iter=300, random_state=0)
kmeans

In [None]:
# Fit k-means on X; no targets are passed.
kmeans.fit(X)

In [None]:
# Cluster index assigned to each sample by the fitted k-means model.
labels = kmeans.labels_

# Same scatter as before, now colored by assigned cluster.
plt.scatter(x=X[:, 0], y=X[:, 1], c=labels)