## Building Predictive Models (part 1)
Supervised learning with classification and regression

### Functions p.4

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
# from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from Data.Perceptron import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [None]:
# Build the integers 0..8 as a 1-D array; the np.where cells below use it.
x = np.arange(9)
x

In [None]:
# With only a condition, np.where returns a tuple of index arrays (one per
# dimension) giving the positions where the condition is True.
result = np.where(x<2)
print(result)

In [None]:
# With three arguments, np.where selects element-wise: 'YES' where the
# condition holds and 'no' everywhere else.
result = np.where(x == 0, 'YES', 'no')
print(result)

### Train-test split with sklearn p.23

In [None]:
# Load the iris dataset bundled with scikit-learn and keep the feature
# matrix as X and the label vector as y.
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
# Fraction of the samples held out for testing.
my_test_size = 0.3
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=my_test_size)

In [None]:
# Share of the samples that ended up in each split. The label says "[%]",
# so scale the ratio to a percentage (70.0 rather than 0.7).
print('size of train set [%]:', 100 * len(X_train) / len(X))
print('size of test set [%]:', 100 * len(X_test) / len(X))

In [None]:
def split_dataset(X, y, my_test_size, random_state=None):
    """Split a dataset into train and test parts and plot the first feature.

    A histogram of column 0 is drawn for the full, the train and the test
    set so the split can be checked visually.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; it is indexed as ``X[:, 0]`` for the plot.
    y : array
        Labels, split alongside ``X``.
    my_test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an int
        for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=my_test_size, random_state=random_state)

    # Passing the three samples in one call makes them share the same bin
    # edges and draws the bars side by side. Separate plt.hist calls pick
    # their own bins and paint over each other, which makes the
    # distributions impossible to compare.
    plt.hist([X[:, 0], X_train[:, 0], X_test[:, 0]],
             label=['Full set', 'Train set', 'Test set'])
    plt.legend()
    plt.show()

    return X_train, X_test, y_train, y_test


In [None]:
# Run the helper on the iris data; it shows the histogram and returns the
# (X_train, X_test, y_train, y_test) tuple, which is not stored here.
split_dataset(X,y,my_test_size)

### Classification with plotting p.29

In [None]:
# header=None: the file has no header row, so the columns get integer labels.
df_iris = pd.read_csv('Data/iris.data', header = None)
df_iris.head()

In [None]:
# Select setosa and versicolor: the first 100 rows, labels in column 4
y = df_iris.iloc[0:100, 4].values
# Encode the labels numerically: -1 for 'Iris-setosa', 1 for every other row
y = np.where(y == 'Iris-setosa', -1, 1)

# Extract sepal length and petal length (columns 0 and 2)
X = df_iris.iloc[0:100, [0,2]].values

In [None]:
# Rows 0-49 are drawn as setosa (red) and rows 50-99 as versicolor (blue).
for rows, colour, species in ((slice(None, 50), 'red', 'setosa'),
                              (slice(50, 100), 'blue', 'versicolor')):
    plt.scatter(X[rows, 0], X[rows, 1],
                color=colour, marker='o', label=species)

plt.title('Iris dataset')
plt.xlabel('Sepal length[cm]')
plt.ylabel('Petal length[cm]')
plt.legend(loc='upper left')

plt.show()

### Perceptron with sklearn p. 50
activation function, objective function and learning rate

In [None]:
# class Perceptron(object):
#     def __init__(self, eta, n_iter):
#         # Initialization 

#     def fit(self, X, y):
#         # fit training data
    
#     def net_input(self, X):
#         # calculate net input
    
#     def predict(self, X):
#         # return class label

In [None]:
# Reload the full iris data (X and y were reassigned to the two-class
# subset earlier) and split it 70/30 with the plotting helper.
iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = split_dataset(X,y,0.3)

In [None]:
# Standardise the features. The scaler is fitted on the training set only
# and the same fitted scaler is then applied to the test set.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Train the Perceptron imported from Data.Perceptron (the scikit-learn
# import is commented out) on the standardised training set, then predict
# the test set.
# NOTE(review): the commented skeleton in this notebook declares
# __init__(self, eta, n_iter) but only eta is passed here -- confirm that
# n_iter has a default value.
# NOTE(review): y_train comes from iris.target and holds every iris class;
# if this Perceptron is a two-class model the labels need reducing -- confirm.
ppn = Perceptron(eta=0.1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)


In [None]:
# Number of test samples whose prediction differs from the true label.
n_misclassified = (y_test != y_pred).sum()
print('Misclassified examples: %d' % n_misclassified)

# Fraction of test samples predicted correctly.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % test_accuracy)


### Logistic regression p.60

In [None]:
# Start again from the full iris data.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing.
my_test_size = 0.3

# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_test_size)

In [None]:
# Standardise the features. The scaler is fitted on the training set only
# and the same fitted scaler is then applied to the test set.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Fit a logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train_std, y_train)
# Per-class probabilities for every test sample, scaled to percent.
y_test_pred = lr.predict_proba(X_test_std)*100
# astype(int) drops the fractional part, so a row can sum to less than 100.
y_test_pred = y_test_pred.astype(int)

In [None]:
# Show the last five test samples: the true labels as a 5x1 column, then
# the integer percentages computed for the same samples.
print('True label:\n', y_test[-5:].reshape(5,1))
print('Predicted probabilities:\n', y_test_pred[-5:])

### Linearly inseparable data p. 67

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over two features.

    The classifier is evaluated on a regular grid spanning the data range
    (padded by 1 on every side); the predicted class is drawn as a filled
    contour and the samples are scattered on top, one marker/colour per
    class. Nothing is returned and ``plt.show()`` is left to the caller.

    Parameters
    ----------
    X : 2-D array
        Samples; column 0 is drawn on the x axis and column 1 on the y axis.
    y : 1-D array
        Class label of every row of ``X``. At most five distinct labels are
        supported because only five markers/colours are defined.
    classifier : object
        Fitted model exposing ``predict`` for an ``(n, 2)`` array.
    test_idx : sequence of int or None, optional
        Row indices of ``X`` to highlight as test samples. ``None`` or an
        empty sequence draws no highlight.
    resolution : float, optional
        Grid step used for the decision surface.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid_points).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        # `color=` (not `c=`) so the single RGBA tuple is never mistaken for
        # a sequence of values to be colour-mapped.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples
    # `is not None` + len(): a plain truth test raises for NumPy index arrays.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[np.asarray(test_idx), :]

        # Hollow black circles; an empty colour string (c='') is rejected by
        # current matplotlib releases.
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')

In [None]:
# Load the sample data for the "linearly inseparable" section and preview
# the first rows.
nonlin_df = pd.read_csv('Data/nonlinear_data.csv')
nonlin_df.head()

In [None]:
# Features are the 'x0' and 'x1' columns and labels the 'y' column, each
# taken as a NumPy array.
X = nonlin_df.loc[:,['x0', 'x1']].values
y = nonlin_df.loc[:,'y'].values

#### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the whole data set.
svm = SVC(kernel='rbf', gamma=0.10, C=10.0)
svm.fit(X, y)

plot_decision_regions(X, y, classifier=svm)
plt.legend()
plt.xlabel('X0')
plt.ylabel('X1')
plt.title('SVM Classifier')
# Render the figure explicitly, as the later plotting cells do.
plt.show()

#### K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p=2 is the Euclidean
# distance.
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2,
                           metric='minkowski')

knn.fit(X, y)

plot_decision_regions(X, y, classifier=knn)
plt.legend()
plt.xlabel('X0')
plt.ylabel('X1')
plt.title('KNN Classifier')
# Render the figure explicitly, as the later plotting cells do.
plt.show()

#### Test with Circle Data

In [None]:
# Load the circle data set and preview the first rows.
circle_df = pd.read_csv('Data/circle_data.csv')
circle_df.head()

In [None]:
# Features are the 'x0' and 'x1' columns and labels the 'y' column, each
# taken as a NumPy array.
X = circle_df.loc[:,['x0', 'x1']].values
y = circle_df.loc[:,'y'].values

In [None]:
# Scatter the two classes of the circle data: label 1 in blue, label -1 in red.
plt.scatter(X[y==1, 0],
            X[y==1, 1],
            c='b', marker='x',
            label = 'class label 1')
plt.scatter(X[y==-1, 0],
            X[y==-1, 1],
            c='r', marker='x',
            label = 'class label -1')

plt.legend()
plt.xlabel('X0')
plt.ylabel('X1')
# The data plotted here is loaded from circle_data.csv, so the title names
# the circle data set (it previously read 'Moon dataset').
plt.title('Circle dataset')
plt.show()

##### SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier fitted on the circle data;
# fit() returns the estimator itself, so construction and fitting chain.
svm = SVC(C=10.0, gamma=0.40, kernel='rbf').fit(X, y)

# Decision regions with the samples drawn on top.
plot_decision_regions(X, y, classifier=svm)
plt.title('SVM Classifier')
plt.xlabel('X0')
plt.ylabel('X1')
plt.legend()
plt.show()



##### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# The Minkowski metric generalises the Euclidean (p=2) and Manhattan (p=1)
# distances; p=2 selects the Euclidean distance here.
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2,
                           metric='minkowski')
# Fit and plot the KNN model itself. The cell previously refitted the SVM
# and drew its regions under the KNN title, leaving knn unused.
knn.fit(X, y)

plot_decision_regions(X, y, classifier=knn)
plt.legend()
plt.xlabel('X0')
plt.ylabel('X1')
plt.title('KNN classifier')
plt.show()

### Home Exercise 