# Supervised Learning

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
sklearn.set_config(print_changed_only=True)

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
blood = fetch_openml('blood-transfusion-service-center')

X_train, X_test, y_train, y_test = train_test_split(
    blood.data, blood.target, random_state=0)

In [3]:
X_train.shape

(561, 4)

In [4]:
import pandas as pd
pd.Series(y_train).value_counts()

1    438
2    123
dtype: int64

In [5]:
pd.Series(y_train).value_counts(normalize=True)

1    0.780749
2    0.219251
dtype: float64

Really Simple API
-------------------
0) Import your model class

In [6]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters

In [7]:
svm = LinearSVC()

2) Fit the model

In [8]:
svm.fit(X_train, y_train)



LinearSVC()

3) Apply / evaluate

In [None]:
print(svm.predict(X_train))
print(y_train)

In [None]:
svm.score(X_train, y_train)

In [None]:
svm.score(X_test, y_test)

And again
---------

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

# Exercises

## Exercise 1
Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.

Split it into training and test set using ``train_test_split``.

## Exercise 2
Then train an evaluate ``sklearn.neighbors.KNeighborsClassifier``, the RandomForestClassifier and  ``sklearn.linear_model.LogisticRegression`` on the iris dataset.
How do these perform on the training set vs the test set? Which one is the best on the training set, which one is the best on the test set?


**Exercise 1**

In [1]:
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [2]:
import pandas as pd
x = pd.DataFrame(iris.data,columns=['sepal_length','speal_width','petal_length','petal_width'])

In [3]:
x

Unnamed: 0,sepal_length,speal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [4]:
y = pd.Series(iris.target)
y.value_counts()

2    50
1    50
0    50
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,random_state=0)

In [6]:
X_train.shape

(112, 4)

In [7]:
X_test.shape

(38, 4)

**Exercise 2**

In [8]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [9]:
neigh.score(X_test,y_test)

0.9736842105263158

**Random Forest**

In [10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [11]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
rf.score(X_train, y_train)

1.0

In [13]:
rf.score(X_test, y_test)

0.9736842105263158

**Logistic Regression**

In [14]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [15]:
lg.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lg.score(X_test,y_test)

0.9736842105263158