# Typical steps when using supervised learning

## 1. Loading dataset

In [1]:
from sklearn.datasets import load_iris, load_diabetes
from sklearn.svm import SVR, SVC
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import random


# Two toy datasets bundled with scikit-learn: one per supervised task.
diabetes = load_diabetes()  # regression target
iris = load_iris()          # classification target

# Show the description text that ships with the diabetes dataset.
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

In [2]:
# Show the description text that ships with the iris dataset.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [3]:
# Tunable split settings. A fixed seed makes "Restart Kernel -> Run All"
# reproduce the same shuffle, and therefore the same train/test split and
# the same downstream metrics.
SPLIT_SEED = 42
TEST_SIZE = 30  # number of samples held out for testing (must be > 0)

# Feature matrices and target vectors, taken straight from the dataset bunches.
feature_iris = iris['data']
label_iris = iris['target']

feature_diabetes = diabetes['data']
label_diabetes = diabetes['target']

# Draw one random permutation of the row indices per dataset. Features and
# labels are indexed with the same permutation so rows stay aligned.
split_rng = np.random.default_rng(SPLIT_SEED)
idx_iris_random = split_rng.permutation(len(label_iris))
idx_diabetes_random = split_rng.permutation(len(label_diabetes))

feature_iris = feature_iris[idx_iris_random]
label_iris = label_iris[idx_iris_random]

feature_diabetes = feature_diabetes[idx_diabetes_random]
label_diabetes = label_diabetes[idx_diabetes_random]

# Split each shuffled dataset into train/test features and train/test labels:
# the last TEST_SIZE rows are held out, everything before them is for training.
train_feature_iris = feature_iris[:-TEST_SIZE]
train_label_iris = label_iris[:-TEST_SIZE]
test_feature_iris = feature_iris[-TEST_SIZE:]
test_label_iris = label_iris[-TEST_SIZE:]

train_feature_diabetes = feature_diabetes[:-TEST_SIZE]
train_label_diabetes = label_diabetes[:-TEST_SIZE]
test_feature_diabetes = feature_diabetes[-TEST_SIZE:]
test_label_diabetes = label_diabetes[-TEST_SIZE:]

# Sanity checks: no sample is lost or duplicated by the split.
assert len(train_label_iris) + len(test_label_iris) == len(label_iris)
assert len(train_label_diabetes) + len(test_label_diabetes) == len(label_diabetes)

print(len(train_label_iris), len(test_label_iris))
print(len(train_label_diabetes), len(test_label_diabetes))

120 30
412 30


## 2. Rescaling the dataset

In [4]:
# Standardise with statistics computed on the TRAINING split only, then apply
# the same shift/scale to the test split so no test information leaks into
# the preprocessing.

# --- iris features: per-column mean / standard deviation (axis=0) ---
iris_feature_mean = np.mean(train_feature_iris, axis=0)
iris_feature_sd = np.std(train_feature_iris, axis=0)

# A constant column has sd == 0 and would turn into NaN/inf on division;
# divide by 1 instead so such a column simply becomes all zeros.
iris_feature_scale = np.where(iris_feature_sd == 0, 1.0, iris_feature_sd)

train_feature_iris_res = (train_feature_iris - iris_feature_mean) / iris_feature_scale
test_feature_iris_res = (test_feature_iris - iris_feature_mean) / iris_feature_scale

# --- diabetes features ---
# Only the training-split statistics are recorded here; the feature matrix
# itself is not rescaled, and the diabetes model in this notebook is fit on
# `train_feature_diabetes` as loaded.
# NOTE(review): the load_diabetes documentation describes the features as
# already mean-centred and scaled, which would explain this -- confirm.
diabetes_feature_mean = np.mean(train_feature_diabetes, axis=0)
diabetes_feature_sd = np.std(train_feature_diabetes, axis=0)

# --- diabetes target ---
# The regression target is standardised as well; the mean and sd are kept so
# predictions can be mapped back to the original scale later.
diabetes_label_mean = np.mean(train_label_diabetes)
diabetes_label_sd = np.std(train_label_diabetes)

train_label_diabetes_res = (train_label_diabetes - diabetes_label_mean) / diabetes_label_sd
test_label_diabetes_res = (test_label_diabetes - diabetes_label_mean) / diabetes_label_sd

## 2.5. Encoding categorical data (if any)

## 3. Define the model

In [5]:
# Support-vector regressor for the diabetes (regression) task.
diabetes_model = SVR(epsilon=0.2, C=1.0)

# Support-vector classifier for the iris (classification) task.
iris_model = SVC(gamma='auto')

## 4. Fitting/Training the model

In [6]:
# Classifier: standardised iris features against the class labels.
iris_model.fit(X=train_feature_iris_res, y=train_label_iris)

# Regressor: diabetes features as loaded against the standardised target.
# Kept as the last expression so the fitted estimator is the cell's output.
diabetes_model.fit(X=train_feature_diabetes, y=train_label_diabetes_res)

SVR(epsilon=0.2)

## 5. Predict using fitted model

In [7]:
# Class predictions for the held-out iris samples (standardised features).
iris_prediction = iris_model.predict(test_feature_iris_res)

# The regressor was trained on a standardised target, so its raw output is in
# standardised units; undo the scaling, then round to whole numbers.
diabetes_prediction_sr = diabetes_model.predict(test_feature_diabetes)
diabetes_prediction = np.around(
    diabetes_label_mean + diabetes_label_sd * diabetes_prediction_sr
)

# Show predictions directly above the true labels for a quick visual check.
print('iris prediction and test label', iris_prediction, test_label_iris, sep='\n')
print()
print('diabetes prediction and test label', diabetes_prediction, test_label_diabetes, sep='\n')


iris prediction and test label
[2 0 1 1 2 0 0 0 2 1 1 0 2 2 2 0 2 0 1 0 0 1 0 0 0 2 1 0 1 1]
[2 0 1 1 2 0 0 0 1 1 1 0 2 2 2 0 2 0 1 0 0 2 0 0 0 2 1 0 1 1]

diabetes prediction and test label
[170. 192. 136. 136.  99.  64. 171. 136. 110. 172. 101. 154. 173. 119.
 199. 100. 135. 114.  97. 202. 144. 112. 167.  78. 102.  63. 105. 274.
  71. 148.]
[122.  90. 185. 142. 200.  51. 110. 146.  96. 244.  37. 170. 200.  88.
 273. 182.  97.  72.  99. 198. 155. 144.  91.  75.  50.  83.  64. 264.
  78. 163.]


## 6. Calculate the model performance

In [8]:
# Classification quality: fraction of held-out iris samples predicted correctly.
iris_accuracy = accuracy_score(test_label_iris, iris_prediction)
print('iris accuracy', iris_accuracy)

# Regression quality: root-mean-squared error on the original target scale.
# The square root is taken explicitly because the `squared=False` keyword of
# mean_squared_error was deprecated and later removed from scikit-learn;
# this form works on old and new releases alike. Arguments follow the
# documented (y_true, y_pred) order.
diabetes_rmse = float(np.sqrt(mean_squared_error(test_label_diabetes, diabetes_prediction)))
print('diabetes rmse', diabetes_rmse)

iris accuracy 0.9333333333333333
diabetes rmse 47.70569497799328
