##  Python Data Science

> Introduction to Machine Learning

Kuo, Yao-Jen <yaojenkuo@datainpoint.com> from [DATAINPOINT](https://www.datainpoint.com)

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

## Given `titanic-train.csv` in working directory, extract `Fare` as the feature matrix and apply `StandardScaler` on it.

- Expected inputs: a CSV file `titanic-train.csv`.
- Expected outputs: a (891, 1) ndarray.

In [2]:
def get_standard_scaled_fare(csv_file):
    """
    >>> standard_scaled_fare = get_standard_scaled_fare('titanic-train.csv')
    >>> print(type(standard_scaled_fare))
    <class 'numpy.ndarray'>
    >>> print(standard_scaled_fare.shape)
    (891, 1)
    >>> print(standard_scaled_fare[:5])
    [[-0.50244517]
     [ 0.78684529]
     [-0.48885426]
     [ 0.42073024]
     [-0.48633742]]
    >>> print(standard_scaled_fare.std())
    1.0
    >>> print(standard_scaled_fare.max())
    9.667166525013505
    >>> print(standard_scaled_fare.min())
    -0.6484216535389205
    """
    ### BEGIN SOLUTION
    ### END SOLUTION

## Following the previous question, what are the mean and standard deviation of `Fare` that the `StandardScaler` adopts?

- Expected inputs: a CSV file `titanic-train.csv`.
- Expected outputs: a dict with length of 2.

In [3]:
def get_fare_mean_std(csv_file):
    """
    >>> fare_mean_std = get_fare_mean_std('titanic-train.csv')
    >>> print(type(fare_mean_std))
    <class 'dict'>
    >>> print(fare_mean_std['mean'])
    32.20420797
    >>> print(fare_mean_std['std'])
    49.66553444
    """
    ### BEGIN SOLUTION
    ### END SOLUTION

## Given `house-prices-train.csv` in working directory, extract `GrLivArea` as the feature matrix and apply `MinMaxScaler` on it. 

- Expected inputs: a CSV file `house-prices-train.csv`.
- Expected outputs: a (1460, 1) ndarray.

In [4]:
def get_min_max_scaled_gr_liv_area(csv_file):
    """
    >>> min_max_scaled_gr_liv_area = get_min_max_scaled_gr_liv_area('house-prices-train.csv')
    >>> print(type(min_max_scaled_gr_liv_area))
    <class 'numpy.ndarray'>
    >>> print(min_max_scaled_gr_liv_area.shape)
    (1460, 1)
    >>> print(min_max_scaled_gr_liv_area)
    [[0.25923135]
     [0.17483044]
     [0.27354936]
     ...
     [0.37792012]
     [0.14016579]
     [0.17370008]]
    >>> print(min_max_scaled_gr_liv_area.max())
    1.0
    >>> print(min_max_scaled_gr_liv_area.min())
    0.0
    """
    ### BEGIN SOLUTION
    ### END SOLUTION

## Following the previous question, what are the min and max values of `GrLivArea` that the `MinMaxScaler` adopts?

- Expected inputs: a CSV file `house-prices-train.csv`.
- Expected outputs: a dict with length of 2.

In [5]:
def get_gr_liv_area_min_max(csv_file):
    """
    >>> gr_liv_area_min_max = get_gr_liv_area_min_max('house-prices-train.csv')
    >>> print(type(gr_liv_area_min_max))
    <class 'dict'>
    >>> print(gr_liv_area_min_max['min'])
    334.0
    >>> print(gr_liv_area_min_max['max'])
    5642.0
    """
    ### BEGIN SOLUTION
    ### END SOLUTION

## Given `house-prices-train.csv` in working directory, extract `GrLivArea` as the feature matrix and apply `PolynomialFeatures` on it with a hyperparameter `degree=3` adding intercepts, linear, and non-linear attributes to the feature matrix. 

- Expected inputs: a CSV file `house-prices-train.csv`.
- Expected outputs: a (1460, 4) ndarray.

In [6]:
def get_polynomial_gr_liv_area(csv_file):
    """
    >>> polynomial_gr_liv_area = get_polynomial_gr_liv_area('house-prices-train.csv')
    >>> print(type(polynomial_gr_liv_area))
    <class 'numpy.ndarray'>
    >>> print(polynomial_gr_liv_area.shape)
    (1460, 4)
    >>> print(polynomial_gr_liv_area)
    array([[1.00000000e+00, 1.71000000e+03, 2.92410000e+06, 5.00021100e+09],
           [1.00000000e+00, 1.26200000e+03, 1.59264400e+06, 2.00991673e+09],
           [1.00000000e+00, 1.78600000e+03, 3.18979600e+06, 5.69697566e+09],
           ...,
           [1.00000000e+00, 2.34000000e+03, 5.47560000e+06, 1.28129040e+10],
           [1.00000000e+00, 1.07800000e+03, 1.16208400e+06, 1.25272655e+09],
           [1.00000000e+00, 1.25600000e+03, 1.57753600e+06, 1.98138522e+09]])
    """
    ### BEGIN SOLUTION
    ### END SOLUTION

## Run tests!

Kernel -> Restart & Run All.

In [7]:
import unittest

class TestUseOfTransformers(unittest.TestCase):
    def test_get_standard_scaled_fare(self):
        standard_scaled_fare = get_standard_scaled_fare('titanic-train.csv')
        self.assertIsInstance(standard_scaled_fare, np.ndarray)
        self.assertEqual(standard_scaled_fare.shape, (891, 1))
        self.assertAlmostEqual(standard_scaled_fare.std(), 1.0)
        self.assertAlmostEqual(standard_scaled_fare.max(), 9.667166525013505)
        self.assertAlmostEqual(standard_scaled_fare.min(), -0.6484216535389205)
    def test_get_fare_mean_std(self):
        fare_mean_std = get_fare_mean_std('titanic-train.csv')
        self.assertIsInstance(fare_mean_std, dict)
        self.assertAlmostEqual(fare_mean_std['mean'], 32.20420797)
        self.assertAlmostEqual(fare_mean_std['std'], 49.66553444)
    def test_get_min_max_scaled_gr_liv_area(self):
        min_max_scaled_gr_liv_area = get_min_max_scaled_gr_liv_area('house-prices-train.csv')
        self.assertIsInstance(min_max_scaled_gr_liv_area, np.ndarray)
        self.assertAlmostEqual(min_max_scaled_gr_liv_area.max(), 1.0)
        self.assertAlmostEqual(min_max_scaled_gr_liv_area.min(), 0.0)
    def test_get_gr_liv_area_min_max(self):
        gr_liv_area_min_max = get_gr_liv_area_min_max('house-prices-train.csv')
        self.assertIsInstance(gr_liv_area_min_max, dict)
        self.assertAlmostEqual(gr_liv_area_min_max['min'], 334.0)
        self.assertAlmostEqual(gr_liv_area_min_max['max'], 5642.0)
    def test_get_polynomial_gr_liv_area(self):
        polynomial_gr_liv_area = get_polynomial_gr_liv_area('house-prices-train.csv')
        self.assertIsInstance(polynomial_gr_liv_area, np.ndarray)
        self.assertEqual(polynomial_gr_liv_area.shape, (1460, 4))
        self.assertAlmostEqual(polynomial_gr_liv_area[:, 0].mean(), 1.0)
        self.assertAlmostEqual(polynomial_gr_liv_area[:, 1].mean(), 1515.463698630137)
        self.assertAlmostEqual(polynomial_gr_liv_area[:, 2].mean(), 2572570.7253424656)
        self.assertAlmostEqual(polynomial_gr_liv_area[:, 3].mean(), 4932874793.497945)

suite = unittest.TestLoader().loadTestsFromTestCase(TestUseOfTransformers)
runner = unittest.TextTestRunner(verbosity=2)
test_results = runner.run(suite)
number_of_failures = len(test_results.failures)
number_of_errors = len(test_results.errors)
number_of_test_runs = test_results.testsRun
number_of_successes = number_of_test_runs - (number_of_failures + number_of_errors)
total_points = number_of_successes * 2

test_get_fare_mean_std (__main__.TestUseOfTransformers) ... FAIL
test_get_gr_liv_area_min_max (__main__.TestUseOfTransformers) ... FAIL
test_get_min_max_scaled_gr_liv_area (__main__.TestUseOfTransformers) ... FAIL
test_get_polynomial_gr_liv_area (__main__.TestUseOfTransformers) ... FAIL
test_get_standard_scaled_fare (__main__.TestUseOfTransformers) ... FAIL

FAIL: test_get_fare_mean_std (__main__.TestUseOfTransformers)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-7-6489ad9cd8a9>", line 13, in test_get_fare_mean_std
    self.assertIsInstance(fare_mean_std, dict)
AssertionError: None is not an instance of <class 'dict'>

FAIL: test_get_gr_liv_area_min_max (__main__.TestUseOfTransformers)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-7-6489ad9cd8a9>", line 23, in test_get_gr_liv_area_min_max
    self.assertIsInstance(gr_liv_

In [8]:
print("You've got {} successes out of {} exercises.".format(number_of_successes, number_of_test_runs))

You've got 0 successes out of 5 exercises.
