# Python 機器學習從零至一 

> 數值預測的任務

[數據交點](https://www.datainpoint.com) | 郭耀仁 <yaojenkuo@datainpoint.com>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

## Instructions

- The assignment session will be disconnected after idling for more than 10 minutes; you can start a new session by clicking the assignment link again.
- We've imported necessary modules at the top of each assignment.
- We've put the necessary files (if any) in `/home/jovyan/data`.
- We've defined the names of functions/inputs/parameters for you.
- Write down your solution between the comments `### BEGIN SOLUTION` and `### END SOLUTION`.
- It is NECESSARY to `return` the answer; tests will fail if the answer is only printed.
- It is known that a `SyntaxError` or an `IndentationError` might break our `test_runner.py` and result in a zero-point grade. It is highly recommended to test your solution by calling the functions/methods in the notebook or by running the tests before submission.
- Running tests to see if your solutions are right:
    - File -> Save Notebook to save `exercises.ipynb`.
    - File -> New -> Terminal to open a Terminal.
    - Use the command `python 03-exercises/test_runner.py` to run the tests.

## 01. Given `/home/jovyan/data/house-prices/test.csv` and `/home/jovyan/data/house-prices/train.csv`, create a chimp model to predict `SalePrice` for each observation in `test.csv`.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [2]:
def predict_sale_price_chimp_model() -> np.ndarray:
    """Guess a random `SalePrice` for every observation in `test.csv`.

    Each guess is an integer drawn uniformly from the closed interval between
    the lowest and the highest `SalePrice` found in `train.csv`.

    Returns:
        A 1-D integer array with one guess per row of `test.csv`.

    >>> sale_price_chimp_model = predict_sale_price_chimp_model()
    >>> isinstance(sale_price_chimp_model, np.ndarray)
    True
    >>> sale_price_chimp_model.shape
    (1459,)
    """
    ### BEGIN SOLUTION
    train = pd.read_csv("/home/jovyan/data/house-prices/train.csv")
    test = pd.read_csv("/home/jovyan/data/house-prices/test.csv")
    lowest_price = train["SalePrice"].min()
    highest_price = train["SalePrice"].max()
    # np.random.randint excludes `high`; adding one keeps the maximum price
    # reachable and avoids a ValueError when every training price is equal.
    y_hat = np.random.randint(
        low=lowest_price, high=highest_price + 1, size=test.shape[0]
    )
    return y_hat
    ### END SOLUTION

## 02. Given `/home/jovyan/data/house-prices/test.csv` and `/home/jovyan/data/house-prices/train.csv`, create an expert model to predict `SalePrice` according to `OverallQual` for each observation in `test.csv`.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [3]:
def predict_sale_price_expert_model() -> np.ndarray:
    """Predict `SalePrice` in `test.csv` from the house's `OverallQual`.

    The prediction for a test observation is the mean `SalePrice` of the
    training observations that share its `OverallQual`. When a test quality
    level does not occur in `train.csv` (or is missing), the overall mean
    training `SalePrice` is used instead of returning NaN.

    Returns:
        A 1-D float array with one prediction per row of `test.csv`.

    >>> sale_price_expert_model = predict_sale_price_expert_model()
    >>> isinstance(sale_price_expert_model, np.ndarray)
    True
    >>> sale_price_expert_model.shape
    (1459,)
    """
    ### BEGIN SOLUTION
    train = pd.read_csv("/home/jovyan/data/house-prices/train.csv")
    test = pd.read_csv("/home/jovyan/data/house-prices/test.csv")
    mean_sale_price_by_overall_qual = train.groupby("OverallQual")["SalePrice"].mean()
    y_hat = test["OverallQual"].map(mean_sale_price_by_overall_qual)
    # Series.map leaves NaN for quality levels unseen in training; fall back
    # to the global training mean so every observation gets a usable number.
    y_hat = y_hat.fillna(train["SalePrice"].mean())
    return y_hat.values
    ### END SOLUTION

## 03. Given `/home/jovyan/data/house-prices/train.csv`, extract `GrLivArea` as the feature matrix and apply Scikit-Learn's `LinearRegression` to predict `SalePrice`. Return the intercept and coefficient in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [4]:
def predict_sale_price_sklearn_model() -> np.ndarray:
    """Regress `SalePrice` on `GrLivArea` with Scikit-Learn's `LinearRegression`.

    Returns:
        A 1-D array holding the fitted intercept followed by the `GrLivArea`
        coefficient.

    >>> sale_price_sklearn_model = predict_sale_price_sklearn_model()
    >>> isinstance(sale_price_sklearn_model, np.ndarray)
    True
    >>> sale_price_sklearn_model
    array([18569.02585649,   107.13035897])
    """
    ### BEGIN SOLUTION
    houses = pd.read_csv("/home/jovyan/data/house-prices/train.csv")
    # The estimator expects a 2-D feature matrix, so lift the single feature
    # into a column vector.
    living_area = houses["GrLivArea"].values[:, np.newaxis]
    sale_price = houses["SalePrice"].values
    regressor = LinearRegression().fit(living_area, sale_price)
    return np.concatenate(([regressor.intercept_], regressor.coef_))
    ### END SOLUTION

## 04. Given `/home/jovyan/data/house-prices/train.csv`, extract `GrLivArea` as the feature matrix and apply your self-defined normal equation model to predict `SalePrice`. Return the intercept and coefficient in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [5]:
def predict_sale_price_normal_equation_model() -> np.ndarray:
    """Regress `SalePrice` on `GrLivArea` by solving the normal equation.

    A column of ones is prepended to the feature so the first weight acts as
    the intercept; the weights are then computed as (X^T X)^-1 X^T y.

    Returns:
        A 1-D array holding the intercept followed by the `GrLivArea`
        coefficient.

    >>> sale_price_normal_equation_model = predict_sale_price_normal_equation_model()
    >>> isinstance(sale_price_normal_equation_model, np.ndarray)
    True
    >>> sale_price_normal_equation_model
    array([18569.02585649,   107.13035897])
    """
    ### BEGIN SOLUTION
    houses = pd.read_csv("/home/jovyan/data/house-prices/train.csv")
    target = houses["SalePrice"].values
    living_area = houses["GrLivArea"].values.reshape(-1, 1)
    bias_column = np.ones((living_area.shape[0], 1))
    design = np.hstack((bias_column, living_area))
    gram = design.T.dot(design)      # X^T X
    moment = design.T.dot(target)    # X^T y
    weights = np.linalg.inv(gram).dot(moment)
    return weights.ravel()
    ### END SOLUTION

## 05. Given `/home/jovyan/data/nba/player_stats.csv` extract `apg` and `rpg` as the feature matrix and apply Scikit-Learn's `LinearRegression` to predict `heightMeters`. Return the intercept and coefficients in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [6]:
def predict_height_meters_sklearn_model() -> np.ndarray:
    """Regress `heightMeters` on `apg` and `rpg` with `LinearRegression`.

    Returns:
        A 1-D array holding the fitted intercept followed by the `apg` and
        `rpg` coefficients, in that order.

    >>> height_meters_sklearn_model = predict_height_meters_sklearn_model()
    >>> isinstance(height_meters_sklearn_model, np.ndarray)
    True
    >>> height_meters_sklearn_model
    array([ 1.95251514, -0.02754199,  0.02407157])
    """
    ### BEGIN SOLUTION
    players = pd.read_csv("/home/jovyan/data/nba/player_stats.csv")
    per_game_stats = players[["apg", "rpg"]].values
    heights = players["heightMeters"].values
    regressor = LinearRegression().fit(per_game_stats, heights)
    return np.concatenate(([regressor.intercept_], regressor.coef_))
    ### END SOLUTION

## 06. Given `/home/jovyan/data/nba/player_stats.csv` extract `apg` and `rpg` as the feature matrix and apply self-defined gradient descent model to predict `heightMeters`. Return the intercept and coefficients in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [7]:
def predict_height_meters_gradient_descent_model() -> np.ndarray:
    """Fit `heightMeters` on `apg` and `rpg` with batch gradient descent.

    Starting from random weights, the mean squared error of a linear model
    with an intercept term is minimised for 10000 epochs at a learning rate
    of 0.001. The current loss is printed every 1000 epochs.

    Returns:
        A 1-D array holding the intercept followed by the `apg` and `rpg`
        coefficients, in that order.

    >>> w = predict_height_meters_gradient_descent_model()  # doctest: +ELLIPSIS
    epoch:      0 - loss: ...
    >>> isinstance(w, np.ndarray)
    True
    >>> w.size
    3
    >>> bool(w[1] < 0)
    True
    >>> bool(w[2] > 0)
    True
    """
    ### BEGIN SOLUTION
    player_stats = pd.read_csv("/home/jovyan/data/nba/player_stats.csv")
    features = player_stats[["apg", "rpg"]].values
    m = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the intercept.
    X = np.concatenate((np.ones((m, 1)), features), axis=1)
    y = player_stats["heightMeters"].values
    w = np.random.rand(X.shape[1])
    epochs = 10000
    learning_rate = 0.001
    for epoch in range(epochs):
        residual = np.dot(X, w) - y
        if epoch % 1000 == 0:
            # The loss is only needed for reporting, so compute it on demand
            # instead of on every epoch.
            mse = residual.dot(residual) / m
            print("epoch: {:6} - loss: {:.6f}".format(epoch, mse))
        gradient = (2 / m) * np.dot(X.T, residual)
        w -= learning_rate * gradient
    return w
    ### END SOLUTION