# Python 機器學習從零至一 

> 類別預測的任務

[數據交點](https://www.datainpoint.com) | 郭耀仁 <yaojenkuo@datainpoint.com>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

## Instructions

- The session will be disconnected after idling for more than 10 minutes; you can start a new session by clicking the assignment link again.
- We've imported necessary modules at the top of each assignment.
- We've put the necessary files (if any) in `/home/jovyan/data`.
- We've defined the names of functions/inputs/parameters for you.
- Write down your solution between the comments `### BEGIN SOLUTION` and `### END SOLUTION`.
- It is NECESSARY to `return` the answer; tests will fail if you only print it.
- `SyntaxError` and `IndentationError` are known to break our `test_runner.py` and result in a grade of zero. We highly recommend testing your solution by calling the functions/methods in the notebook or by running the tests before submission.
- Run the tests to check whether your solutions are right:
    - File -> Save Notebook to save `exercises.ipynb`.
    - File -> New -> Terminal to open a Terminal.
    - Use the command `python 04-exercises/test_runner.py` to run the tests.

## 01. Given `/home/jovyan/data/titanic/test.csv` and `/home/jovyan/data/titanic/train.csv`, create a chimp model to predict `Survived` for each observation in `test.csv`.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [2]:
def predict_survived_chimp_model(
    test_csv_path: str = "/home/jovyan/data/titanic/test.csv",
) -> np.ndarray:
    """
    Predict `Survived` for every row of the test set by flipping a coin.

    The "chimp" model ignores every feature: it only counts the rows of the
    CSV and draws 0 or 1 uniformly at random for each of them. Draws come
    from NumPy's global random state, so results differ between calls unless
    the caller seeds it with `np.random.seed` first.

    Args:
        test_csv_path: Path of the CSV whose rows are to be predicted.
            Defaults to the location used by the assignment.

    Returns:
        A 1-D integer array of zeros and ones, one entry per row of the CSV.

    >>> survived_chimp_model = predict_survived_chimp_model()
    >>> type(survived_chimp_model)
    <class 'numpy.ndarray'>
    >>> survived_chimp_model.shape
    (418,)
    """
    ### BEGIN SOLUTION
    test = pd.read_csv(test_csv_path)
    # `high` is exclusive, so every draw is either 0 or 1.
    return np.random.randint(low=0, high=2, size=len(test))
    ### END SOLUTION

## 02. Given `/home/jovyan/data/titanic/test.csv` and `/home/jovyan/data/titanic/train.csv`, create an expert model to predict `Survived` according to `Sex` and `Age` for each observation in `test.csv`.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [3]:
def predict_survived_expert_model(
    test_csv_path: str = "/home/jovyan/data/titanic/test.csv",
) -> np.ndarray:
    """
    Predict `Survived` with a hand-written rule based on `Sex` and `Age`.

    A passenger is predicted to survive (1) when `Sex` is "female" or `Age`
    is below 12; everyone else is predicted not to survive (0). A missing
    `Age` never counts as "below 12", so such rows are decided by `Sex` alone.

    Args:
        test_csv_path: Path of the CSV whose rows are to be predicted; it
            must contain the columns `Sex` and `Age`. Defaults to the
            location used by the assignment.

    Returns:
        A 1-D integer array of zeros and ones, one entry per row of the CSV.

    >>> survived_expert_model = predict_survived_expert_model()
    >>> type(survived_expert_model)
    <class 'numpy.ndarray'>
    >>> survived_expert_model.shape
    (418,)
    """
    ### BEGIN SOLUTION
    test = pd.read_csv(test_csv_path)
    is_female = test["Sex"] == "female"
    # Comparisons against NaN are False, so unknown ages are not "children".
    is_child = test["Age"] < 12
    return (is_female | is_child).to_numpy().astype(int)
    ### END SOLUTION

## 03. Given `/home/jovyan/data/titanic/train.csv`, extract `Pclass`, `Sex`, and `Age` as the feature matrix and apply Scikit-Learn's `LogisticRegression` to predict `Survived` with `C=1e6`. Return the intercept and coefficients in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [4]:
def predict_survived_sklearn_model() -> np.ndarray:
    """
    Fit Scikit-Learn's `LogisticRegression` (with `C=1e6`) on the Titanic
    training data and return its fitted parameters.

    The feature matrix has three columns: `Pclass`, `Sex` (encoded as
    female=0, male=1) and `Age` (missing values replaced by the mean age).

    Returns:
        A flat array holding the intercept followed by the coefficients of
        `Pclass`, `Sex` and `Age`.

    >>> survived_sklearn_model = predict_survived_sklearn_model()
    >>> type(survived_sklearn_model)
    <class 'numpy.ndarray'>
    >>> survived_sklearn_model
    array([ 4.73195566, -1.16846277, -2.61196369, -0.03342722])
    """
    ### BEGIN SOLUTION
    titanic = pd.read_csv("/home/jovyan/data/titanic/train.csv")
    pclass = titanic["Pclass"].values
    # Encode the two sexes as integers so they can enter the feature matrix.
    sex_code = titanic["Sex"].map({"female": 0, "male": 1}).values
    # Replace unknown ages with the average of the known ones.
    age = titanic["Age"].fillna(titanic["Age"].mean()).values
    features = np.column_stack((pclass, sex_code, age))
    target = titanic["Survived"].values
    classifier = LogisticRegression(C=1e6)
    classifier.fit(features, target)
    # Flatten the coefficients and put the intercept in front of them.
    return np.insert(classifier.coef_, 0, classifier.intercept_)
    ### END SOLUTION

## 04. Given `/home/jovyan/data/titanic/train.csv`, extract `Pclass`, `Sex`, and `Age` as the feature matrix and apply a self-defined gradient descent model to predict `Survived`. Return the intercept and coefficients in an array.

- Expected inputs: None.
- Expected outputs: `ndarray`.

In [5]:
def predict_survived_gradient_descent_model(
    train_csv_path: str = "/home/jovyan/data/titanic/train.csv",
    *,
    epochs: int = 50000,
    learning_rate: float = 0.01,
    verbose: bool = True,
) -> np.ndarray:
    """
    Fit a logistic regression for `Survived` with batch gradient descent.

    The feature matrix is made of an intercept column of ones followed by
    `Pclass`, `Sex` (encoded as female=0, male=1) and `Age` (missing values
    replaced by the mean age). Weights start from `np.random.rand`, i.e.
    NumPy's global random state, so results vary between calls unless the
    caller seeds it with `np.random.seed` first.

    Args:
        train_csv_path: Path of the training CSV; it must contain the
            columns `Pclass`, `Sex`, `Age` and `Survived`. Defaults to the
            location used by the assignment.
        epochs: Number of gradient descent updates to perform.
        learning_rate: Step size applied to the gradient at every update.
        verbose: When true, print the cross-entropy loss every 5000 epochs.

    Returns:
        A 1-D array holding the intercept followed by the weights of
        `Pclass`, `Sex` and `Age`.

    >>> survived_gradient_descent_model = predict_survived_gradient_descent_model()
    >>> type(survived_gradient_descent_model)
    <class 'numpy.ndarray'>
    >>> survived_gradient_descent_model.shape
    (4,)
    >>> survived_gradient_descent_model[1] < 0
    True
    >>> survived_gradient_descent_model[2] < 0
    True
    >>> survived_gradient_descent_model[3] < 0
    True
    """
    ### BEGIN SOLUTION
    train = pd.read_csv(train_csv_path)
    sex_int = train["Sex"].map({"female": 0, "male": 1}).values
    age_filled = train["Age"].fillna(train["Age"].mean()).values
    y = train["Survived"].values
    m = y.shape[0]
    # The leading column of ones makes w[0] act as the intercept.
    X = np.column_stack((np.ones(m), train["Pclass"].values, sex_int, age_filled))
    X_T = np.transpose(X)  # does not change between epochs
    w = np.random.rand(X.shape[1])
    epsilon = 1e-6  # keeps log() away from zero when p_hat saturates
    report_every = 5000
    for i in range(epochs):
        p_hat = 1 / (1 + np.exp(-np.dot(X, w)))
        # The loss is only needed for the progress report, not for the update.
        if verbose and i % report_every == 0:
            cost_y1 = -np.dot(y, np.log(p_hat + epsilon))
            cost_y0 = -np.dot(1 - y, np.log(1 - p_hat + epsilon))
            cross_entropy = (cost_y1 + cost_y0) / m
            print("epoch: {:6} - loss: {:.6f}".format(i, cross_entropy))
        gradient = (1/m) * np.dot(X_T, p_hat - y)
        w -= learning_rate * gradient
    return w
    ### END SOLUTION