##  Python Data Science

> Introduction to Machine Learning

Kuo, Yao-Jen <yaojenkuo@datainpoint.com> from [DATAINPOINT](https://www.datainpoint.com)

In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVC
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

## Given `house-prices-train.csv` in the working directory, extract `OverallQual` and `GrLivArea` as the feature matrix and `SalePrice` as the target vector. Split both the feature matrix and the target vector into an 80% training set and a 20% validation set, using the hyperparameter `random_state=42` to fix the randomness of the split. Fit `LinearRegression`, `Ridge`, and `XGBRegressor` on the training set to generate 3 fitted models. Use each model to predict `SalePrice` for the validation set and measure the mean squared error of the 3 models. Log the metrics in a dict keyed by model name.

- Expected inputs: a CSV file `house-prices-train.csv`.
- Expected outputs: a dict of length 3.

In [2]:
def get_models_mse(csv_file):
    """
    Compare three regressors on a hold-out split of the house-prices data.

    Reads `csv_file`, uses `OverallQual` and `GrLivArea` as features and
    `SalePrice` as the target, splits them 80/20 with `random_state=42`,
    fits LinearRegression, Ridge and XGBRegressor on the training part and
    returns the validation mean squared error of each, keyed by model name.

    >>> models_mse = get_models_mse('house-prices-train.csv')
    >>> print(type(models_mse))
    <class 'dict'>
    >>> print(len(models_mse))
    3
    >>> print(models_mse['LinearRegression'])
    1950070708.6601994
    >>> print(models_mse['Ridge'])
    1950170046.4195352
    >>> print(models_mse['XGBRegressor'])
    1536931894.3172483
    """
    ### BEGIN SOLUTION
    houses = pd.read_csv(csv_file)
    features = houses[['OverallQual', 'GrLivArea']].values
    target = houses['SalePrice'].values
    features_train, features_valid, target_train, target_valid = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    # Insertion order of this dict is also the key order of the returned dict.
    candidates = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'XGBRegressor': XGBRegressor(),
    }
    mse_by_model = {}
    for name, estimator in candidates.items():
        estimator.fit(features_train, target_train)
        predictions = estimator.predict(features_valid)
        mse_by_model[name] = mean_squared_error(target_valid, predictions)
    return mse_by_model
    ### END SOLUTION

## Following the previous question, choose the best of the 3 models (the lower the mean squared error, the better). Use the best model to predict `SalePrice` for `house-prices-test.csv`. Generate a (1459, 2) DataFrame that is ready for submission.

- Expected inputs: 2 CSV files `house-prices-train.csv` and `house-prices-test.csv`.
- Expected outputs: a (1459, 2) DataFrame.

In [3]:
def predict_sale_price(train_csv_file, test_csv_file):
    """
    Predict `SalePrice` for the test file with an XGBRegressor.

    The regressor is fitted on the 80% training part of a
    `train_test_split(..., test_size=0.2, random_state=42)` of the training
    file, using `OverallQual` and `GrLivArea` as features. The returned
    DataFrame has the columns `Id` (copied from the test file) and
    `SalePrice` (the predictions), one row per test row.

    >>> sale_price = predict_sale_price('house-prices-train.csv', 'house-prices-test.csv')
    >>> print(type(sale_price))
    <class 'pandas.core.frame.DataFrame'>
    >>> print(sale_price.shape)
    (1459, 2)
    >>> print(sale_price)
            Id      SalePrice
    0     1461  127400.171875
    1     1462  160125.781250
    2     1463  145218.718750
    3     1464  165095.468750
    4     1465  219274.078125
    ...    ...            ...
    1454  2915   85581.273438
    1455  2916   85581.273438
    1456  2917  140714.546875
    1457  2918  129047.656250
    1458  2919  219292.000000

    [1459 rows x 2 columns]
    """
    ### BEGIN SOLUTION
    feature_columns = ['OverallQual', 'GrLivArea']
    train_houses = pd.read_csv(train_csv_file)
    test_houses = pd.read_csv(test_csv_file)
    features = train_houses[feature_columns].values
    test_features = test_houses[feature_columns].values
    target = train_houses['SalePrice'].values
    # Only the training part of the split is used; the validation part is discarded.
    features_train, _, target_train, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    regressor = XGBRegressor()
    regressor.fit(features_train, target_train)
    predicted_prices = regressor.predict(test_features)
    return pd.DataFrame({
        'Id': test_houses['Id'].values,
        'SalePrice': predicted_prices,
    })
    ### END SOLUTION

## Given `titanic-train.csv` in the working directory, extract `Fare`, `Pclass`, and `Sex` as the feature matrix and `Survived` as the target vector. Encode `male` as 0 and `female` as 1. Split both the feature matrix and the target vector into an 80% training set and a 20% validation set, using the hyperparameter `random_state=42` to fix the randomness of the split. Fit `LogisticRegression`, `SVC(kernel='linear', probability=True)`, and `XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')` on the training set to generate 3 fitted models. Use each model to predict `Survived` for the validation set and measure the accuracy score of the 3 models. Log the metrics in a dict keyed by model name.

- Expected inputs: a CSV file `titanic-train.csv`.
- Expected outputs: a dict of length 3.

In [4]:
def get_models_accuracy_score(csv_file):
    """
    Compare three classifiers on a hold-out split of the Titanic data.

    Reads `csv_file`, uses `Fare`, `Pclass` and `Sex` (encoded as male=0,
    female=1) as features and `Survived` as the target, splits them 80/20
    with `random_state=42`, fits LogisticRegression, a linear SVC and
    XGBClassifier on the training part and returns the validation accuracy
    score of each.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        Validation accuracy keyed by 'LogisticRegression', 'SVC' and
        'XGBClassifier'.

    >>> models_accuracy_score = get_models_accuracy_score('titanic-train.csv')
    >>> print(type(models_accuracy_score))
    <class 'dict'>
    >>> print(len(models_accuracy_score))
    3
    >>> print(models_accuracy_score['LogisticRegression'])
    0.7821229050279329
    >>> print(models_accuracy_score['SVC'])
    0.7821229050279329
    >>> print(models_accuracy_score['XGBClassifier'])
    0.8268156424581006
    """
    ### BEGIN SOLUTION
    sex_dict = {
        'male': 0,
        'female': 1
    }
    df = pd.read_csv(csv_file)
    df['Sex_Encoded'] = df['Sex'].map(sex_dict)
    X = df[['Fare', 'Pclass', 'Sex_Encoded']].values
    y = df['Survived'].values
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    # Insertion order of this dict is also the key order of the returned dict.
    models = {
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC(kernel='linear', probability=True),
        'XGBClassifier': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    }
    metrics = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_hat = model.predict(X_valid)
        metrics[model_name] = accuracy_score(y_valid, y_hat)
    return metrics
    ### END SOLUTION

## Following the previous question, choose the best of the 3 models (the higher the accuracy score, the better). Use the best model to predict `Survived` for `titanic-test.csv`. Generate a (418, 2) DataFrame that is ready for submission.

- Expected inputs: 2 CSV files `titanic-train.csv` and `titanic-test.csv`.
- Expected outputs: a (418, 2) DataFrame.

In [5]:
def predict_survived(train_csv_file, test_csv_file):
    """
    Predict `Survived` for the test file with an XGBClassifier.

    The classifier is fitted on the 80% training part of a
    `train_test_split(..., test_size=0.2, random_state=42)` of the training
    file, using `Fare`, `Pclass` and `Sex` (encoded as male=0, female=1) as
    features. Missing `Fare` values in the test file are replaced by the
    mean of all non-missing fares pooled from both files.

    Parameters
    ----------
    train_csv_file : str
        Path of the training CSV file.
    test_csv_file : str
        Path of the test CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns `PassengerId` (copied from the test file) and `Survived`
        (the predictions), one row per test row.

    >>> survived = predict_survived('titanic-train.csv', 'titanic-test.csv')
    >>> print(type(survived))
    <class 'pandas.core.frame.DataFrame'>
    >>> print(survived.shape)
    (418, 2)
    >>> print(survived)
         PassengerId  Survived
    0            892         0
    1            893         1
    2            894         0
    3            895         0
    4            896         1
    ..           ...       ...
    413         1305         0
    414         1306         1
    415         1307         0
    416         1308         0
    417         1309         0

    [418 rows x 2 columns]
    """
    ### BEGIN SOLUTION
    sex_dict = {
        'male': 0,
        'female': 1
    }
    df = pd.read_csv(train_csv_file)
    df_test = pd.read_csv(test_csv_file)
    df['Sex_Encoded'] = df['Sex'].map(sex_dict)
    df_test['Sex_Encoded'] = df_test['Sex'].map(sex_dict)
    # missing value impute
    # `.sum()` skips NaN, so the denominator must count only non-null fares in
    # BOTH frames; counting every training row would bias the mean low
    # whenever the training file has missing fares. With no missing training
    # fares the value is identical to dividing by the row count.
    fare_total = df['Fare'].sum() + df_test['Fare'].sum()
    fare_count = df['Fare'].notnull().sum() + df_test['Fare'].notnull().sum()
    mean_fare = fare_total / fare_count
    df_test['Fare_Imputed'] = df_test['Fare'].fillna(mean_fare)
    X = df[['Fare', 'Pclass', 'Sex_Encoded']].values
    X_test = df_test[['Fare_Imputed', 'Pclass', 'Sex_Encoded']].values
    y = df['Survived'].values
    # Only the training part of the split is used; the validation part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    return pd.DataFrame({
        'PassengerId': df_test['PassengerId'].values,
        'Survived': y_hat,
    })
    ### END SOLUTION

## Run tests!

Kernel -> Restart & Run All.

In [6]:
import unittest

class TestCompareModels(unittest.TestCase):
    def test_get_models_mse(self):
        models_mse = get_models_mse('house-prices-train.csv')
        self.assertIsInstance(models_mse, dict)
        self.assertEqual(len(models_mse), 3)
        self.assertTrue('LinearRegression' in models_mse)
        self.assertTrue('Ridge' in models_mse)
        self.assertTrue('XGBRegressor' in models_mse)
        
    def test_predict_sale_price(self):
        sale_price = predict_sale_price('house-prices-train.csv', 'house-prices-test.csv')
        self.assertIsInstance(sale_price, pd.core.frame.DataFrame)
        self.assertEqual(sale_price.shape, (1459, 2))
        
    def test_get_models_accuracy_score(self):
        models_accuracy_score = get_models_accuracy_score('titanic-train.csv')
        self.assertIsInstance(models_accuracy_score, dict)
        self.assertEqual(len(models_accuracy_score), 3)
        self.assertTrue('LogisticRegression' in models_accuracy_score)
        self.assertTrue('SVC' in models_accuracy_score)
        self.assertTrue('XGBClassifier' in models_accuracy_score)
        
    def test_predict_survived(self):
        survived = predict_survived('titanic-train.csv', 'titanic-test.csv')
        self.assertIsInstance(survived, pd.core.frame.DataFrame)
        self.assertEqual(survived.shape, (418, 2))

# Run every test in TestCompareModels with verbose per-test output.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCompareModels)
runner = unittest.TextTestRunner(verbosity=2)
test_results = runner.run(suite)

# A test counts as a success when it neither failed nor errored.
number_of_test_runs = test_results.testsRun
number_of_failures, number_of_errors = len(test_results.failures), len(test_results.errors)
number_of_successes = number_of_test_runs - number_of_failures - number_of_errors
# Each successful test is worth 2 points.
total_points = 2 * number_of_successes

test_get_models_accuracy_score (__main__.TestCompareModels) ... ok
test_get_models_mse (__main__.TestCompareModels) ... ok
test_predict_sale_price (__main__.TestCompareModels) ... ok
test_predict_survived (__main__.TestCompareModels) ... ok

----------------------------------------------------------------------
Ran 4 tests in 68.277s

OK


In [7]:
# Report how many of the tests run in the previous cell succeeded.
success_report = "You've got {} successes out of {} exercises.".format(
    number_of_successes,
    number_of_test_runs,
)
print(success_report)

You've got 4 successes out of 4 exercises.


## Generate submission CSV files.

In [8]:
# Build both prediction frames first, then write each one without its index
# so the CSV holds exactly the two submission columns.
sale_price = predict_sale_price('house-prices-train.csv', 'house-prices-test.csv')
survived = predict_survived('titanic-train.csv', 'titanic-test.csv')
for submission, csv_path in (
    (sale_price, 'house-prices-submission.csv'),
    (survived, 'titanic-submission.csv'),
):
    submission.to_csv(csv_path, index=False)