# Linear Models

## Linear Regression

### Dataset Load - extended_boston Dataset

In [None]:
from mglearn.datasets import load_extended_boston

In [None]:
# Load the feature matrix X and the regression target y through mglearn's helper.
X, y = load_extended_boston()

In [None]:
# Use this instead when mglearn is not installed.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, so only
# enable it for a data file you trust.

# import numpy as np
# data = np.load('./data/extended_boston_dataset.npy', allow_pickle=True)
# X = data[:,:-1]
# y = data[:,-1]

In [None]:
# Quick sanity check of the loaded arrays: print both shapes and the raw target values.
print('shape of X:', X.shape)
print('shape of y', y.shape)
print('y:', y)

### Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

### Learning

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Plain (unregularized) linear regression with scikit-learn's default settings.
reg_linear = LinearRegression()

In [None]:
# Fit the model on the scaled training split.
reg_linear.fit(X_train_scaled, y_train)

### Inference & Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Score the linear model on the training split: MAE, RMSE (square root of the
# MSE) and R^2, each printed with five decimals.
y_train_hat = reg_linear.predict(X_train_scaled)
mae = mean_absolute_error(y_train, y_train_hat)
rmse = mean_squared_error(y_train, y_train_hat) ** 0.5
r2 = r2_score(y_train, y_train_hat)
print('train MAE: %.5f' % mae)
print('train RMSE: %.5f' % rmse)
print('train R_square: %.5f' % r2)

In [None]:
# Score the linear model on the held-out test split: MAE, RMSE (square root of
# the MSE) and R^2, each printed with five decimals.
y_test_hat = reg_linear.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_hat)
rmse = mean_squared_error(y_test, y_test_hat) ** 0.5
r2 = r2_score(y_test, y_test_hat)
print('test MAE: %.5f' % mae)
print('test RMSE: %.5f' % rmse)
print('test R_square: %.5f' % r2)

## Regularized Linear Regression - Ridge Regression

### Learning

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with regularization strength alpha=1, fitted on the scaled
# training split.
reg_ridge = Ridge(alpha=1)
reg_ridge.fit(X_train_scaled, y_train)

### Inference & Evaluation

In [None]:
# Score the ridge model on the training split: MAE, RMSE (square root of the
# MSE) and R^2, each printed with five decimals.
y_train_hat = reg_ridge.predict(X_train_scaled)
mae = mean_absolute_error(y_train, y_train_hat)
rmse = mean_squared_error(y_train, y_train_hat) ** 0.5
r2 = r2_score(y_train, y_train_hat)
print('train MAE: %.5f' % mae)
print('train RMSE: %.5f' % rmse)
print('train R_square: %.5f' % r2)

In [None]:
# Score the ridge model on the held-out test split: MAE, RMSE (square root of
# the MSE) and R^2, each printed with five decimals.
y_test_hat = reg_ridge.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_hat)
rmse = mean_squared_error(y_test, y_test_hat) ** 0.5
r2 = r2_score(y_test, y_test_hat)
print('test MAE: %.5f' % mae)
print('test RMSE: %.5f' % rmse)
print('test R_square: %.5f' % r2)

### Hyperparameter search (alpha)

In [None]:
# Empty accumulators for the per-alpha R^2 scores (two independent lists).
training_r2, test_r2 = [], []

In [None]:
# Candidate regularization strengths for the Ridge sweep below.
# NOTE(review): scikit-learn's Ridge documentation advises against alpha=0
# (LinearRegression is recommended instead); presumably kept here as an
# unregularized baseline -- confirm.
alpha_settings = [0, 0.1, 1, 10, 100]

In [None]:
# Sweep the Ridge regularization strength: fit one model per value in
# alpha_settings and record its train and test R^2.
# The result lists are (re)created in this cell so that running it more than
# once does not append a second set of scores; stale entries would make the
# lists longer than alpha_settings and break the DataFrame built from them.
training_r2 = []
test_r2 = []

for alpha in alpha_settings:
    reg = Ridge(alpha=alpha)
    reg.fit(X_train_scaled, y_train)

    y_train_hat = reg.predict(X_train_scaled)
    training_r2.append(r2_score(y_train, y_train_hat))

    y_test_hat = reg.predict(X_test_scaled)
    test_r2.append(r2_score(y_test, y_test_hat))

In [None]:
import pandas as pd

In [None]:
# Tabulate the Ridge sweep: one row per alpha with its train and test R^2.
ridge_columns = {
    'alpha': alpha_settings,
    'train r2': training_r2,
    'test r2': test_r2,
}
result_df = pd.DataFrame(ridge_columns)

In [None]:
# Render the results table. `display` is not imported in this file; presumably
# the builtin provided by the IPython/Jupyter environment.
display(result_df)

## Regularized Linear Regression - Lasso Regression

### Learning

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression with regularization strength alpha=1, fitted on the scaled
# training split.
reg_lasso = Lasso(alpha=1)
reg_lasso.fit(X_train_scaled, y_train)

### Inference & Evaluation

In [None]:
# Score the lasso model on the training split: MAE, RMSE (square root of the
# MSE) and R^2, each printed with five decimals.
y_train_hat = reg_lasso.predict(X_train_scaled)
mae = mean_absolute_error(y_train, y_train_hat)
rmse = mean_squared_error(y_train, y_train_hat) ** 0.5
r2 = r2_score(y_train, y_train_hat)
print('train MAE: %.5f' % mae)
print('train RMSE: %.5f' % rmse)
print('train R_square: %.5f' % r2)

In [None]:
# Score the lasso model on the held-out test split: MAE, RMSE (square root of
# the MSE) and R^2, each printed with five decimals.
y_test_hat = reg_lasso.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_hat)
rmse = mean_squared_error(y_test, y_test_hat) ** 0.5
r2 = r2_score(y_test, y_test_hat)
print('test MAE: %.5f' % mae)
print('test RMSE: %.5f' % rmse)
print('test R_square: %.5f' % r2)

In [None]:
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Compare the fitted coefficients of the linear, ridge and lasso models on a
# single scatter plot (x = coefficient index, y = coefficient value).
# The number of points is taken from the fitted coefficient vector rather than
# from X: X is rebound to a different dataset later in the notebook, so sizing
# from X would no longer match the coefficients if this cell is re-run then.
num_features = reg_linear.coef_.shape[-1]
coef_index = np.arange(num_features)

plt.scatter(coef_index, reg_linear.coef_, c='red', alpha=0.5, label='linear regression')
plt.scatter(coef_index, reg_ridge.coef_, c='blue', alpha=0.5, label='ridge regression')
plt.scatter(coef_index, reg_lasso.coef_, c='green', alpha=0.5, label='lasso regression')

plt.xlabel('Coefficient index')
plt.ylabel('Coefficient magnitude')
plt.legend()
plt.show()

### Hyperparameter search (alpha)

In [None]:
# Empty accumulators for the Lasso sweep: non-zero coefficient counts and the
# train/test R^2 scores (three independent lists).
num_vars, training_r2, test_r2 = [], [], []

In [None]:
# Candidate regularization strengths for the Lasso sweep below, in increasing order.
alpha_settings = [0.0001, 0.001, 0.01, 0.1, 1]

In [None]:
# Sweep the Lasso regularization strength: fit one model per value in
# alpha_settings and record how many coefficients stay non-zero together with
# the train and test R^2.
# The result lists are (re)created in this cell so that running it more than
# once does not append a second set of results; stale entries would make the
# lists longer than alpha_settings and break the DataFrame built from them.
num_vars = []
training_r2 = []
test_r2 = []

for alpha in alpha_settings:
    # The default iteration cap may be too low for the smallest alphas
    # (scikit-learn would then emit a ConvergenceWarning); a higher cap does
    # not change fits that already converge.
    reg = Lasso(alpha=alpha, max_iter=100000)
    reg.fit(X_train_scaled, y_train)

    # Vectorized count of the coefficients that were not shrunk to zero.
    num_vars.append(int((reg.coef_ != 0).sum()))

    y_train_hat = reg.predict(X_train_scaled)
    training_r2.append(r2_score(y_train, y_train_hat))

    y_test_hat = reg.predict(X_test_scaled)
    test_r2.append(r2_score(y_test, y_test_hat))

In [None]:
# Tabulate the Lasso sweep: one row per alpha with the number of non-zero
# coefficients and the train and test R^2.
lasso_columns = {
    'alpha': alpha_settings,
    'no. features used': num_vars,
    'train r2': training_r2,
    'test r2': test_r2,
}
result_df = pd.DataFrame(lasso_columns)

In [None]:
# Render the results table. `display` is not imported in this file; presumably
# the builtin provided by the IPython/Jupyter environment.
display(result_df)

## Logistic Regression

### Dataset Load - breast_cancer dataset

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load scikit-learn's bundled breast cancer dataset object.
breast_cancer_dataset = load_breast_cancer()

In [None]:
# Pull the feature matrix and the label vector out of the dataset object.
# Note that this rebinds X and y, replacing the regression data used above.
X = breast_cancer_dataset.data
y = breast_cancer_dataset.target

In [None]:
# Quick sanity check of the loaded arrays: print both shapes and the raw labels.
print('shape of X:', X.shape)
print('shape of y', y.shape)
print('y:', y)

### Data Preprocessing

In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Learning

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier; C is scikit-learn's inverse regularization
# strength (smaller C means stronger regularization).
clf = LogisticRegression(C=1)

In [None]:
# Fit the classifier on the scaled training split.
clf.fit(X_train_scaled, y_train)

### Inference & Evaluation

In [None]:
# Predict labels for the training split and print them alongside the true labels.
y_train_hat = clf.predict(X_train_scaled)
print('ground truth of y_train:', y_train)
print('prediction result of y_train:', y_train_hat)

In [None]:
# Predict labels for the held-out test split and print them alongside the true labels.
y_test_hat = clf.predict(X_test_scaled)
print('ground truth of y_test:', y_test)
print('prediction result of y_test:', y_test_hat)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the training split, using the predictions already stored in y_train_hat.
y_train_accuracy = accuracy_score(y_train, y_train_hat)
print('train_accuracy:', y_train_accuracy)

In [None]:
# Accuracy on the held-out test split, using the predictions already stored in y_test_hat.
y_test_accuracy = accuracy_score(y_test, y_test_hat)
print('test_accuracy:', y_test_accuracy)

### Hyperparameter search (C)

In [None]:
# Empty accumulators for the per-C accuracy scores (two independent lists).
training_accuracy, test_accuracy = [], []

In [None]:
# Candidate values of C (inverse regularization strength) for the sweep below,
# spanning 0.01 to 10000 in powers of ten.
C_settings = [0.01, 0.1, 1, 10, 100, 1000, 10000]

In [None]:
# Sweep the logistic-regression C parameter: fit one classifier per value in
# C_settings and record its train and test accuracy.
# The result lists are (re)created in this cell so that running it more than
# once does not append a second set of scores; stale entries would make the
# lists longer than C_settings and break the DataFrame built from them.
training_accuracy = []
test_accuracy = []

for C in C_settings:
    # Weakly regularized fits (large C) may need more than the default number
    # of solver iterations (scikit-learn would emit a ConvergenceWarning); a
    # higher cap does not change fits that already converge.
    clf = LogisticRegression(C=C, max_iter=10000)
    clf.fit(X_train_scaled, y_train)

    y_train_hat = clf.predict(X_train_scaled)
    training_accuracy.append(accuracy_score(y_train, y_train_hat))

    y_test_hat = clf.predict(X_test_scaled)
    test_accuracy.append(accuracy_score(y_test, y_test_hat))

In [None]:
# Tabulate the C sweep: one row per C value with its train and test accuracy.
logistic_columns = {
    'C': C_settings,
    'train accuracy': training_accuracy,
    'test accuracy': test_accuracy,
}
result_df = pd.DataFrame(logistic_columns)

In [None]:
# Render the results table. `display` is not imported in this file; presumably
# the builtin provided by the IPython/Jupyter environment.
display(result_df)