### Linear Regression with Regularization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, preprocessing, linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Load the Boston housing data from the whitespace-delimited text file at
# data_url, skipping the first 22 lines of the file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" must reach the regex engine verbatim. In a normal string
# literal it is an invalid escape sequence, which newer Python versions warn
# about.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Even-indexed rows contribute all of their columns and odd-indexed rows
# contribute their first two columns to the feature matrix; the third column
# of the odd-indexed rows is the regression target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
X = data
y = target

# Rescale the features with min-max scaling.
# NOTE(review): the scaling is computed on the full dataset, before any
# train/test split, so held-out rows influence it - confirm this is intended.
X_norm = preprocessing.minmax_scale(X)
X = X_norm

print("shape of X =", X.shape)
print("shape of y =", y.shape)

In [None]:
# Optional (disabled): polynomial feature expansion - append the square and
# the cube of every feature column to X before splitting, e.g.
#     X = np.hstack([X, X**2, X**3])

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=100
)

# Ridge regression with a fixed regularization strength.
model = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
y_predict = model.predict(X_test)

# Mean squared error on the training and testing splits.
train_residuals = y_train - model.predict(X_train)
test_residuals = y_test - y_predict
mse_train = np.mean(train_residuals**2)
mse_test = np.mean(test_residuals**2)
print("MSE on training data=", mse_train)
print("MSE on testing data=", mse_test)

### Hyperparameter selection - validation set

In [None]:
# First hold out 20% of the data for testing, then set aside 15% of the
# remaining training data as a validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=100
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.15, random_state=100
)

# Report the size of each split.
for split_name, split_X in (
    ("X_train", X_train),
    ("X_valid", X_valid),
    ("X_test", X_test),
):
    print(split_name + " shape = ", split_X.shape)

In [None]:
# Candidate regularization strengths to compare on the validation set.
alpha_test = [0, 0.01, 0.1, 1, 10, 100, 1000, 10000]

# Maps each alpha to the validation MSE of a Ridge model trained with it.
mse_validation_all = {}
for alpha in alpha_test:
    model = linear_model.Ridge(alpha=alpha).fit(X_train, y_train)
    valid_residuals = model.predict(X_valid) - y_valid
    mse_validation_all[alpha] = np.mean(valid_residuals**2)

In [None]:
# Display the validation MSE recorded for each candidate alpha.
mse_validation_all

In [None]:
# Pick the alpha with the lowest validation MSE rather than hard-coding it,
# so the final model always reflects the sweep results. On ties, the first
# alpha that was tried wins.
best_alpha = min(mse_validation_all, key=mse_validation_all.get)
print("selected alpha =", best_alpha)

# Train the final model with the selected alpha and predict on the test set.
final_model = linear_model.Ridge(alpha=best_alpha)
final_model = final_model.fit(X_train, y_train)
y_predict = final_model.predict(X_test)

# measure performance
mse_train = np.mean((y_train - final_model.predict(X_train))**2)
mse_test = np.mean((y_test - y_predict)**2)
print("MSE on training data=", mse_train)
print("MSE on testing data=", mse_test)

# Classification

## Example 4 - Logistic Regression, Breast Cancer Classification

In [None]:
# Load the breast cancer dataset and pull out the feature matrix and labels.
data = datasets.load_breast_cancer()
X, y = data['data'], data['target']

# Summarize what was loaded.
print("shape of X =", X.shape)
print("shape of y =", y.shape)
print("feature names:", data["feature_names"])

In [None]:
# Display the label array.
y

In [None]:
# Tabular preview of the raw features, one named column per feature.
df = pd.DataFrame(
    data=data['data'],
    columns=data["feature_names"],
)
df.head()

### Normalization

In [None]:
# Rescale the features with min-max scaling and preview the result.
# NOTE(review): the scaling is computed on the full dataset, before the
# train/test split, so held-out rows influence it - confirm this is intended.
X_norm = preprocessing.minmax_scale(X)
df = pd.DataFrame(
    data=X_norm,
    columns=data["feature_names"],
)
df.head()

In [None]:
# Hold out 20% of the normalized samples for testing; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.20,
    random_state=100,
)

In [None]:
# Fit a logistic-regression classifier on the training split and predict the
# labels of the test split.
model = linear_model.LogisticRegression(
    C=100, fit_intercept=True, solver='lbfgs', max_iter=100
).fit(X_train, y_train)
y_predict = model.predict(X_test)

# Accuracy: percentage of samples whose predicted label matches the true one.
train_hits = y_train == model.predict(X_train)
test_hits = y_test == y_predict
acc_training = np.mean(train_hits)*100
acc_testing = np.mean(test_hits)*100
print("Training accuracy (%) =", acc_training)
print("Testing accuracy (%) =", acc_testing)

In [None]:
# Display the true labels of the test split.
y_test

In [None]:
# Display the labels predicted for the test split.
y_predict