In [22]:
import numpy as np
import pandas as pd
import sklearn.datasets as sd
import sklearn.model_selection as sms
import matplotlib.pyplot as plt
import math
import random

# Load the experiment data (svmlight/libsvm text format, 13 features)
x, y = sd.load_svmlight_file('../data/housing_scale.txt', n_features=13)

# Split the data into a training set and a validation set
x_train, x_valid, y_train, y_valid = sms.train_test_split(x, y)

# Convert the sparse feature matrices to dense ndarrays
x_train, x_valid = x_train.toarray(), x_valid.toarray()

# Turn the target vectors into single-column matrices
y_train = y_train.reshape(-1, 1)
y_valid = y_valid.reshape(-1, 1)

# Prepend a column of ones to each feature matrix (bias / intercept term)
x_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))
x_valid = np.hstack((np.ones((x_valid.shape[0], 1)), x_valid))


# Mean squared error loss
def mean_squared_error(X, y, theta):
    """Return the mean squared error of the linear model ``X.dot(theta)``.

    Args:
        X: Design matrix; any object whose ``.dot(theta)`` yields the
            predictions.
        y: Target values. May be a flat vector or a column vector; if its
            shape differs from the predictions' shape but it holds the same
            number of elements, it is reshaped to match.
        theta: Model coefficients.

    Returns:
        The mean of the squared residuals.
    """
    predictions = np.asarray(X.dot(theta))
    targets = np.asarray(y)
    # A (n, 1) prediction minus a (n,) target would broadcast to an (n, n)
    # matrix and silently produce a wrong loss, so align the shapes first.
    if targets.shape != predictions.shape and targets.size == predictions.size:
        targets = targets.reshape(predictions.shape)
    residuals = predictions - targets
    return np.mean(np.square(residuals))

# Mean absolute error loss
def mean_absolute_error(X, y, theta):
    """Return the mean absolute error of the linear model ``X.dot(theta)``.

    Args:
        X: Design matrix; any object whose ``.dot(theta)`` yields the
            predictions.
        y: Target values. May be a flat vector or a column vector; if its
            shape differs from the predictions' shape but it holds the same
            number of elements, it is reshaped to match.
        theta: Model coefficients.

    Returns:
        The mean of the absolute residuals.
    """
    predictions = np.asarray(X.dot(theta))
    targets = np.asarray(y)
    # A (n, 1) prediction minus a (n,) target would broadcast to an (n, n)
    # matrix and silently produce a wrong loss, so align the shapes first.
    if targets.shape != predictions.shape and targets.size == predictions.size:
        targets = targets.reshape(predictions.shape)
    return np.mean(np.abs(predictions - targets))

# Huber loss
def huber_loss(X, y, theta, delta=1.0):
    """Return the mean Huber loss of the linear model ``X.dot(theta)``.

    Each absolute residual ``e`` contributes ``0.5 * e**2`` while
    ``e <= delta`` and ``0.5 * delta**2 + delta * (e - delta)`` beyond that,
    i.e. the loss is quadratic near zero and linear in the tails.

    Args:
        X: Design matrix; any object whose ``.dot(theta)`` yields the
            predictions.
        y: Target values. May be a flat vector or a column vector; if its
            shape differs from the predictions' shape but it holds the same
            number of elements, it is reshaped to match.
        theta: Model coefficients.
        delta: Threshold at which the loss switches from quadratic to linear.

    Returns:
        The mean of the per-sample Huber losses.
    """
    predictions = np.asarray(X.dot(theta))
    targets = np.asarray(y)
    # A (n, 1) prediction minus a (n,) target would broadcast to an (n, n)
    # matrix and silently produce a wrong loss, so align the shapes first.
    if targets.shape != predictions.shape and targets.size == predictions.size:
        targets = targets.reshape(predictions.shape)
    abs_error = np.abs(predictions - targets)  # per-sample absolute residual
    # Split each residual into the part below delta (penalised quadratically)
    # and the excess above delta (penalised linearly).
    quadratic_part = np.minimum(abs_error, delta)
    linear_part = abs_error - quadratic_part
    return np.mean(0.5 * np.square(quadratic_part) + delta * linear_part)




# Closed-form solver for linear regression
class Linear_close_form_solver:
    """Fit a linear model in closed form and report its losses.

    The coefficients are the least-squares solution of ``x.dot(theta) = y``
    on the training data; ``loss_func`` is only used for reporting.
    """

    def __init__(self, x_train, x_valid, y_train, y_valid, loss_func):
        """Store the data splits and the loss function used for reporting.

        Args:
            x_train: Training design matrix.
            x_valid: Validation design matrix.
            y_train: Training targets.
            y_valid: Validation targets.
            loss_func: Callable ``loss_func(X, y, theta)`` returning a loss.
        """
        self.x_train = x_train
        self.x_valid = x_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.loss_func = loss_func
        self.theta = 0  # placeholder until solve() is called

    # Closed-form solution
    def normal_equation(self, x, y):
        """Return the least-squares coefficients for ``x.dot(theta) = y``.

        Uses the Moore-Penrose pseudo-inverse instead of explicitly inverting
        ``x.T.dot(x)``. For a full-column-rank ``x`` this is the same
        normal-equation solution; for collinear columns, where the explicit
        inverse fails, it yields the minimum-norm least-squares solution.
        """
        return np.linalg.pinv(x).dot(y)

    # Compute the closed-form solution on the training data
    def solve(self):
        """Fit on the training split, store the result in ``self.theta`` and return it."""
        self.theta = self.normal_equation(self.x_train, self.y_train)
        return self.theta

    def print_loss(self):
        """Print the training loss, then the validation loss, for ``self.theta``."""
        train_loss = self.loss_func(self.x_train, self.y_train, self.theta)
        print("train_loss: {}".format(train_loss))
        valid_loss = self.loss_func(self.x_valid, self.y_valid, self.theta)
        print("valid_loss: {}".format(valid_loss))
        
# Fit the closed-form solution and report it under each loss function.
# Mean squared error loss
mean_squared = Linear_close_form_solver(x_train, x_valid, y_train, y_valid, mean_squared_error)
mean_squared.solve()
mean_squared.print_loss()
# Mean absolute error
mean_absolute = Linear_close_form_solver(x_train, x_valid, y_train, y_valid, mean_absolute_error)
mean_absolute.solve()
mean_absolute.print_loss()
# Huber loss
# Bound to `huber_solver` rather than `huber_loss`, so the loss function of
# that name is not overwritten by a solver instance (which would break any
# later call to huber_loss(...) and any re-run of this cell).
huber_solver = Linear_close_form_solver(x_train, x_valid, y_train, y_valid, huber_loss)
huber_solver.solve()
huber_solver.print_loss()


train_loss: 18.916117446990334
valid_loss: 32.338848364049994
train_loss: 3.130749001198478
valid_loss: 3.5272082346463276
train_loss: 2.675241080953923
valid_loss: 3.0802504005118947
