In [879]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import random
#from sklearn import linear_model

data = pd.read_excel('https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx')
data_zscore = data.copy()
data = data.dropna()#Drop any line that includes NULL
data = data.drop_duplicates()#Drop any duplicate data
cols = data.columns
#USe z_score to standardize and remove the outliers
for col in cols:
    data_col = data[col]
    z_score = (data_col - data_col.mean()) / data_col.std()#Standardize the data
    data_zscore[col] = z_score.abs()>2
data_drop = data
for col in cols:
    data_drop = data_drop[data_zscore[col] == False]
df = data_drop.reset_index(drop=True)#df is the dataset we are going to analyse
m = 0
print(df.describe())

for col in df.columns:#Normalization of the data
  df[col]=(df[col].subtract(df[col].mean())).div(df[col].std()).round(3)

print(df.describe())
print(df)

               X1         X2          X3          X4          X5          X6  \
count  613.000000  613.00000  613.000000  613.000000  613.000000  613.000000   
mean     0.740930  690.71615  309.747145  190.484502    4.807504    3.499184   
std      0.092623   81.93815   34.856517   39.476913    1.694515    1.118216   
min      0.620000  563.50000  245.000000  122.500000    3.500000    2.000000   
25%      0.660000  612.50000  294.000000  147.000000    3.500000    3.000000   
50%      0.710000  710.50000  318.500000  220.500000    3.500000    3.000000   
75%      0.820000  759.50000  343.000000  220.500000    7.000000    4.000000   
max      0.900000  808.50000  367.500000  220.500000    7.000000    5.000000   

               X7          X8          Y1          Y2  
count  613.000000  613.000000  613.000000  613.000000  
mean     0.229527    2.810767   19.542049   22.026623  
std      0.133214    1.558263    8.900325    8.437464  
min      0.000000    0.000000    6.010000   10.900000  

  data_drop = data_drop[data_zscore[col] == False]


In [880]:
class TrainingSetGenerator:
    def __init__(self):
        pass

    def generate(self, data, percent):#percent indicates what percent of data is for training the model
        data = data.sample(frac=1.0,random_state = random.randint(1,1000)) #randomize the data
        data = data.reset_index(drop=True)  #reset the index
        #print(data)

        #divide the data into train sets and test sets
        select_columns_x = data.columns[0:8]
        select_columns_y = data.columns[8:10]
        data_x = data[select_columns_x]
        data_y = data[select_columns_y]
        count = int(data.shape[0]*percent)
        self.data_train_x = data_x[0:count+1].values
        self.data_train_y = data_y[0:count+1].values
        self.data_test_x = data_x[count+1:data.shape[0]].values
        self.data_test_y = data_y[count+1:data.shape[0]].values


In [881]:
class LinearRegression:
    def __init__(self, bias=True):
        self.bias = bias
        self.mse1 = 0
        self.mse2 = 0
        pass
    def fit(self, x, y):
        N = x.shape[0]
        if(self.bias):
            x = np.column_stack([np.ones(N),x])#add one in the front to create the X matrix
        self.w = np.linalg.inv(x.T @ x)@x.T@y#Calculating the least square

    def predict(self,x):
        N = x.shape[0]
        if(self.bias):
            x = np.column_stack([np.ones(N),x])
        yh = x@self.w
        return yh

    def predict_first(self, X):
        N = X.shape[0]
        D = X.shape[1]
        data = np.c_[np.ones(N), X]
        return data @ self.w[:,0].reshape(data.shape[1],1)

    def predict_second(self, X):
        N = X.shape[0]
        D = X.shape[1]
        data = np.c_[np.ones(N), X]
        return data @ self.w[:,1].reshape(data.shape[1],1)
        

    def meanSquare(self, X, Y):
        Y1 = Y[:,0]
        Y2 = Y[:,1]
        Y1 = Y1.reshape(X.shape[0],1)
        Y2 = Y2.reshape(X.shape[0],1)
        error1 = Y1-self.predict_first(X)
        error2 = Y2-self.predict_second(X)
        print("The error for Y1 is: " + str((error1*error1).mean()))
        print("The error for Y2 is: " + str((error2*error2).mean()))
        self.mse1 += (error1*error1).mean()
        self.mse2 += (error2*error2).mean()

    def reset(self):
        self.mse1 = 0
        self.mse2 = 0


generator = TrainingSetGenerator()
generator.generate(df,0.5)
model = LinearRegression()

In [882]:
class LinearRegression_Optimizer:
    def __init__(self, max_iters, rate=0.01, e=1e-8, batch_size=0, alg=0, m=0):
        self.grad = None    #Gradient
        self.w = None   #weight
        self.max_iters = max_iters  #max iterating times
        self.r = rate   #learning rate
        self.rate = self.r  #Learning rate
        self.e = e  #epsilon
        self.t = 0  #times counter
        self.list = []  #randomize for SGD
        self.batch_size = batch_size    #batch size
        self.sigma = None   #sigma for adagrad
        self.mse = np.inf   #mean square error
        self.ep = 1e-10 #'bias'
        self.momentum = m   #Momentum
        self.lastV = 0  #last rate*gradient
        self.firstWeight = None
        self.secondWeight = None
        if alg == 0:    #choose algorithm: simple(change batch_size to use SGD), adaptive, adaptivegradient
            self.fitw = self.simple
        elif alg == 1:
            self.fitw = self.adaptive
        elif alg == 2:
            self.fitw = self.adagrad
        else:
            self.fitw = self.simple

    def fit_twice(self, X, Y):  #fit the data with Y1 and Y2
        Y1 = Y[:,0]
        Y2 = Y[:,1]
        self.fit(X, Y1)
        self.firstWeight = self.w
        self.fit(X, Y2)
        self.secondWeight = self.w
        w = np.append(self.firstWeight, self.secondWeight, axis=1)
        print(w)
        return w

    def fit(self, X, Y):    #fit the data
        #data reset
        self.t = 0
        N = X.shape[0]
        D = X.shape[1]+1
        self.list = [i for i in range(len(X))]
        Y = Y.reshape(N, 1)
        data = np.c_[np.ones(len(X)), X] 
        self.rate = np.c_[np.ones(D)] * self.r
        self.mse = np.inf
        self.w = np.c_[np.array([0,0,0,0,0,0,0,0,0])]
        self.sigma = np.c_[np.zeros(D)]
        self.grad = 1
        #if meat the max iterating times or norm > epsilon, stop
        while self.t < self.max_iters and np.linalg.norm(self.grad) > self.e:
            #choose the batch and sample the data(SGD with 1)
            batch_X, batch_Y = self.getData(data, Y)
            self.grad = self.gradient(batch_X, batch_Y)
            #fit the data
            self.fitw(self.grad)
            self.t += 1
            self.mse = self.meanSquareError(X, Y)
            '''if(self.mse<0.2):
                break'''
        print("The Training is finished. MSE=" + str(self.mse) + " trained " + str(self.t) + " times")

    def update(self, v):    #update weight
        self.w = self.w - v

    def simple(self, g):    #simple method
        v = self.rate * g - self.momentum * self.lastV
        self.update(v)
        self.lastV = v

    def adaptive(self, g):  #adaptive method
        v = self.rate / pow(self.t + 1, 0.5) * g - self.momentum * self.lastV
        self.update(v)
        self.lastV = v

    def adagrad(self, g):   #adaptive gradient
        self.sigma = self.sigma + np.square(g)
        v = self.rate / (pow(self.sigma, 0.5) + self.ep) * g - self.momentum * self.lastV
        self.update(v)
        self.lastV = v

    def getData(self, X, Y):    #random and sample data
        if self.batch_size > 0:
            random.shuffle(self.list)
            batch_X = np.array([X[i] for i in range(self.batch_size)])
            batch_Y = np.array([Y[i] for i in range(self.batch_size)])
            return batch_X, batch_Y
        else:
            return X, Y

    def gradient(self, X, Y):   #gradient formula
        return (1 / len(X)) * (X.T @ (X @ self.w - Y))

    #meanSquareError
    def meanSquareError(self, X, Y):
        N = X.shape[0]
        D = X.shape[1]
        Y = Y.reshape(N, 1)
        error = Y - self.predict(X)
        return (error * error).mean()
    #predict the data according to current weight
    def predict(self, X):
        N = X.shape[0]
        D = X.shape[1]
        data = np.c_[np.ones(N), X]
        return data @ self.w

    def predict_first(self, X):
        N = X.shape[0]
        D = X.shape[1]
        data = np.c_[np.ones(N), X]
        return data @ self.firstWeight

    def predict_second(self, X):
        N = X.shape[0]
        D = X.shape[1]
        data = np.c_[np.ones(N), X]
        return data @ self.secondWeight


    def meanSquare(self, X, Y):
        Y1 = Y[:,0]
        Y2 = Y[:,1]
        Y1 = Y1.reshape(X.shape[0],1)
        Y2 = Y2.reshape(X.shape[0],1)
        error1 = Y1-self.predict_first(X)
        error2 = Y2-self.predict_second(X)
        print("The error for Y1 is: " + str((error1*error1).mean()))
        print("The error for Y2 is: " + str((error2*error2).mean()))

#def __init__(self, max_iters, rate=0.01, e=1e-8, batch_size=0, alg=0, m=0)



#Task3
generator1 = TrainingSetGenerator()
generator1.generate(df,0.8)
model1 = LinearRegression_Optimizer(100000,0.6,1e-10, 0, 2, 0.5)
print("The weight and intercept that we have: (The first line is the intercept)")
model1.fit_twice(generator1.data_train_x, generator1.data_train_y)
print("\n")
print("The MSE for the train set: ")
model1.meanSquare(generator1.data_train_x, generator1.data_train_y)
print("\n")
print("The MSE for the test set: ")
model1.meanSquare(generator1.data_test_x, generator1.data_test_y)

model2 = LinearRegression()
model2.fit(generator1.data_train_x, generator1.data_train_y)
print("The weight and intercept that we have: (The first line is the intercept)")
print(model2.w)
print("\n")
print("The MSE for the train set: ")
model2.meanSquare(generator1.data_train_x, generator1.data_train_y)
print("\n")
print("The MSE for the test set: ")
model2.meanSquare(generator1.data_test_x, generator1.data_test_y)

'''for i in  range(100):
    generator1.generate(df,0.8)
    model2.fit(generator1.data_train_x, generator1.data_train_y)
    model2.meanSquare(generator1.data_test_x, generator1.data_test_y)
print(model2.mse1/100)
print(model2.mse2/100)'''




    
    




The weight and intercept that we have: (The first line is the intercept)
The Training is finished. MSE=0.08378697821485745 trained 100000 times
The Training is finished. MSE=0.10430964511803553 trained 99892 times
[[-0.00937362 -0.00587656]
 [-1.01601044 -1.13655294]
 [-0.84569454 -1.00808021]
 [ 0.12115163  0.10308778]
 [-0.85804329 -0.959455  ]
 [ 0.21070412  0.09237515]
 [-0.00269948  0.01381247]
 [ 0.23865714  0.16907907]
 [ 0.03280542  0.01133678]]


The MSE for the train set: 
The error for Y1 is: 0.08378697821485745
The error for Y2 is: 0.10430964511803553


The MSE for the test set: 
The error for Y1 is: 0.12564049768127877
The error for Y2 is: 0.13357783176937665
The weight and intercept that we have: (The first line is the intercept)
[[-0.00928493 -0.00583853]
 [-1.73895181 -1.89097109]
 [-2.13684654 -2.33889435]
 [ 0.45319251  0.44318528]
 [-0.26492249 -0.35532066]
 [ 0.21138859  0.09306888]
 [-0.00269948  0.01381247]
 [ 0.23865714  0.16907907]
 [ 0.03280542  0.01133678]]




'for i in  range(100):\n    generator1.generate(df,0.8)\n    model2.fit(generator1.data_train_x, generator1.data_train_y)\n    model2.meanSquare(generator1.data_test_x, generator1.data_test_y)\nprint(model2.mse1/100)\nprint(model2.mse2/100)'