In [54]:
import pandas as pd

# Salary dataset (CSV) hosted on GitHub; read_csv downloads it from this URL,
# so the cell needs network access.
url = "https://raw.githubusercontent.com/frankkn/DeepLearning/master/LinearRegression/Data/Salary_Data2.csv"
data = pd.read_csv(url)
# Show the loaded frame (YearsExperience, EducationLevel, City, Salary).
data

Unnamed: 0,YearsExperience,EducationLevel,City,Salary
0,3.0,大學,城市A,45.9
1,7.8,碩士以上,城市C,80.5
2,2.3,高中以下,城市A,25.2
3,5.1,高中以下,城市A,30.4
4,10.0,碩士以上,城市B,65.7
5,1.2,碩士以上,城市C,60.8
6,8.6,大學,城市C,50.1
7,6.9,碩士以上,城市A,70.3
8,4.2,大學,城市A,40.7
9,2.4,高中以下,城市A,28.1


In [55]:
# Ordinal encoding: education levels have a natural order, so map them to
# 0 (high school or below) < 1 (university) < 2 (master's or above).
education_order = {'高中以下':0, '大學':1, '碩士以上':2}

# Only encode while the column still holds the raw labels. Mapping an
# already-encoded (numeric) column would find no matching key and replace
# every value with NaN, which is what happened when this cell was run twice.
# Labels missing from education_order still become NaN on the first run.
if not pd.api.types.is_numeric_dtype(data['EducationLevel']):
  data['EducationLevel'] = data['EducationLevel'].map(education_order)

In [56]:
# One-hot encoding of the City column: cities have no natural order, so each
# city becomes its own 0/1 indicator column.
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()
# Learn the categories and encode in one call; the encoder's result is
# converted to a dense array with .toarray() so it can be stored in the frame.
city_encoded = onehot_encoder.fit_transform(data[['City']]).toarray()

In [None]:
# Store the one-hot indicator columns in the frame under readable names.
# NOTE(review): the labels are hard-coded, which assumes the encoder emits its
# columns in the order city A, city B, city C — confirm with
# onehot_encoder.categories_.
data[['CityA', 'CityB', 'CityC']] = city_encoded
data

In [None]:
# Remove the raw text column and one indicator column. CityC is redundant
# because it can be deduced from CityA and CityB (a row with CityA == 0 and
# CityB == 0 must be city C).
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the columns to actually disappear from
# `data`. errors='ignore' keeps the cell safe to run more than once.
data = data.drop(columns=['City', 'CityC'], errors='ignore')
data

In [59]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

feature_columns = ['YearsExperience', 'EducationLevel', 'CityA', 'CityB']
x = data[feature_columns]
y = data['Salary']

# 80% of the rows are used for training and 20% for testing; the fixed
# random_state makes the split reproducible. Every piece is converted to a
# plain NumPy array because the rest of the notebook works on arrays.
x_train, x_test, y_train, y_test = (
  part.to_numpy()
  for part in train_test_split(x, y, test_size=0.2, random_state=86)
)

In [60]:
# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# The mean and standard deviation are computed from the training set only ...
x_train = scaler.fit_transform(x_train)
# ... and the same statistics are reused to scale the test set.
x_test = scaler.transform(x_test)

In [61]:
# Linear model: y = w1*x1 + w2*x2 + w3*x3 + ... + b
# City is represented by the two indicator columns CityA and CityB, so:
#   Salary = w1*YearsExperience + w2*EducationLevel + w3*CityA + w4*CityB + b
import numpy as np

# Arbitrary starting parameters, used only to try out the prediction formula
w = np.array([1, 2, 3, 4])
b = 0

# Row-wise weighted sum: each element is w1*x1 + w2*x2 + w3*x3 + w4*x4 + b
y_pred = np.sum(x_train * w, axis=1) + b

In [62]:
# Cost function: mean squared error (MSE)
def compute_cost(x, y, w, b):
  """Return the mean squared error of the linear model on (x, y).

  x is multiplied element-wise by w and summed along axis 1 to get one
  prediction per row, b is added, and the squared differences from y are
  averaged.
  """
  prediction = (x * w).sum(axis=1) + b
  squared_error = np.square(y - prediction)
  return squared_error.mean()

In [63]:
# Predictions with the current parameters
y_pred = (x_train * w).sum(axis=1) + b

# Squared error of one sample: (y - (w1*x1 + w2*x2 + w3*x3 + w4*x4 + b)) ** 2
#   slope with respect to w1 = 2 * x1 * (y_pred - y)
#   slope with respect to b  = 2 * (y_pred - y)
# The gradients computed below are averaged over all samples and leave out
# the constant factor 2.
error = y_pred - y_train
b_gradient = error.mean()

# x_train has one column per weight, so x_train.shape[1] is the number of
# weights. Written out by hand, the first entry would be
# (x_train[:, 0] * error).mean(), the second (x_train[:, 1] * error).mean(),
# and so on for every column.
w_gradient = np.array([(x_train[:, col] * error).mean() for col in range(x_train.shape[1])], dtype=float)

w_gradient


array([-5.62e+00, -1.43e+01,  1.19e+01, -1.09e+00])

In [64]:
def compute_gradient(x, y, w, b):
  """Return (w_gradient, b_gradient) of the linear model on (x, y).

  b_gradient is the mean of (prediction - y). w_gradient holds one entry per
  column of x: the mean of that column multiplied by (prediction - y).
  """
  error = (x * w).sum(axis=1) + b - y
  # One gradient entry per feature column, computed column by column
  per_weight = [(x[:, col] * error).mean() for col in range(x.shape[1])]
  return np.array(per_weight, dtype=float), error.mean()

In [65]:
# Try compute_gradient on the training set with hand-picked parameters;
# the cell displays the returned (w_gradient, b_gradient) pair.
w = np.array([1, 2, 2, 4])
b = 1
compute_gradient(x_train, y_train, w, b)

(array([-5.19e+00, -1.36e+01,  1.09e+01, -7.20e-01]), -49.324999999999996)

In [66]:
# One manual gradient-descent step, to check that the cost goes down
w = np.array([1, 2, 2, 4])
b = 1
learning_rate = 0.001

# Cost before the update
print(compute_cost(x_train, y_train, w, b))

# Move both parameters a small step against their gradients
w_gradient, b_gradient = compute_gradient(x_train, y_train, w, b)
w, b = w - w_gradient*learning_rate, b - b_gradient*learning_rate

# Cost after the update
print(compute_cost(x_train, y_train, w, b))

2678.49833787131
2672.972557100487


In [67]:
# Print float arrays in compact scientific notation (affects the progress log)
np.set_printoptions(formatter={'float':'{: .2e}'.format})
def gradient_descent(x, y, w_init, b_init, learning_rate, cost_function, gradient_function, run_iter, print_iter=1000):
  """Run `run_iter` gradient-descent updates starting from (w_init, b_init).

  Each iteration calls gradient_function(x, y, w, b) for the gradients, moves
  w and b against them scaled by learning_rate, and evaluates
  cost_function(x, y, w, b) on the updated parameters. A progress line is
  printed whenever the iteration index is a multiple of print_iter.

  Returns (w, b, w_history, b_history, c_history), where the three histories
  are lists with one entry per iteration (updated w, updated b, cost).
  """
  weights, bias = w_init, b_init
  w_history, b_history, c_history = [], [], []

  for step in range(run_iter):
    grad_w, grad_b = gradient_function(x, y, weights, bias)

    # The update builds new values, so earlier history entries are not altered
    weights = weights - grad_w * learning_rate
    bias = bias - grad_b * learning_rate
    cost = cost_function(x, y, weights, bias)

    w_history += [weights]
    b_history += [bias]
    c_history += [cost]

    if step % print_iter == 0:
      print(f'Iteration {step:5}: Cost {cost:.4e}, w {weights}, b {bias:.2e}, w_gradient {grad_w}, b_gradient {grad_b: .2e}')
  return weights, bias, w_history, b_history, c_history

In [68]:
# Train the model: start from arbitrary initial parameters and run gradient
# descent on the scaled training set.
w_init = np.array([1, 2, 3, 4])
b_init = 1
learning_rate = 1.0e-3
run_iter = 50000

# print_iter is left at its default, so progress is printed every 1000 iterations.
w_final, b_final, w_history, b_history, c_history = gradient_descent(x_train, y_train, w_init, b_init, learning_rate, compute_cost, compute_gradient, run_iter)

Iteration     0: Cost 2.6957e+03, w [ 1.01e+00  2.01e+00  2.99e+00  4.00e+00], b 1.05e+00, w_gradient [-5.62e+00 -1.43e+01  1.19e+01 -1.09e+00], b_gradient -4.93e+01
Iteration  1000: Cost 3.8274e+02, w [ 2.82e+00  9.30e+00 -2.17e+00  2.13e+00], b 3.22e+01, w_gradient [-3.46e-01 -3.52e+00  1.68e+00  2.65e+00], b_gradient -1.81e+01
Iteration  2000: Cost 7.6650e+01, w [ 3.11e+00  1.16e+01 -2.87e+00 -6.45e-02], b 4.37e+01, w_gradient [-3.47e-01 -1.45e+00  1.25e-01  1.70e+00], b_gradient -6.67e+00
Iteration  3000: Cost 3.2101e+01, w [ 3.51e+00  1.26e+01 -2.79e+00 -1.37e+00], b 4.79e+01, w_gradient [-4.26e-01 -7.75e-01 -2.07e-01  9.80e-01], b_gradient -2.45e+00
Iteration  4000: Cost 2.4505e+01, w [ 3.92e+00  1.32e+01 -2.54e+00 -2.13e+00], b 4.94e+01, w_gradient [-3.86e-01 -4.73e-01 -2.65e-01  5.89e-01], b_gradient -9.02e-01
Iteration  5000: Cost 2.2675e+01, w [ 4.27e+00  1.36e+01 -2.28e+00 -2.60e+00], b 5.00e+01, w_gradient [-3.08e-01 -3.16e-01 -2.45e-01  3.71e-01], b_gradient -3.32e-01
Iter

In [69]:
# Learned parameters: one weight per feature column
# (YearsExperience, EducationLevel, CityA, CityB) and the intercept b.
w_final, b_final

(array([ 5.26e+00,  1.46e+01, -1.28e+00, -3.56e+00]), 50.32499999999646)

In [70]:
# Predict salaries for the (already scaled) test set with the learned parameters
y_pred = (x_test * w_final).sum(axis=1) + b_final

# Side-by-side comparison of predicted and actual salaries
pd.DataFrame({'y_pred': y_pred, 'y_test': y_test})

Unnamed: 0,y_pred,y_test
0,29.012894,31.6
1,45.50288,36.7
2,67.887887,72.7
3,65.600001,63.6
4,68.648535,70.3
5,55.477622,48.3
6,44.742232,48.3
7,72.918418,80.5


In [71]:
# Cost (mean squared error) of the trained parameters on the held-out test set.
compute_cost(x_test, y_test, w_final, b_final)

29.465493263772487

In [72]:
# Predict salaries for two new people. Columns follow the training features:
# [YearsExperience, EducationLevel, CityA, CityB]
candidates = [
  [5.3, 2, 1, 0],  # 5.3 years of experience, master's or above, city A
  [7.2, 0, 0, 1],  # 7.2 years of experience, high school or below, city B
]
# New inputs must go through the same scaler that was fitted on the training set
x_real = scaler.transform(np.array(candidates))
y_real = (x_real * w_final).sum(axis=1) + b_final
y_real

array([ 6.56e+01,  2.39e+01])