In [18]:
import numpy as np
import pandas as pd

class LinearRegression:
    """Linear regression fitted with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``; ``fit`` minimises the
    least-squares loss by repeatedly stepping against its gradient.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    n_iterations : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self,learning_rate=0.01,n_iterations=1000):
        # Parameters stay None until fit() runs, so predict() can detect an
        # unfitted model.
        self.weights = None
        self.bias = None
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    @staticmethod
    def _as_vector(values, name):
        """Return ``values`` as a non-empty 1-D float array.

        Flattening matters: an ``(n, 1)`` column would otherwise broadcast
        against an ``(n,)`` vector into an ``(n, n)`` matrix.
        """
        vector = np.asarray(values, dtype=float).ravel()
        if vector.size == 0:
            raise ValueError("{} must contain at least one value".format(name))
        return vector

    def fit(self,X,y):
        """Train the model on ``X`` and ``y`` with batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets; a column vector is flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``y`` is empty, or if ``X`` and ``y``
            do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features), got {} dimension(s)".format(X.ndim)
            )
        y = self._as_vector(y, "y")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                "X has {} sample(s) but y has {} value(s)".format(n_samples, y.shape[0])
            )

        # Start from the all-zero model.
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            # Current predictions and their error against the targets.
            y_predicted = np.dot(X, self.weights) + self.bias
            error = y_predicted - y
            # Gradients of the least-squares loss w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)
            # Step against the gradient.
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self,X):
        """Return the model's predictions ``X @ weights + bias``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("LinearRegression.predict() was called before fit()")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

    def r2_score(self,y_true,y_pred):
        """Return the coefficient of determination R^2.

        When ``y_true`` is constant the usual ratio is undefined (division
        by zero); in that case 1.0 is returned for a perfect prediction and
        0.0 otherwise, instead of NaN / -inf.

        Raises
        ------
        ValueError
            If either input is empty.
        """
        y_true = self._as_vector(y_true, "y_true")
        y_pred = self._as_vector(y_pred, "y_pred")
        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        if ss_total == 0:
            return 1.0 if ss_residual == 0 else 0.0
        return 1 - (ss_residual / ss_total)

    def mean_square_error(self,y_true,y_pred):
        """Return the mean squared error between ``y_true`` and ``y_pred``.

        Raises
        ------
        ValueError
            If either input is empty.
        """
        y_true = self._as_vector(y_true, "y_true")
        y_pred = self._as_vector(y_pred, "y_pred")
        return np.mean((y_true - y_pred) ** 2)
    
# Toy example: five points lying exactly on the line y = 3x
X_train = np.arange(1, 6).reshape(-1, 1)
y_train = 3 * X_train.ravel()

# Build the model with its default hyper-parameters and fit it
model = LinearRegression()
model.fit(X_train, y_train)

# Predict two unseen inputs (the exact line would give 18 and 21)
X_test = np.array([[6], [7]])
prediction = model.predict(X_test)
print("Prediction: ", prediction)

Prediction:  [17.90592373 20.86655605]


# Sample dataset

In [19]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [20]:
# Load the diabetes dataset bundled with scikit-learn (features in .data, target in .target)
diabetes = load_diabetes()

In [21]:
# Feature matrix as a DataFrame whose columns carry the dataset's feature names
X = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
)
X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [22]:
# Regression target as a named Series, aligned row-for-row with X
y = pd.Series(
    data=diabetes.target,
    name='Disease_Progression',
)
y

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: Disease_Progression, Length: 442, dtype: float64

In [23]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split reproducible
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [24]:
# Fit the scaler on the training split only, so no test-set statistics leak
# into training, then apply that same transformation to both splits.
# NOTE: X_train / X_test are rebound here from DataFrames to the scaler's output.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Train the gradient-descent model (default learning rate and iteration count)
# on the scaled training split.
model = LinearRegression()
model.fit(X_train,y_train)

## 1. Using the closed-form solution (normal equation)

In [26]:
# Closed-form least squares WITHOUT an intercept column: solve the normal
# equations (X^T X) beta = X^T y as a linear system rather than forming the
# explicit inverse, which is slower and numerically less accurate.
coefficients = np.linalg.solve(
    np.dot(X_train.T, X_train),
    np.dot(X_train.T, y_train),
)

coefficients

array([  1.75375799, -11.51180908,  25.60712144,  16.82887167,
       -44.44885564,  24.64095356,   7.67697768,  13.1387839 ,
        35.16119521,   2.35136365])

In [27]:
# 1. Prepend a column of ones to X to build X*, so the first coefficient acts as the intercept
X_star = np.hstack([np.ones((X_train.shape[0], 1)), X_train])

# 2. Least-squares coefficients: solve (X*^T X*) w = X*^T y as a linear system
#    instead of multiplying by an explicit inverse (more stable and cheaper)
w = np.linalg.solve(
    np.dot(X_star.T, X_star),
    np.dot(X_star.T, y_train),
)
# w[0] is the intercept; w[1:] are the per-feature coefficients
w[0]

153.73654390934846

In [28]:
# Predict on the test split using ONE consistent solution: w[0] is the
# intercept and w[1:] are the feature coefficients of the same least-squares
# fit. (Pairing w[0] with the coefficients of the separate no-intercept fit
# only gives the same numbers when every training feature is centred.)
a = np.dot(X_test, w[1:]) + w[0]
a

array([139.5475584 , 179.51720835, 134.03875572, 291.41702925,
       123.78965872,  92.1723465 , 258.23238899, 181.33732057,
        90.22411311, 108.63375858,  94.13865744, 168.43486358,
        53.5047888 , 206.63081659, 100.12925869, 130.66657085,
       219.53071499, 250.7803234 , 196.3688346 , 218.57511815,
       207.35050182,  88.48340941,  70.43285917, 188.95914235,
       154.8868162 , 159.36170122, 188.31263363, 180.39094033,
        47.99046561, 108.97453871, 174.77897633,  86.36406656,
       132.95761215, 184.53819483, 173.83220911, 190.35858492,
       124.4156176 , 119.65110656, 147.95168682,  59.05405241,
        71.62331856, 107.68284704, 165.45365458, 155.00975931,
       171.04799096,  61.45761356,  71.66672581, 114.96732206,
        51.57975523, 167.57599528, 152.52291955,  62.95568515,
       103.49741722, 109.20751489, 175.64118426, 154.60296242,
        94.41704366, 210.74209145, 120.2566205 ,  77.61585399,
       187.93203995, 206.49337474, 140.63167076, 105.59

In [29]:
# Mean squared error of the closed-form predictions on the held-out split
b = np.square(y_test - a).mean()
b

2900.193628493485

## 2. Using the gradient-descent model defined above

In [30]:
import numpy as np
import pandas as pd

# NOTE: this re-defines the LinearRegression class introduced in the first
# cell; from here on this definition shadows the earlier one, so keep the two
# in sync (or drop one of them).
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``; ``fit`` minimises the
    least-squares loss by repeatedly stepping against its gradient.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    n_iterations : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self,learning_rate=0.01,n_iterations=1000):
        # Parameters stay None until fit() runs, so predict() can detect an
        # unfitted model.
        self.weights = None
        self.bias = None
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    @staticmethod
    def _as_vector(values, name):
        """Return ``values`` as a non-empty 1-D float array.

        Flattening matters: an ``(n, 1)`` column would otherwise broadcast
        against an ``(n,)`` vector into an ``(n, n)`` matrix.
        """
        vector = np.asarray(values, dtype=float).ravel()
        if vector.size == 0:
            raise ValueError("{} must contain at least one value".format(name))
        return vector

    def fit(self,X,y):
        """Train the model on ``X`` and ``y`` with batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets; a column vector is flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``y`` is empty, or if ``X`` and ``y``
            do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features), got {} dimension(s)".format(X.ndim)
            )
        y = self._as_vector(y, "y")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                "X has {} sample(s) but y has {} value(s)".format(n_samples, y.shape[0])
            )

        # Start from the all-zero model.
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            # Current predictions and their error against the targets.
            y_predicted = np.dot(X, self.weights) + self.bias
            error = y_predicted - y
            # Gradients of the least-squares loss w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)
            # Step against the gradient.
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self,X):
        """Return the model's predictions ``X @ weights + bias``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("LinearRegression.predict() was called before fit()")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

    def r2_score(self,y_true,y_pred):
        """Return the coefficient of determination R^2.

        When ``y_true`` is constant the usual ratio is undefined (division
        by zero); in that case 1.0 is returned for a perfect prediction and
        0.0 otherwise, instead of NaN / -inf.

        Raises
        ------
        ValueError
            If either input is empty.
        """
        y_true = self._as_vector(y_true, "y_true")
        y_pred = self._as_vector(y_pred, "y_pred")
        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        if ss_total == 0:
            return 1.0 if ss_residual == 0 else 0.0
        return 1 - (ss_residual / ss_total)

    def mean_square_error(self,y_true,y_pred):
        """Return the mean squared error between ``y_true`` and ``y_pred``.

        Raises
        ------
        ValueError
            If either input is empty.
        """
        y_true = self._as_vector(y_true, "y_true")
        y_pred = self._as_vector(y_pred, "y_pred")
        return np.mean((y_true - y_pred) ** 2)

In [31]:
# Re-create the model from the class defined in the previous cell and train it
# on the scaled training split with the default hyper-parameters.
model = LinearRegression()
model.fit(X_train,y_train)

In [32]:
# Predict the target for every row of the scaled test split
y_pred = model.predict(X_test)
y_pred

array([140.59594652, 181.72981353, 140.37433773, 294.53366265,
       120.9699172 ,  93.37387162, 257.40959312, 187.76433744,
        83.26158462, 110.98837896,  95.01532288, 162.58656935,
        64.27209732, 205.3828704 ,  98.51637925, 132.38809734,
       222.06053411, 246.64221053, 196.48170611, 214.54010681,
       207.79703894,  88.63368097,  71.9152959 , 188.47582468,
       156.70222589, 162.32691955, 190.08711361, 176.80186051,
        49.74325303, 110.455786  , 180.90737179,  91.59381123,
       131.34580463, 180.48511897, 173.15055663, 191.58257853,
       122.12085008, 117.53576308, 145.02444721,  60.77748593,
        74.41935015, 107.5999578 , 161.91418   , 148.65930079,
       175.74429756,  65.52339821,  78.78466591, 106.42059057,
        58.62465794, 161.21339277, 157.69980564,  65.6188605 ,
       114.09592458, 107.89017385, 169.4617811 , 160.7117092 ,
        94.16701611, 208.01838185, 118.04506841,  68.02523437,
       184.67675217, 202.8581617 , 141.78846551, 104.83

In [33]:
# Coefficient of determination of the gradient-descent model on the test split
r2 = model.r2_score(y_true=y_test, y_pred=y_pred)
r2

0.45548505593573707

In [34]:
# Mean squared error of the gradient-descent model on the test split
mse = model.mean_square_error(y_true=y_test, y_pred=y_pred)
mse

2884.922802987494