In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

import random
import math

In [2]:
df = pd.read_csv('housing.csv', header=None, sep='\s+')  # sep='\s+' or  sep='\t' or
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
]


df.columns = column_names
df.head()

  df = pd.read_csv('housing.csv', header=None, sep='\s+')  # sep='\s+' or  sep='\t' or


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
X = df.drop(df.columns[-1], axis=1).to_numpy()
y = df['MEDV'].to_numpy()

In [4]:
test_size = 0.25

row_count = X.shape[0] 
row_count_in_samples = int(row_count * test_size)

indixes = np.random.permutation(row_count)

test_indices = indixes[:row_count_in_samples]
train_indices = indixes[row_count_in_samples:]


X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

Linear Regression 

In [None]:
def linear_regression(
        X: np.ndarray,
        y: np.ndarray,
        learning_rate: float = 0.01,
        iterations: int = 5000,
        coef_umensheniya_learning_rate: float = 0.001,
        epsilon: float = 1e-4
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Линейная регрессия.

    :param X: Исходная матрица.
    :param y: Целевые значения.
    :param learning_rate: Начальная скорость.
    :param iterations: максимальное количество итераций.
    :param coef_umensheniya_learning_rate: Коэффициент уменьшения скорости сходимости.
    :param epsilon: показатель качества модели для досрочного выхода из обучения.
    :return: Кортеж: 
        - weights: Веса.
        - X_mean: Средние значения.
        - X_std: Стандартные отклонения.
    """
    # нормализация признаков
    X_mean = np.mean(X, axis=0 )
    X_std = np.std(X, axis=0)
    # Избегаем деления на ноль
    X_std[X_std == 0] = 1
    X_normalized = (X - X_mean) / X_std
    
    # Добавление столбца единиц для свободного члена
    row, col = X_normalized.shape
    X_b = np.column_stack([np.ones(row), X_normalized])

    weights = np.random.randn(col + 1)

    for i in range(iterations):
        y_pred = X_b @ weights
        error = np.abs(y_pred - y)

        gradient = (2 / row) * X_b.T @ error
        current_lr = learning_rate * (1 / (1 + coef_umensheniya_learning_rate * i))
        old_weights = weights.copy()
        weights = weights - current_lr * gradient

        # Проверяем изменение весов
        weight_change = np.linalg.norm(weights - old_weights)
        
        if weight_change < epsilon:
            print(f"Сходимость достигнута на итерации {i}, изменение весов: {weight_change:.2e}")
            break

        if i == iterations - 1:
            print(f"Заданная точность НЕ достигнута за {iterations} шагов, последнее изменение весов: {weight_change:.2e}")
    
    return weights, X_mean, X_std

def predict(X, weights, X_mean, X_std):
    """
    Предсказание.
    
    :param X: Исходная матрица.
    :param weights: Веса.
    :param X_mean:Средние значения.
    :param X_std: Стандартные отклонения.
    :return: Предсказание.
    """
    X_normalized = (X - X_mean) / X_std
    row = X_normalized.shape[0]
    X_b = np.column_stack([np.ones(row), X_normalized])
    return X_b @ weights

In [6]:
weights, X_mean, X_std = linear_regression(X_train, y_train)


y_predict = predict(X_test, weights, X_mean, X_std)

Заданная точность НЕ достигнута за 5000 шагов, последнее изменение весов: 1.29e-04


sklearn

In [7]:
regression = LinearRegression()
regression.fit(X_train, y_train)
y_predict_sk = regression.predict(X_test)

Результаты

In [8]:
for i in range(len(y_predict)):
    print(f'{y_predict[i]}\t{y_predict_sk[i]}\t{y_test[i]}')

27.21655503776577	27.35480121638878	27.1
28.768746281642137	28.931877985406356	24.4
29.47562406175304	29.73086880692628	25.0
20.83105366820226	20.769938759156673	21.0
34.70504504273799	34.73039522213752	26.7
10.17734857994884	10.198041887963566	8.3
3.9003385320343753	3.941482337747253	8.8
27.36234155489336	27.54041467847354	22.3
26.775237689000505	26.774264393658434	23.8
14.90813480170166	14.91426446831423	9.6
27.691102079233822	27.777381410827406	24.5
22.910738629825534	22.910113328674512	17.8
17.368765404358488	17.22593574324608	18.1
20.719794050945797	20.732618690767453	21.7
17.64503165622188	17.683562707392564	14.9
28.001521269552246	27.974540330243453	23.9
25.684299054868916	25.80612276617125	23.9
22.54296652243505	22.41097867959125	20.3
18.046939777404447	18.07779130985162	19.3
21.50393429179416	21.454874156856743	17.8
19.17347466111868	19.229520638154305	20.0
20.919491561008485	20.69019374893463	18.7
27.170348889183206	27.210044150798762	23.9
23.30775030433106	23.348227809783637