In [1]:
import time

import numpy as np
import pandas as pd
import sklearn.linear_model
import matplotlib.pyplot as plt

import sklearn.linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [10]:
class LinearRegression():
    """
    LSS-based linear regression model.
    """
    def __init__(self, method='analytical', 
                       learning_rate=0.01, 
                       max_epochs=100, 
                       early_stopping_threshold=0.0003, 
                       iters_to_stop=5,
                       regularization='none',
                       C=1.):
        """
        Initalizes new LinearResression model instance.
        -----------
        Parameters:
        -----------
            method: {'analytical', 'gd'}, default='analytical'
                Algorithm that is selected to calculate linear model
                weights and bias.
                'analytical' - explicit formula (A^T*A)^-1*A^T*y
                'gd' - gradient descent
            learning_rate: float, default=0.01
                If method='gd', then defines learning rate
                for gradient descent, else ignored
            max_epochs: int, default=100
                For method='gd' defines upper boundary for
                how many iterations will be done.
            early_stopping_threshold: float, default=0.0003
                For method='gd' defines what loss function value
                difference between epoch and epoch+1 is considered as
                idle iteration and convergence (criteria to stop).
            iters_to_stop: int, default=5
                For method='gd' defines how many idle iterations in a row
                is required to stop further calculations.
            regularization: {'none', 'l2', 'l1'}, default='none'
                Type of regularization used in linear model.
                'l1' works only for method='gd'
            C: float, default=1.0
                Regularization parameter. Ignored, if
                regularization=none
        """
        self.method = method
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.max_epochs = max_epochs
        self.early_stopping_threshold = early_stopping_threshold
        self.iters_to_stop = iters_to_stop
        self.c = C
        pass

    def fit(self, X, y):
        """
        Fits X to y: calculates weights to minimize
        MSE related to y.
        Returns self
        """

        if self.regularization == 'l1' and self.method != 'gd':
            print('Warning! Regularization parameter is set to \'l1\' whilst method is not \'gd\'! Using method=\'gd\'...')
            self.method = 'gd'
        
        if self.method == 'gd':
            self.coef_ = np.zeros(X.shape[1])
            self.bias = 0
            a_ddx = np.zeros(X.shape[1])
            b_ddx = 0

            self.prev_error = 0
            self.useless_iterations = 0

            for ep in range(self.max_epochs):
                y_pred = X @ self.coef_ + self.bias
                error = y - y_pred.ravel()
                # early stopping
                if np.abs(error.mean()-self.prev_error) < self.early_stopping_threshold:
                    if self.useless_iterations >= self.iters_to_stop:
                        return self
                    self.useless_iterations += 1
                else:
                    self.useless_iterations = 0
                self.prev_error_mean = error.mean()
                
                for i in range(X.shape[1]):
                    if self.regularization == 'l2':
                        a_ddx[i] = -2 * (X[:,i] * error).mean() + 2*self.c*self.coef_[i]/len(X[:,i])
                    if self.regularization == 'l1':
                        a_ddx[i] = -2 * (X[:,i] * error).mean() + np.sign(self.coef_[i])*self.c/len(X[:,i])
                    if self.regularization == 'none':
                        a_ddx[i] = -2 * (X[:,i] * error).mean()
                b_ddx = -2 * error.mean()
                     
                for i in range(X.shape[1]):
                    self.coef_[i] -= self.learning_rate * a_ddx[i]
                self.bias -= self.learning_rate * b_ddx
            return self
        if self.method == 'analytical':
            ones = np.ones(shape=(X.shape[0], 1))
            A = np.concatenate([ones, X], axis=1)
            if self.regularization == 'none':
                self.weights = np.linalg.inv(A.T @ A) @ A.T @ y
                self.coef_ = self.weights[1:]
                self.bias = self.weights[0]
            if self.regularization == 'l2':
                self.weights = np.linalg.inv(A.T @ A + self.c*np.eye(A.shape[1])) @ A.T @ y
                self.coef_ = self.weights[1:]
                self.bias = self.weights[0]            
        return self

    def predict(self, X):
        """
        Predicts and returns outputs for X using formula:
            ---------------
            y_pred = Xw + b
            ---------------
            X - input data, 2-dimensional array 
            (samples x features).
            w, b - vector of weights and bias scalar
            calculated at fit() step.
        """
        return X @ self.coef_ + self.bias

In [2]:
data = pd.read_csv('iris.data')
ohe = preprocessing.OneHotEncoder(sparse=False)
ohe_iris_type = pd.get_dummies(data[['Species']])
#df = data.iloc[:,:-1]
df = pd.concat([data.iloc[:,:-1], ohe_iris_type.iloc[:,:-1]], axis=1)

In [3]:
ohe_iris_type.sum(axis=1).unique()

array([1], dtype=int64)

In [4]:
df
X, y = df.loc[:,df.columns!='Sepal-width'], df['Sepal-width']

In [5]:
df

Unnamed: 0,Sepal-length,Sepal-width,Petal-length,Petal-width,Species_Iris-setosa,Species_Iris-versicolor
0,5.1,3.5,1.4,0.2,1,0
1,4.9,3.0,1.4,0.2,1,0
2,4.7,3.2,1.3,0.2,1,0
3,4.6,3.1,1.5,0.2,1,0
4,5.0,3.6,1.4,0.2,1,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0
146,6.3,2.5,5.0,1.9,0,0
147,6.5,3.0,5.2,2.0,0,0
148,6.2,3.4,5.4,2.3,0,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear = LinearRegression(regularization='l1')
linear.fit(X_train.values, y_train.values)
y_pred = linear.predict(X_test.values)
mean_squared_error(y_test, y_pred)



0.08259287873485013

In [29]:
X_train.shape

(120, 5)

In [17]:
nfolds_list = [5]
regularizations = ['none', 'l2', 'l1']
creators = ['me', 'sklearn']
methods = ['analytical', 'gd']

stats = pd.DataFrame(columns=[
    'creator', 'method', 'nfolds', 'regularization', 'execution_time', 'weights', 'error'
    ])

skip_iteration = False

for nfolds in nfolds_list:
    for method in methods:
        for creator in creators:
            for regularization in regularizations:
                exec_time = 0
                error = 0
                weights = np.zeros(X.shape[1])   
                for i in range(nfolds):
                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                    start_time = time.time()
                    if creator == 'me':
                        if regularization == 'l1' and method == 'analytical':
                            skip_iteration = True
                            break
                        linear = LinearRegression(regularization=regularization, method=method)
                    if creator == 'sklearn':
                        if method == 'analytical':
                            if regularization == 'l2':
                                linear = sklearn.linear_model.Ridge()
                            if regularization == 'l1':
                                skip_iteration = True
                                break
                            if regularization == 'none':
                                linear = sklearn.linear_model.LinearRegression()
                        if method == 'gd':
                            if regularization == 'none':
                                skip_iteration = True
                                break
                            linear = sklearn.linear_model.SGDRegressor(penalty=regularization)
                    linear.fit(X_train.values, y_train.values)
                    y_pred = linear.predict(X_test.values)
                    error += mean_squared_error(y_test, y_pred)
                    weights += linear.coef_
                        
                    exec_time += time.time() - start_time

                if skip_iteration:
                    skip_iteration = False
                    continue

                exec_time /= nfolds
                error /= nfolds
                weights /= nfolds

                new_model_stats_row = pd.Series({
                                    'nfolds': nfolds, 
                                    'weights': weights, 
                                    'execution_time': exec_time,
                                    'error': error,
                                    'creator': creator,
                                    'regularization': regularization,
                                    'method': method
                                    })
                stats = stats.append(new_model_stats_row, ignore_index=True)

TypeError: __init__() got an unexpected keyword argument 'regularization'

In [34]:
stats[stats['creator'] == 'sklearn']

Unnamed: 0,creator,method,nfolds,regularization,execution_time,weights,error
2,sklearn,analytical,5,none,0.003713,"[0.3694393061278974, -0.1620296732018009, 0.70...",0.087943
3,sklearn,analytical,5,l2,0.003828,"[0.4331726360595021, -0.2821255289525397, 0.44...",0.077549
7,sklearn,gd,5,l2,0.002121,"[0.6492918149973905, -0.2406796272482327, -0.0...",0.088646
8,sklearn,gd,5,l1,0.001396,"[0.645751428202414, -0.24527249330012763, -0.0...",0.083652


In [None]:
# по фолдам: считаем время, веса, ошибки, параметр алгоритма задачи
# sql
# numpy
# + регуляризация
# 

In [None]:
# ml
# создать репозиторий, оформить в виде класса регрессию
# сделать норм таблицы шоб понятно было + анализ (быстрее, наверное потому что ...)
# --------------------------------
# python лутс глава 1 читать
# --------------------------------
# numpy матрицу доделываю
# ----------------------------