# Class definition

In [1]:
class LinearRegression:
    """
    OLS Linear Regression.

    LinearRegression fits a linear model with k regressors, with fitted coefficients
    b = (b1, ..., bk) to minimize the residual sum of squares between the observed
    target variable in the dataset, and the target predicted by the linear approximation.

    Needed packages:
    import numpy as np
    from scipy import stats

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set to False,
        no intercept will be used in calculations (e.g. use if data is centered).

    Attributes
    ----------
    coefficients : array of shape (1, k+1), or (1, k) if fit_intercept = False
        Estimated coefficients for the linear regression problem.
        When an intercept is fitted it is the first value in the row.

    intercept : array of shape (1,), or the float 0.0 if fit_intercept = False
        Independent term/constant in the linear model.

    residuals : array of shape (N, 1)
        Estimated residuals, stored as predicted minus observed target
        (y_hat - y). Note the sign: this is the negative of
        ``residualmakermatrix @ y``.

    var_hat : float
        Unbiased estimate of the error variance: the residual sum of squares
        divided by the residual degrees of freedom (N minus the number of
        estimated coefficients). NaN when there are no degrees of freedom left.

    se_coefficients : array of shape (k+1,)
        Estimated standard errors of estimated regression coefficients,
        sqrt(var_hat * diag((X'X)^-1)).

    t_values : array of shape (1, k+1)
        Estimated t-values of estimated regression coefficients, for H0: b=0.

    p_values : array of shape (1, k+1)
        Two-sided p-values of estimated regression coefficients, for H0: b=0.

    residualmakermatrix : array of shape (N, N)
        Matrix M = I - H, which maps y to y - y_hat.

    hatmatrix : array of shape (N, N)
        Matrix H is called the hat matrix since it makes the fitted y out of y.

    n_features : int
        Number of features seen during method `fit`, excluding intercept.

    Methods
    ----------
    fit(X, y):
        Fit linear model, and create attributes

    predict(X):
        Predict target variable, using the fitted parameters of this estimator.

    R_squared(X,y):
        Return the coefficient of determination of the prediction.

    adjusted_R_squared(X,y):
        Return the adjusted coefficient of determination of the prediction.

    summary(decimals):
        Print summary of regression output after fit.


    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
    >>> y = np.array([[0], [4], [0], [6]])
    >>> regfit = LinearRegression(fit_intercept=True).fit(X, y)
    >>> regfit.summary()
    Regression output:
               Coefficient  S.E.       t-value    p-value
    Intercept  -1.5         1.658      -0.905     0.532
    X1         -4.0         1.414      -2.828     0.216
    X2         5.0          1.0        5.0        0.126

    Residuals:
    [-0.5  0.5  0.5 -0.5]

    R-squared: 0.963                 Adjusted R-squared: 0.889

    """

    def __init__(
        self,
        fit_intercept=True,
    ):
        self.fit_intercept = fit_intercept

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _check_is_fitted(self, caller):
        """Raise ValueError if `fit` has not been called yet."""
        if not hasattr(self, 'coefficients'):
            raise ValueError(
                "This LinearRegression instance is not fitted yet."
                " Call 'fit' method with appropriate X and y"
                f" before using this {caller} function."
            )

    @staticmethod
    def _as_2d(X):
        """Return X as a float array with one row per observation."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # A flat vector is read as N observations of a single regressor.
            X = X.reshape(-1, 1)
        return X

    @staticmethod
    def _as_column(y):
        """Return y as a float array; a flat vector becomes an (N, 1) column."""
        y = np.asarray(y, dtype=float)
        if y.ndim == 1:
            # Without this, `y - prediction` would broadcast (N,) against
            # (N, 1) into an (N, N) matrix.
            y = y.reshape(-1, 1)
        return y

    def _design_matrix(self, X):
        """Return X with a leading column of ones when an intercept is fitted."""
        X = self._as_2d(X)
        if self.fit_intercept:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        return X

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def fit(self, X, y):
        """
        Fit linear regression model, simple OLS.

        Parameters
        ----------
        X : array of shape (N, k) (Training data)
        y : array of shape (N,) or (N, 1) (Target values)

        Returns
        -------
        self : object
            Fitted Estimator.

        Raises
        ------
        ValueError
            If X and y do not have the same number of observations.
        numpy.linalg.LinAlgError
            If X'X is singular (e.g. perfectly collinear regressors).
        """
        X = self._as_2d(X)
        y = self._as_column(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} observations but y has {y.shape[0]}."
            )

        self.n_features = X.shape[1]
        # Kept so that `summary` can report the in-sample fit without
        # relying on variables from the caller's namespace.
        self._X_train = X
        self._y_train = y

        design = self._design_matrix(X)
        n_obs, n_params = design.shape

        # (X'X)^-1 is needed for the coefficients, the standard errors and
        # the hat matrix, so invert once and reuse.
        xtx_inv = np.linalg.inv(design.T @ design)

        self.coefficients = (xtx_inv @ design.T @ y).T
        self.intercept = self.coefficients[:, 0] if self.fit_intercept else 0.0

        # Residuals are stored as predicted - observed (see class docstring).
        self.residuals = (self.coefficients @ design.T).T - y

        # Residual degrees of freedom: observations minus estimated
        # coefficients (the intercept column is already part of `design`).
        df = n_obs - n_params
        ssr = np.sum(self.residuals ** 2)
        self.var_hat = ssr / df if df > 0 else np.nan
        self.se_coefficients = np.sqrt(self.var_hat * np.diagonal(xtx_inv))

        # t- and two-sided p-values for H0: b=0. The survival function is
        # used instead of 1 - cdf to keep precision for very small p-values.
        self.t_values = self.coefficients / self.se_coefficients
        self.p_values = 2 * stats.t.sf(np.abs(self.t_values), df)

        self.hatmatrix = design @ xtx_inv @ design.T
        self.residualmakermatrix = np.identity(n_obs) - self.hatmatrix

        return self

    def predict(self, X):
        """
        Predict using the fitted parameters of this Linear Regression estimator.

        Parameters
        ----------
        X : array of shape (M, k)
            M observations of the k regressors. Don't add a column of ones
            for the intercept; it is added here when fit_intercept = True.

        Returns
        -------
        C : array of shape (M, 1)
            Returns predicted value.

        Raises
        ------
        ValueError
            If the estimator has not been fitted.
        """
        self._check_is_fitted('predict')
        design = self._design_matrix(X)
        return (self.coefficients @ design.T).T

    def R_squared(self, X, y):
        """Return the coefficient of determination of the prediction.

        The coefficient of determination R^2 is defined as one minus the
        residual sum of squares divided by the total sum of squares. The best
        possible R^2 score is 1.0, explaining all variance in the data. The
        R^2 score can be negative when no constant is included, or the model
        is made arbitrarily bad. A model always predicting the mean of y,
        disregarding the input features, gets a R^2 score of 0.0.

        Parameters
        ----------
        X : array of shape (N, k)
        y : array of shape (N,) or (N, 1)

        Returns
        -------
        R_squared : float
            R^2 score of predicted X wrt. true y

        Raises
        ------
        ValueError
            If the estimator has not been fitted.
        """
        self._check_is_fitted('score')

        y = self._as_column(y)
        ssr = np.sum((y - self.predict(X)) ** 2)
        sst = np.sum((y - y.mean()) ** 2)
        return float(1 - ssr / sst)

    def adjusted_R_squared(self, X, y):
        """Return the adjusted coefficient of determination of the prediction.

        Computed as 1 - (N-1)/(N-k-1) * (1 - R^2), with k the number of
        columns of X (intercept excluded). This allows penalty for #regressors.

        Parameters
        ----------
        X : array of shape (N, k)
        y : array of shape (N,) or (N, 1)

        Returns
        -------
        adjusted_R_squared : float
            Adjusted R^2 score of predicted X wrt. true y

        Raises
        ------
        ValueError
            If the estimator has not been fitted.
        ZeroDivisionError
            If N == k + 1, i.e. the penalty denominator is zero.
        """
        self._check_is_fitted('score')

        X = self._as_2d(X)
        n_obs, k = X.shape
        r2 = self.R_squared(X, y)
        return 1 - ((n_obs - 1) / (n_obs - k - 1)) * (1 - r2)

    def summary(self, decimals=3):
        """Print the table of the regression output for the fitted model.

        The coefficient table, the residuals and the in-sample (adjusted)
        R-squared are written to standard output; nothing is returned.

        Parameters
        ----------
        decimals : int, default=3
            Number of decimals to round to in the table.

        Raises
        ------
        ValueError
            If the estimator has not been fitted.
        """
        self._check_is_fitted('summary')

        print('Regression output:')

        # One row per coefficient: estimate, S.E., t-value, p-value.
        table = np.column_stack((
            np.ravel(self.coefficients),
            np.ravel(self.se_coefficients),
            np.ravel(self.t_values),
            np.ravel(self.p_values),
        ))

        n_regressors = len(table) - 1 if self.fit_intercept else len(table)
        labels = ['Intercept'] if self.fit_intercept else []
        labels += [f"X{i}" for i in range(1, n_regressors + 1)]

        # Column width grows with the requested precision.
        width = 10 if decimals < 4 else decimals + 7
        row_format = "{:<{L}} {:<{L2}} {:<{L}} {:<{L}} {:<{L}}"

        print(row_format.format('', 'Coefficient', 'S.E.', 't-value', 'p-value',
                                L=width, L2=width + 2))
        for label, row in zip(labels, table):
            estim, se, tval, pval = (round(float(v), decimals) for v in row)
            print(row_format.format(label, estim, se, tval, pval,
                                    L=width, L2=width + 2))

        print('\nResiduals:')
        print(self.residuals.T[0])

        r2 = round(self.R_squared(self._X_train, self._y_train), decimals)
        adj_r2 = round(
            self.adjusted_R_squared(self._X_train, self._y_train), decimals
        )
        # The spacing reproduces the layout this method has always printed.
        print(f'\nR-squared: {r2} ' + ' ' * 14
              + f'\t Adjusted R-squared: {adj_r2}')

## Examples

In [2]:
import numpy as np
from scipy import stats
# Toy data set: four observations of two regressors (X) and a 4x1 column target (y).
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = np.array([[0], [4], [0], [6]])

# `fit` returns the estimator itself, so construction and fitting can be chained.
regfit = LinearRegression(fit_intercept=True).fit(X, y)
# Fitted coefficients; the intercept is the first entry because fit_intercept=True.
regfit.coefficients

array([[-1.5, -4. ,  5. ]])

In [3]:
regfit.intercept

array([-1.5])

In [4]:
regfit.residuals

array([[-0.5],
       [ 0.5],
       [ 0.5],
       [-0.5]])

In [5]:
regfit.se_coefficients

array([1.6583124 , 1.41421356, 1.        ])

In [6]:
regfit.t_values

array([[-0.90453403, -2.82842712,  5.        ]])

In [7]:
regfit.p_values

array([[0.53188428, 0.2163469 , 0.12566592]])

In [8]:
# regfit.residuals == -(regfit.residualmakermatrix @ y): residuals are stored as predicted - observed, while M @ y = observed - predicted
regfit.residualmakermatrix

array([[ 0.25, -0.25, -0.25,  0.25],
       [-0.25,  0.25,  0.25, -0.25],
       [-0.25,  0.25,  0.25, -0.25],
       [ 0.25, -0.25, -0.25,  0.25]])

In [9]:
# regfit.predict(X) == regfit.hatmatrix @ y
regfit.hatmatrix

array([[ 0.75,  0.25,  0.25, -0.25],
       [ 0.25,  0.75, -0.25,  0.25],
       [ 0.25, -0.25,  0.75,  0.25],
       [-0.25,  0.25,  0.25,  0.75]])

In [10]:
regfit.n_features

2

In [11]:
regfit.predict(X)

array([[-0.5],
       [ 4.5],
       [ 0.5],
       [ 5.5]])

In [12]:
regfit.R_squared(X, y)

0.962962962962963

In [13]:
regfit.adjusted_R_squared(X,y)

0.8888888888888891

In [14]:
regfit.summary(3)

Regression output:
           Coefficient  S.E.       t-value    p-value   
Intercept  -1.5         1.658      -0.905     0.532     
X1         -4.0         1.414      -2.828     0.216     
X2         5.0          1.0        5.0        0.126     

Residuals:
[-0.5  0.5  0.5 -0.5]

R-squared: 0.963               	 Adjusted R-squared: 0.889
