In [2]:
import pandas as pd
# read the cleaned origin-basis table; the "Date" column becomes the row index
data = pd.read_csv(
    "Origin_Basis_CalculatedcleanedData.csv",
    index_col="Date",
)
# bare last expression so the notebook renders the frame
data

Unnamed: 0_level_0,"Ayr, ND","Alberta, MN","Jasper, MN","Ida Grove, IA","St. Joseph, MO",Barreirias,Sorriso,Rio Verde,Ponta Grossa,Rondonopolis
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013-01-02,-0.5350,-0.2050,-0.4050,-0.3350,0.0850,1.6550,-2.3850,-1.2450,1.0550,-0.8050
2013-01-07,-0.5675,-0.4775,-0.4875,-0.3875,-0.0075,0.3125,-3.4675,-0.9075,0.6325,-2.1775
2013-01-08,-0.6175,-0.5275,-0.5375,-0.4375,-0.0575,-0.2975,-3.5175,-1.7975,-0.2375,-2.4675
2013-01-11,-0.8175,-0.7675,-0.7675,-0.6775,-0.2975,-0.0175,-3.6975,-1.7275,-0.4075,-2.8875
2013-01-14,-0.7175,-0.5675,-0.6675,-0.5775,-0.1975,-0.1575,-3.8775,-2.3975,-0.6475,-2.7475
...,...,...,...,...,...,...,...,...,...,...
2019-12-26,-0.9975,-0.6875,-0.5675,-0.5375,-0.2675,-0.1075,-0.6275,-0.0175,0.0125,-0.2575
2019-12-27,-0.9950,-0.6850,-0.5450,-0.5050,-0.1850,0.0350,-0.5250,0.0850,0.0950,-0.1550
2019-12-30,-0.9950,-0.6450,-0.5450,-0.4950,-0.2050,0.0150,-0.5450,0.0650,0.1650,-0.1750
2020-01-02,-0.8825,-0.6325,-0.5325,-0.5025,-0.2425,-0.0725,-0.6325,-0.0225,0.0975,-0.2625


In [3]:
import pandas as pd
from stats import *
import numpy as np
from scipy.stats import t, f

class Regression:
    """Ordinary least squares (OLS) estimator built on the normal equations.

    Call ``OLS`` to run a regression. After the call the instance exposes the
    fitted pieces as attributes (``betas``, ``y_hat``, ``estimates``,
    ``cov_matrix``, ``stats_DF``, ...) and stores a copy of the result tables
    in ``reg_history[reg_name]`` under the keys "Reg Stats", "Estimates" and
    "Cov Matrix".
    """

    def __init__(self):
        # stats comes from the star-import of the local stats module; the
        # attribute is kept so existing code reading reg.stats keeps working,
        # the sums of squares below are computed with numpy instead
        self.stats = stats()
        # maps reg_name -> dict of result tables, filled by save_output
        self.reg_history = {}

    def OLS(self, reg_name, data, y_name, beta_names, min_val = 0,
                max_val = None, constant = True):
        """Estimate y = X * beta by OLS and record the results.

        Parameters
        ----------
        reg_name : hashable
            Key under which the results are stored in ``reg_history``.
        data : pandas.DataFrame
            Source data; it is never modified, a copy of the selected rows
            is stored in ``self.data``.
        y_name : list of str or str
            Name of the endogenous variable. A one-element list is the
            original convention; a bare string is also accepted.
        beta_names : list of str
            Names of the exogenous variables. The list is copied, so the
            caller's list is not changed when the constant is added.
        min_val, max_val : int, optional
            Positional row window ``[min_val, max_val)`` used for the
            estimation. ``max_val=None`` means "to the end of data".
        constant : bool, optional
            If True a column of ones named "Constant" is added to X.
        """
        self.min_val = min_val
        self.max_val = len(data) if max_val is None else max_val
        self.reg_name = reg_name
        #y_name[0] is used to label output, so always hold it as a list
        self.y_name = [y_name] if isinstance(y_name, str) else list(y_name)
        #copy so that appending "Constant" never leaks into the caller's list
        self.beta_names = list(beta_names)
        #restrict to the requested rows so that y, X and the estimator
        #column all cover exactly the same observations
        self.data = data.iloc[self.min_val:self.max_val].copy()
        if constant:
            self.add_constant()
        self.build_matrices()
        self.estimate_betas_and_yhat()
        self.calculate_regression_stats()
        self.save_output()

    def add_constant(self):
        """Add the intercept column to the data and to the regressor names."""
        self.data["Constant"] = 1
        #guard against a list that already names the constant
        if "Constant" not in self.beta_names:
            self.beta_names.append("Constant")

    def build_matrices(self):
        """Build y, X and the normal-equation products X'X**-1 and X'y."""
        #transform dataframes to matrices; y is n X 1, X is n X k
        self.y = np.matrix(self.data[self.y_name])
        self.X = np.matrix(self.data[self.beta_names])
        self.X_transpose = self.X.getT()
        #(X'X)**-1
        X_transp_X = np.matmul(self.X_transpose, self.X)
        self.X_transp_X_inv = X_transp_X.getI()
        #X'y
        self.X_transp_y = np.matmul(self.X_transpose, self.y)

    def estimate_betas_and_yhat(self):
        """Solve for the coefficients and the fitted values."""
        #betas = (X'X)**-1 * X'y
        self.betas = np.matmul(self.X_transp_X_inv, self.X_transp_y)
        #y_hat = X * betas
        self.y_hat = np.matmul(self.X, self.betas)
        #create a column that holds y_hat values
        self.data[self.y_name[0] + " estimator"] = \
            np.asarray(self.y_hat).ravel()
        #create a table that holds the estimated coefficient
        #as well as the standard errors, tstats,and pvalues
        self.estimates = pd.DataFrame(np.asarray(self.betas),
                                      index = self.beta_names,
                                      columns = ["Coefficient"])
        #id y variable in index
        self.estimates.index.name = "y = " + self.y_name[0]

    def calculate_regression_stats(self):
        """Run every summary-statistic step in dependency order."""
        self.sum_square_stats()
        self.calculate_degrees_of_freedom()
        self.calculate_estimator_variance()
        self.calculate_covariance_matrix()
        self.calculate_t_p_error_stats()
        self.calculate_root_MSE()
        self.calculate_rsquared()
        self.calculate_fstat()
        self.build_stats_DF()

    def sum_square_stats(self):
        """Compute the regression, error and total sums of squares."""
        y = np.asarray(self.y, dtype = float).ravel()
        y_hat = np.asarray(self.y_hat, dtype = float).ravel()
        mean_y = y.mean()
        #ssr is sum of squared distances between the estimated y values
        #(y_hat) and the average of y values (y_bar or simply mean_y)
        self.ssr = float(np.sum((y_hat - mean_y) ** 2))
        #sse is the sum of squared residuals
        self.sse = float(np.sum((y - y_hat) ** 2))
        #sst is the sum of squared distances between y and its mean
        self.sst = float(np.sum((y - mean_y) ** 2))

    def calculate_degrees_of_freedom(self):
        """Count observations and the degrees of freedom left after fitting."""
        #df compares the num of observs to the num
        #exog variables used to form the prediction
        self.lost_degrees_of_freedom = len(self.estimates)
        #count the rows actually used rather than deriving it from the
        #window bounds, which over-counted by one
        self.num_obs = self.y.shape[0]
        self.degrees_of_freedom =  self.num_obs - self.lost_degrees_of_freedom

    def calculate_estimator_variance(self):
        """Estimate the error variance as sse / degrees of freedom."""
        #estimator variance is the sse normalized by the degrees of freedom
        #so it increases as the number of exog vars used in estimating increase
        #that is, as degrees of freedom fall
        self.estimator_variance = self.sse / self.degrees_of_freedom

    def calculate_covariance_matrix(self):
        """Build the coefficient covariance matrix as a labelled DataFrame."""
        #cov matrix will be used to estimate standard errors for ea coeff
        #estimator var * (X'X)**-1 is the cov matrix
        cov = float(self.estimator_variance) * np.asarray(self.X_transp_X_inv)
        self.cov_matrix = pd.DataFrame(cov,
                                       columns = self.beta_names,
                                       index = self.beta_names)

    def calculate_t_p_error_stats(self):
        """Add SE, t-stat, p-value and significance stars to ``estimates``."""
        self.rating_dict = {.05:"*",
                       .01:"**",
                       .001: "***"}
        results = self.estimates
        #whole-column assignment; writing through results.loc[var][col] is
        #chained indexing and can silently write to a temporary copy
        #SE of coefficient is found in the diagonal of cov_matrix
        results["SE"] = np.sqrt(np.diag(self.cov_matrix.to_numpy()))
        #tstat = Coeff / SE
        results["t-stat"] = results["Coefficient"] / results["SE"]
        #two-sided p-value from the t distribution with n - k degrees
        #of freedom
        results["p-value"] = np.round(
            t.sf(np.abs(results["t-stat"].to_numpy()),
                 self.degrees_of_freedom) * 2, 5)
        #values for signifiances will be blank unless p-value < .05
        #thresholds are visited from loosest to strictest so the strictest
        #one that is met decides the label
        thresholds = sorted(self.rating_dict, reverse = True)
        significance = []
        for p_value in results["p-value"]:
            stars = ""
            for threshold in thresholds:
                if p_value < threshold:
                    stars = self.rating_dict[threshold]
            significance.append(stars)
        results["significance"] = significance

    def calculate_root_MSE(self):
        """Root mean squared error: square root of the estimator variance."""
        self.root_mse = self.estimator_variance ** (1/2)

    def calculate_rsquared(self):
        """Compute r**2 and the degrees-of-freedom adjusted r**2."""
        self.r_sq = self.ssr / self.sst
        self.adj_r_sq = 1 - self.sse / self.degrees_of_freedom / (self.sst\
                             / (self.num_obs - 1))

    def calculate_fstat(self):
        """Compute the f-stat for joint significance of the regressors."""
        #one of the k estimates is taken to be the constant, leaving k - 1
        #restrictions; NOTE(review): this form presumes constant = True
        numerator_dof = self.lost_degrees_of_freedom - 1
        if numerator_dof > 0:
            self.f_stat = (self.sst - self.sse) / numerator_dof \
                / self.estimator_variance
        else:
            #a single-regressor model has nothing to test jointly
            self.f_stat = np.nan

    def build_stats_DF(self):
        """Collect the summary statistics into a one-column DataFrame."""
        #the "MSE" key holds the root MSE; the key is left unchanged so
        #existing lookups keep working
        stats_dict = {"r**2": [self.r_sq],
                      "Adj. r**2": [self.adj_r_sq],
                      "f-stat": [self.f_stat],
                      "EST Var": [self.estimator_variance],
                      "MSE": [self.root_mse],
                      "SSE": [self.sse],
                      "SSR": [self.ssr],
                      "SST": [self.sst],
                      "Obs.": [self.num_obs],
                      "DOF":[self.degrees_of_freedom]}
        self.stats_DF = pd.DataFrame(stats_dict)
        self.stats_DF = self.stats_DF.rename(index={0:"Estimation Statistics"})
        self.stats_DF = self.stats_DF.T

    def save_output(self):
        """Store copies of the result tables under ``reg_history[reg_name]``."""
        self.reg_history[self.reg_name] = {}
        self.reg_history[self.reg_name]["Reg Stats"] = self.stats_DF.copy()
        self.reg_history[self.reg_name]["Estimates"]= self.estimates.copy()
        self.reg_history[self.reg_name]["Cov Matrix"] = self.cov_matrix.copy()

In [4]:
# Regression is defined in the cell above. If the class is moved into a
# regression.py module, import it instead: from regression import Regression
reg = Regression()
# Regression defines no __str__/__repr__ in this notebook, so this prints
# the default object representation
print(reg)

You created an instance of stats()
<__main__.Regression object at 0x0000020852F6ED00>
