# Regression modeling for the Increasing Returns paper

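This Jupyter notebook fits polynomial regression models of degrees 1–4, using Code Health as the predictor of the number of defects and of time-in-development, and compares how well each degree fits the data.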

In [3]:
# Import libraries and configure this notebook
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# Read the study data set (comma-separated, '.' as the decimal mark) and
# display summary statistics for its numeric columns.
increasing_returns_df = pd.read_csv(
    './data/increasing_returns.csv',
    sep=',',
    decimal='.',
)
increasing_returns_df.describe()

Unnamed: 0,code_health,lead_time_minutes,total_defects
count,46211.0,46211.0,46211.0
mean,9.604727,7501.023,0.49683
std,1.070784,19808.85,2.089526
min,1.0,60.0,0.0
25%,9.75,1380.0,0.0
50%,10.0,3360.0,0.0
75%,10.0,7680.0,0.0
max,10.0,1452780.0,110.0


In [6]:
# Definitions ############################################
# Defect models: Code Health is the predictor, total defects the response.
x_bugs = increasing_returns_df['code_health'].to_numpy()
y_bugs = increasing_returns_df['total_defects'].to_numpy()
# Column-vector view (n, 1) of the predictor.
x_bugs_reshaped = x_bugs.reshape(-1, 1)

# Time models: only rows with a recorded lead time are used, so the filter is
# computed once and applied to both the predictor and the response.
lead_time_known = increasing_returns_df['lead_time_minutes'].notna()
timed_rows_df = increasing_returns_df[lead_time_known]
x_time = timed_rows_df['code_health'].to_numpy()
y_time = timed_rows_df['lead_time_minutes'].to_numpy()
x_time_reshaped = x_time.reshape(-1, 1)

## Selecting polynomial degree

We explore polynomial regression models of degree one through four.

### First degree for defects

In [7]:
# Fit ###############################
# Least-squares fit of a 1st-degree polynomial of total defects on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_bugs, y_bugs, deg=1, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_bugs_a, PolReg_bugs_b = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_bugs)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_bugs, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_bugs, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_bugs, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.031893007852112576
MSE: 4.226777325185138
RMSE: 2.055912771784138
MAE: 0.7564276120435925


### Second degree for defects

In [8]:
# Fit ###############################
# Least-squares fit of a 2nd-degree polynomial of total defects on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_bugs, y_bugs, deg=2, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_bugs_a, PolReg_bugs_b, PolReg_bugs_c = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_bugs)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_bugs, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_bugs, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_bugs, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)



R²: 0.03201576755574176
MSE: 4.226241353504369
RMSE: 2.0557824188139096
MAE: 0.757431777172496


### Third degree for defects

In [9]:
# Fit ###############################
# Least-squares fit of a 3rd-degree polynomial of total defects on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_bugs, y_bugs, deg=3, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_bugs_a, PolReg_bugs_b, PolReg_bugs_c, PolReg_bugs_d = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_bugs)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_bugs, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_bugs, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_bugs, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.03818945337036017
MSE: 4.199286899682946
RMSE: 2.0492161671436584
MAE: 0.7477663699881618


### Fourth degree for defects

In [10]:
# Fit ###############################
# Least-squares fit of a 4th-degree polynomial of total defects on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_bugs, y_bugs, deg=4, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
(
    PolReg_bugs_a,
    PolReg_bugs_b,
    PolReg_bugs_c,
    PolReg_bugs_d,
    PolReg_bugs_e,
) = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_bugs)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_bugs, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_bugs, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_bugs, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)

R²: 0.038305116128036
MSE: 4.198781913430933
RMSE: 2.0490929489486156
MAE: 0.7487997693111975


### First degree for time-in-development

In [11]:
# Fit ###############################
# Least-squares fit of a 1st-degree polynomial of lead time (minutes) on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_time, y_time, deg=1, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_time_a, PolReg_time_b = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_time)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_time, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_time, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_time, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.005475034414668789
MSE: 390233645.82924205
RMSE: 19754.332330636793
MAE: 7202.704334439978


### Second degree for time-in-development

In [12]:
# Fit ###############################
# Least-squares fit of a 2nd-degree polynomial of lead time (minutes) on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_time, y_time, deg=2, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_time_a, PolReg_time_b, PolReg_time_c = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_time)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_time, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_time, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_time, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.005941675929300283
MSE: 390050544.1215411
RMSE: 19749.69731721327
MAE: 7209.650392941731


### Third degree for time-in-development

In [13]:
# Fit ###############################
# Least-squares fit of a 3rd-degree polynomial of lead time (minutes) on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_time, y_time, deg=3, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
PolReg_time_a, PolReg_time_b, PolReg_time_c, PolReg_time_d = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_time)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_time, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_time, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_time, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.006051479304528429
MSE: 390007459.25899625
RMSE: 19748.60651435934
MAE: 7209.310835179671


### Fourth degree for time-in-development

In [14]:
# Fit ###############################
# Least-squares fit of a 4th-degree polynomial of lead time (minutes) on Code Health.
# `p` holds the coefficients (highest power first); `cov` is their covariance matrix.
p, cov = np.polyfit(x_time, y_time, deg=4, cov=True)
# Coefficients rounded to three decimals (np.round works element-wise);
# `p` itself stays unrounded and is what the evaluation below uses.
(
    PolReg_time_a,
    PolReg_time_b,
    PolReg_time_c,
    PolReg_time_d,
    PolReg_time_e,
) = np.round(p, 3)

# Evaluate fit ###############################
# All metrics are in-sample: predictions are made on the observations used for fitting.
# r2_score / mean_squared_error / mean_absolute_error come from the notebook's import cell.
y_pred_full = np.polyval(p, x_time)

# R²: proportion of the variance in the dependent variable that is
# predictable from the independent variable.
r_squared = r2_score(y_time, y_pred_full)
# MSE: average of the squared residuals; RMSE: its square root.
mse = mean_squared_error(y_time, y_pred_full)
rmse = np.sqrt(mse)
# MAE: average absolute difference between predictions and observations.
mae = mean_absolute_error(y_time, y_pred_full)

print('R²:', r_squared)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

# Prediction ###############################
# Evaluate the fitted polynomial at the integer Code Health scores 1..10.
x_new = np.arange(1, 11)
x_new_reshaped = x_new.reshape(-1, 1)
y_pred = np.polyval(p, x_new)


R²: 0.006053296885630499
MSE: 390006746.07297826
RMSE: 19748.58845773485
MAE: 7209.072792540109


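We decide to continue with the third-degree polynomial: for defects, R² rises from about 0.032 (degrees one and two) to about 0.038 at degree three, while a fourth degree adds almost nothing for either defects or time-in-development.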