In [1]:
#----------------------------------------
#R2 (CoeffDet1 Sheet)
#----------------------------------------

import pandas as pd
import statsmodels.api as sm
from scipy.stats import f

# --- Function to compute critical F and decision ---
def calculate_critical_F_and_decision(F_statistic, n, k, significance_level):
    numerator_df = k
    denominator_df = n - k - 1
    critical_F = f.ppf(1 - significance_level, numerator_df, denominator_df)

    print(f"\nDegrees of freedom:")
    print(f"  Numerator (k): {numerator_df}")
    print(f"  Denominator (n-k-1): {denominator_df}")
    print(f"Critical F value at α={significance_level}: {critical_F:.4f}")
    print(f"Observed F statistic: {F_statistic:.4f}")

    if F_statistic > critical_F:
        print("Decision (Critical F comparison): Reject H₀ (significant relationship)")
        print("Because the F-test statistic is greater than the critical F-score, reject H₀ and conclude that at least one population coefficient of the independent variables is not equal to zero.")
    else:
        print("Decision (Critical F comparison): Fail to reject H₀ (no significant relationship)")

    return critical_F


# --- Step A: Read dataset ---
file_path = r"C:\Users\300393449\OneDrive - Douglas College\Documents\4th Semester\2_Business_Statistics_II\Python\DataScratch.xlsx"
sheet_name = "CoeffDet"

df = pd.read_excel(file_path, sheet_name=sheet_name)
print("Dataset:")
print(df)

# --- Step B: Prepare data and fit model ---
y = df['y']
X = df.drop(columns=['y'])
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# --- Step C: Compute SST, SSR, SSE ---
y_mean = y.mean()
SSR = sum((model.fittedvalues - y_mean) ** 2)   # Regression sum of squares
SSE = sum((y - model.fittedvalues) ** 2)        # Error sum of squares
SST = SSR + SSE                                 # Total sum of squares

print(f"\nSST (Total Sum of Squares)       : {SST:.4f}")
print(f"SSR (Regression Sum of Squares)  : {SSR:.4f}")
print(f"SSE (Error Sum of Squares)       : {SSE:.4f}")
print(f"R-squared from model             : {model.rsquared:.4f}")
print(f"Adjusted R-squared from model    : {model.rsquared_adj:.4f}")
print(f"R-squared from formula           : {SSR/SST:.4f}")

# --- Hypotheses ---
print("\nHypotheses:")
print("H₀: β₁ = β₂ = 0  (No relationship between the variables)")
print("H₁: At least one βᵢ ≠ 0  (Relationship exists between the variables)")

# --- Step D: Ask for significance level ---
significance_level = float(input("\nEnter significance level (e.g., 0.05): "))

# --- Step E: Calculate F-score manually ---
k = X.shape[1] - 1  # number of predictors (excluding constant)
n = len(y)         # number of observations
MSR = SSR / k      # Mean Squared Regression
MSE = SSE / (n - k - 1)  # Mean Squared Error
F_score_manual = MSR / MSE

# --- Step F: Get F-statistic and p-value from model ---
F_statistic = model.fvalue
p_value = model.f_pvalue

print(f"\nManual F-score: {F_score_manual:.4f}")
print(f"F-test Statistic (from model): {F_statistic:.4f}")
print(f"p-value: {p_value:.4f}")

# --- Step G: Decision using p-value ---
if p_value < significance_level:
    print(f"Decision (p-value): Reject H₀ at α={significance_level}")
    print("Since the p-value is less than alpha, reject the null hypothesis. The overall regression model is significant at the α = {:.2f} significance level.".format(significance_level))
else:
    print(f"Decision (p-value): Fail to reject H₀ at α={significance_level}")

# --- Step H: Decision using Critical F ---
calculate_critical_F_and_decision(F_statistic, n, k, significance_level)


Dataset:
    y  x1  x2
0  47  75  22
1  43  64  30
2  40  77  19
3  41  53  17
4  32  45  13
5  25  47  17
6  22  36   9
7  19  16  15
8  10  16   9

SST (Total Sum of Squares)       : 1284.0000
SSR (Regression Sum of Squares)  : 1130.7493
SSE (Error Sum of Squares)       : 153.2507
R-squared from model             : 0.8806
Adjusted R-squared from model    : 0.8409
R-squared from formula           : 0.8806

Hypotheses:
H₀: β₁ = β₂ = 0  (No relationship between the variables)
H₁: At least one βᵢ ≠ 0  (Relationship exists between the variables)



Enter significance level (e.g., 0.05):  .05



Manual F-score: 22.1353
F-test Statistic (from model): 22.1353
p-value: 0.0017
Decision (p-value): Reject H₀ at α=0.05
Since the p-value is less than alpha, reject the null hypothesis. The overall regression model is significant at the α = 0.05 significance level.

Degrees of freedom:
  Numerator (k): 2
  Denominator (n-k-1): 6
Critical F value at α=0.05: 5.1433
Observed F statistic: 22.1353
Decision (Critical F comparison): Reject H₀ (significant relationship)
Because the F-test statistic is greater than the critical F-score, reject H₀ and conclude that at least one population coefficient of the independent variables is not equal to zero.


5.143252849784718

In [9]:
#----------------------------------------
#R2 (CoeffDet2 Sheet)
#----------------------------------------

import pandas as pd
from scipy import stats

# Load Excel file
file_path = r"C:\Users\300393449\OneDrive - Douglas College\Documents\4th Semester\2_Business_Statistics_II\Python\DataScratch.xlsx"
sheet_name = "CoeffDet2"

# Read data
df = pd.read_excel(file_path, sheet_name=sheet_name)
print("Dataset:")
print(df)

# Extract values from the table
df_reg = df.loc[df['Source'] == 'Regression', 'df'].values[0]
df_res = df.loc[df['Source'] == 'Residual', 'df'].values[0]
SS_reg = df.loc[df['Source'] == 'Regression', 'SS'].values[0]
SS_tot = df.loc[df['Source'] == 'Total', 'SS'].values[0]
F_stat = df.loc[df['Source'] == 'Regression', 'F'].values[0]

# Compute statistics
sample_size = df_reg + df_res + 1   # +1 because Total df = N - 1
num_independent_vars = df_reg
R2 = SS_reg / SS_tot
p_value = 1 - stats.f.cdf(F_stat, df_reg, df_res)  # p-value from F distribution
adj_R2 = 1 - (1 - R2) * ((df_res) / (df_res - num_independent_vars))

# Ask for significance level
alpha = float(input("Enter the significance level (alpha): "))

# Print results
print("\n--- Regression Model Summary ---")
print(f"Sample size: {sample_size}")
print(f"Number of independent variables: {num_independent_vars}")
print(f"R-squared: {R2:.4f}")
print(f"F-statistic: {F_stat}")
print(f"p-value: {p_value:.4e}")
print(f"Adjusted R-squared: {adj_R2:.4f}")

# Hypothesis test result
if p_value < alpha:
    print(f"Reject the null hypothesis: The model is significant at alpha = {alpha}")
else:
    print(f"Fail to reject the null: The model is NOT significant at alpha = {alpha}")


Dataset:
       Source  df    SS      MS     F
0  Regression   5  5625  1125.0  15.0
1    Residual  25  1875    75.0   NaN
2       Total  30  7500     NaN   NaN


Enter the significance level (alpha):  .05



--- Regression Model Summary ---
Sample size: 31
Number of independent variables: 5
R-squared: 0.7500
F-statistic: 15.0
p-value: 7.6993e-07
Adjusted R-squared: 0.6875
Reject the null hypothesis: The model is significant at alpha = 0.05


In [4]:
#----------------------------------------
#REGRESSION MODEL + R2 (Reg_Model Sheet)
#----------------------------------------
import pandas as pd
import statsmodels.api as sm
from scipy.stats import f, t

# Step 1: Load dataset
file_path = r"C:\Users\300393449\OneDrive - Douglas College\Documents\4th Semester\2_Business_Statistics_II\Python\DataScratch.xlsx"
sheet_name = "Reg_Model"
df = pd.read_excel(file_path, sheet_name=sheet_name)
print("Dataset:")
print(df)

# Step 2: Build model with all predictors
y = df['y']
X = df.drop(columns=['y'])
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

print("\n= Regression Summary =")
print(model.summary())

# Step 3: Model statistics
r2 = model.rsquared
adj_r2 = model.rsquared_adj
f_stat = model.fvalue
f_pvalue = model.f_pvalue
print(f"\nR-squared: {r2:.4f}")
print(f"Adjusted R-squared: {adj_r2:.4f}")
print(f"F-statistic: {f_stat:.4f}")
print(f"F-test p-value: {f_pvalue:.4f}")

# Step 4: Ask for significance level
while True:
    try:
        alpha = float(input("\nEnter significance level (e.g., 0.05 for 5%): "))
        if 0 < alpha < 1:
            break
        else:
            print("Please enter a decimal between 0 and 1.")
    except ValueError:
        print("Invalid input. Try again.")

# Step 5: Hypotheses for overall F-test
print("\nOverall Model Hypotheses:")
print("H0: β1 = β2 = β3 = ... = 0 (No relationship between the variables)")
print("H1: At least one βi ≠ 0 (Relationship exists between the variables)")

# Degrees of freedom
df_model = int(model.df_model)      # numerator df
df_resid = int(model.df_resid)      # denominator df

# F-critical value
f_critical = f.ppf(1 - alpha, df_model, df_resid)

print(f"\nF-test statistic: {f_stat:.4f}")
print(f"F-critical value ({(1-alpha)*100:.0f}% confidence): {f_critical:.4f}")
print(f"F-test p-value: {f_pvalue:.4f}")
print(f"Degrees of freedom: df1 = {df_model}, df2 = {df_resid}")

if f_stat > f_critical:
    print(f"Decision (critical value method): Reject H0 (F = {f_stat:.4f} > {f_critical:.4f})")
else:
    print(f"Decision (critical value method): Fail to reject H0 (F = {f_stat:.4f} ≤ {f_critical:.4f})")

if f_pvalue < alpha:
    print(f"Decision (p-value method): Reject H0 at alpha = {alpha}.")
    print("Conclusion: At least one predictor is significantly related to the response variable.")
else:
    print(f"Decision (p-value method): Fail to reject H0 at alpha = {alpha}.")
    print("Conclusion: No significant evidence that predictors relate to the response variable.")

# Step 6: t-tests for individual coefficients
print("\n= Individual Coefficient t-tests =")
params = model.params        # Coefficient estimates
t_stats = model.tvalues      # t-statistics
p_values = model.pvalues     # p-values
df_t = df_resid               # degrees of freedom for t
t_critical = t.ppf(1 - alpha/2, df_t)

for var in params.index:
    t_val = t_stats[var]
    p_val = p_values[var]
    print(f"\nVariable: {var}")
    print(f"  Coefficient Estimate: {params[var]:.4f}")
    print(f"  t-statistic: {t_val:.4f}")
    print(f"  t-critical ({(1-alpha)*100:.0f}% CI): {t_critical:.4f}")
    print(f"  p-value: {p_val:.4f}")
    if abs(t_val) > t_critical:
        print(f"  Decision (critical value method): Reject H0 (|t| = {abs(t_val):.4f} > {t_critical:.4f})")
    else:
        print(f"  Decision (critical value method): Fail to reject H0")
    if p_val < alpha:
        print(f"  Decision (p-value method): Reject H0 at alpha = {alpha}")
    else:
        print(f"  Decision (p-value method): Fail to reject H0")

# Step 6B: Confidence Interval for selected predictor
print("\nAvailable predictors (from your data):")
for var in model.params.index:
    if var != "const":   # skip intercept
        print(f" - {var}")

pred_name = input("\nEnter the predictor name you want the CI for (e.g., x1, x2, x3): ").strip()

if pred_name in model.params.index:
    coef = model.params[pred_name]
    se   = model.bse[pred_name]
    t_crit = t.ppf(1 - alpha/2, df_resid)

    LCL = coef - t_crit * se
    UCL = coef + t_crit * se

    print(f"\n{(1-alpha)*100:.0f}% Confidence Interval for {pred_name} coefficient:")
    print(f"LCL: {LCL:.2f}")
    print(f"UCL: {UCL:.2f}")

    # Interpretation for slope meaning
    print("\nInterpretation (Effect meaning):")
    print(f"If all other variables are held constant, one can be {(1-alpha)*100:.0f}% "
          f"confident that changing {pred_name} by 1 unit will change the response variable "
          f"by between {LCL:.2f} and {UCL:.2f} units.")

    # Significance interpretation (Part e)
    print("\nInterpretation (Statistical significance):")
    if LCL <= 0 <= UCL:
        print(f"Since the confidence interval includes 0, there is insufficient evidence "
              f"that the true population coefficient for {pred_name} is not 0, "
              f"which indicates there is not a statistically significant relationship between "
              f"{pred_name} and the response variable.")
    else:
        print(f"Since the confidence interval does not include 0, there is sufficient evidence "
              f"that the true population coefficient for {pred_name} is not 0, "
              f"indicating a statistically significant relationship between "
              f"{pred_name} and the response variable.")
else:
    print(f"Predictor '{pred_name}' not found in model.")


# Step 7: CI & PI using mean predictor values
mean_values = X.mean()
new_df = pd.DataFrame([mean_values])
pred_frame = model.get_prediction(new_df).summary_frame(alpha=alpha)

mean_pred = round(pred_frame['mean'][0], 2)
mean_ci_lower = round(pred_frame['mean_ci_lower'][0], 2)
mean_ci_upper = round(pred_frame['mean_ci_upper'][0], 2)
pred_ci_lower = round(pred_frame['obs_ci_lower'][0], 2)
pred_ci_upper = round(pred_frame['obs_ci_upper'][0], 2)

print(f"\nPrediction at mean predictor values: {mean_pred}")
print(f"{(1-alpha)*100:.0f}% Confidence Interval (mean y): {mean_ci_lower} to {mean_ci_upper}")
print(f"{(1-alpha)*100:.0f}% Prediction Interval (individual y): {pred_ci_lower} to {pred_ci_upper}")

print(f"\nMeaning of CI: We are {(1-alpha)*100:.0f}% confident that the average value of y "
      f"for the mean predictor values lies between {mean_ci_lower} and {mean_ci_upper}.")
print(f"Meaning of PI: For an individual case with mean predictor values, "
      f"we expect y to fall between {pred_ci_lower} and {pred_ci_upper} {(1-alpha)*100:.0f}% of the time.")


Dataset:
         y  x1    x2
0   0.2454   1  60.0
1   0.2569   1  60.0
2   0.4984   1  60.0
3   0.2531   2  60.0
4   0.3143   2  60.0
5   0.5316   2  60.0
6   0.5612   3  60.0
7   1.0346   3  60.0
8   1.3985   3  60.0
9   0.2466   1  72.5
10  0.2581   1  72.5
11  0.2566   1  72.5
12  0.2437   2  72.5
13  0.2703   2  72.5
14  0.4845   2  72.5
15  0.3027   3  72.5
16  0.8672   3  72.5
17  1.2242   3  72.5
18  0.2519   1  85.0
19  0.2546   1  85.0
20  0.2496   1  85.0
21  0.2545   2  85.0
22  0.2562   2  85.0
23  0.2904   2  85.0
24  0.2504   3  85.0
25  0.2528   3  85.0
26  0.3007   3  85.0

= Regression Summary =
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.432
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     9.137
Date:                Thu, 14 Aug 2025   Prob (F-statistic):    


Enter significance level (e.g., 0.05 for 5%):  .05



Overall Model Hypotheses:
H0: β1 = β2 = β3 = ... = 0 (No relationship between the variables)
H1: At least one βi ≠ 0 (Relationship exists between the variables)

F-test statistic: 9.1367
F-critical value (95% confidence): 3.4028
F-test p-value: 0.0011
Degrees of freedom: df1 = 2, df2 = 24
Decision (critical value method): Reject H0 (F = 9.1367 > 3.4028)
Decision (p-value method): Reject H0 at alpha = 0.05.
Conclusion: At least one predictor is significantly related to the response variable.

= Individual Coefficient t-tests =

Variable: const
  Coefficient Estimate: 0.9023
  t-statistic: 2.4497
  t-critical (95% CI): 2.0639
  p-value: 0.0220
  Decision (critical value method): Reject H0 (|t| = 2.4497 > 2.0639)
  Decision (p-value method): Reject H0 at alpha = 0.05

Variable: x1
  Coefficient Estimate: 0.2041
  t-statistic: 3.4300
  t-critical (95% CI): 2.0639
  p-value: 0.0022
  Decision (critical value method): Reject H0 (|t| = 3.4300 > 2.0639)
  Decision (p-value method): Reject H0 


Enter the predictor name you want the CI for (e.g., x1, x2, x3):  x1



95% Confidence Interval for x1 coefficient:
LCL: 0.08
UCL: 0.33

Interpretation (Effect meaning):
If all other variables are held constant, one can be 95% confident that changing x1 by 1 unit will change the response variable by between 0.08 and 0.33 units.

Interpretation (Statistical significance):
Since the confidence interval does not include 0, there is sufficient evidence that the true population coefficient for x1 is not 0, indicating a statistically significant relationship between x1 and the response variable.

Prediction at mean predictor values: 0.43
95% Confidence Interval (mean y): 0.33 to 0.53
95% Prediction Interval (individual y): -0.1 to 0.96

Meaning of CI: We are 95% confident that the average value of y for the mean predictor values lies between 0.33 and 0.53.
Meaning of PI: For an individual case with mean predictor values, we expect y to fall between -0.1 and 0.96 95% of the time.
