# Assignment 1
## Feb. 13, 2023

In [1]:
import numpy as np
from numpy.polynomial.polynomial import Polynomial
import pandas as pd

## Set `np` seed

In [2]:
# Seed NumPy's global random state so the np.random draws below are reproducible
np.random.seed(1234)

### Function Definition

In [3]:
def generate_error(mean: float = 0,
                   sd: float = 1):
    """
    Draw one normally distributed error term.

    Parameters
    ----------
    mean : float
        Centre of the normal distribution (default 0).
    sd : float
        Standard deviation of the normal distribution (default 1).

    Returns
    -------
    float
        A single draw from N(mean, sd**2), taken from NumPy's global
        random state.
    """
    noise = np.random.normal(mean, sd)
    return noise
    

def generate_response(n: int,
                      lower_bound: float = -3,
                      upper_bound: float = 3,
                      **kwargs):
    """
    Simulate n (x, y) pairs with y = 8 * sin(x) + error.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    lower_bound, upper_bound : float
        Limits of the uniform distribution x is drawn from.
    **kwargs
        Forwarded unchanged to generate_error (e.g. mean, sd).

    Returns
    -------
    pd.DataFrame
        Two columns: "x" (the uniform draws) and "y" (the noisy response).

    Raises
    ------
    ValueError
        If lower_bound is greater than or equal to upper_bound.
    """
    if lower_bound >= upper_bound:
        raise ValueError("Lower bound must be lt upper_bound")

    # All x values are drawn first; the error terms are drawn afterwards,
    # one per observation
    x_values = np.random.uniform(lower_bound, upper_bound, n)

    y_values = []
    for x_value in x_values:
        signal = 8 * np.sin(x_value)
        y_values.append(signal + generate_error(**kwargs))

    return pd.DataFrame({"x": x_values, "y": y_values})


def fit_polynomial(training_set: pd.DataFrame,
                   test_set: pd.DataFrame,
                   degree: int,
                   return_fit: bool = False):
    """
    Fit a polynomial on the training data and score it on the test data.

    Parameters
    ----------
    training_set : pd.DataFrame
        Must contain 'x' and 'y' columns; used to fit the polynomial.
    test_set : pd.DataFrame
        Must contain 'x' and 'y' columns; used to compute the mean squared
        error. This frame is read only and is never modified.
    degree : int
        Degree of the polynomial to fit.
    return_fit : bool
        When True, also return the fitted coefficients.

    Returns
    -------
    float or tuple
        The test-set MSE, or (MSE, coefficients) when return_fit is True.
        Coefficients are strings rounded to 3 decimals, ordered from the
        constant term upwards.
    """
    training_fit = Polynomial.fit(x = training_set['x'],
                                  y = training_set['y'],
                                  deg = degree)
    # Polynomial.fit works in a scaled and shifted domain; convert() expresses
    # the coefficients in terms of the raw x values
    unshifted_fit = training_fit.convert()

    # Keep the predictions local instead of writing a 'yhat' column into the
    # caller's test_set, which is reused across several fits
    residuals = test_set['y'] - unshifted_fit(test_set['x'])
    mse = (residuals ** 2).mean()

    if return_fit:
        unshifted_fit_str = [str(round(coef, 3)) for coef in unshifted_fit.coef]
        return (mse, unshifted_fit_str)
    return mse

#### Fit degree-3 and degree-15 polynomials with training $n$ of 50 and test $n$ of 10000

In [4]:
# Small training sample, large held-out test sample
training_set = generate_response(n=50)
test_set = generate_response(n=10000)

# Fit degree 3 first, then degree 15, on the same data. With return_fit=True
# each result is an (MSE, coefficient strings) tuple.
degree_3_mse_1, degree_15_mse_1 = (
    fit_polynomial(training_set, test_set, degree=degree, return_fit=True)
    for degree in (3, 15)
)

#### Fit degree-3 and degree-15 polynomials with training $n$ of 10000 and test $n$ of 10000

In [5]:
# Large training sample and a fresh large test sample
training_set = generate_response(n=10000)
test_set = generate_response(n=10000)

# Fit degree 3 first, then degree 15, on the same data. With return_fit=True
# each result is an (MSE, coefficient strings) tuple.
degree_3_mse_2, degree_15_mse_2 = (
    fit_polynomial(training_set, test_set, degree=degree, return_fit=True)
    for degree in (3, 15)
)

#### Collect results

In [6]:
# Results in the same order as the "Degree" / "Training Set Size" rows below
all_fits = [
    degree_3_mse_1,
    degree_15_mse_1,
    degree_3_mse_2,
    degree_15_mse_2,
]

# Each fit is an (MSE, coefficient strings) pair; split it into two columns
data_dict = {
    "Degree": [3, 15, 3, 15],
    "Training Set Size": [50, 50, 10000, 10000],
    "MSE": [mse for mse, _ in all_fits],
    "Fitted Prediction Function Coeffs": [coeffs for _, coeffs in all_fits],
}
summary_output = pd.DataFrame(data_dict)

In [7]:
# Show the summary table built from the four fits
summary_output

Unnamed: 0,Degree,Training Set Size,MSE,Fitted Prediction Function Coeffs
0,3,50,1.584334,"[0.253, 7.352, -0.124, -0.887]"
1,15,50,3.780628,"[0.455, 7.277, -2.253, -0.2, 2.411, 0.09, -0.5..."
2,3,10000,1.171233,"[0.003, 7.028, -0.006, -0.79]"
3,15,10000,0.991594,"[0.029, 8.075, -0.143, -1.765, 0.161, 0.645, -..."


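- Based on the above, the best prediction rule $f$ among the four fits is the degree-15 polynomial trained on 10000 observations. This results in a test MSE of 0.991594, which is close to the noise variance of 1 (the error term has standard deviation 1).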
- The 4 MSEs for the combinations of Degree and Training Set Size are listed above. These make sense: small training sets lead to a large amount of variance, particularly when fitting high-degree polynomials, which tend to overfit (degree 15 with $n = 50$ has the largest MSE, 3.780628). Larger training sets reduce the variance across training sets, so the remaining error is mostly bias: the average prediction of the degree-3 polynomial is most likely farther from the true value than that of the degree-15 polynomial, which is why degree 3 still has the higher MSE at $n = 10000$ (1.171233 vs. 0.991594).