# Wiggly curves + Prediction + Bias-variance tradeoff

In [8]:
# !pip uninstall -y matplotlib
# !pip install matplotlib
!pip uninstall -y pandas
!pip install lxml
!pip install pandas

Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Successfully uninstalled pandas-2.2.3
Collecting lxml
  Downloading lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl (8.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m31m22.2 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: lxml
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-datareader 0.10.0 requires pandas>=0.23, which is not installed.
yfinance 0.2.44 requires pandas>=1.3.0, which is not installed.[0m[31m
[0mSuccessfully installed lxml-5.3.0
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Installing collected packages: pandas
Successfully installed pandas-2.2.3


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Read in the ERCOT power demand data

In [10]:
# Load the ERCOT dataset from a CSV file; the path is relative to the notebook's working directory.
# Later cells read the 'KHOU' column (plotted as temperature) and the 'COAST' column
# (plotted as power demand) from this frame.
data = pd.read_csv('../data/ERCOT_data.csv')

AttributeError: module 'pandas' has no attribute 'read_csv'

Scatter plot of power demand versus temperature

In [None]:
# Predictor and response columns used by every cell below
x_data, y_data = data['KHOU'], data['COAST']

# Draw the raw observations as small, semi-transparent points on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(x_data, y_data, s=7, alpha=0.25, c="black")
ax.set(
    xlabel='Temperature (F)',
    ylabel='Power demand (GW)',
    title='Scatter Plot of COAST vs KHOU',
)
plt.show()

In [None]:
# Least-squares straight line (polynomial of degree 1).
# np.polyfit returns the highest-order coefficient first, i.e. (slope, intercept).
slope, intercept = np.polyfit(x_data, y_data, 1)

# Evaluate the fitted line on an evenly spaced grid from 20 to 110 for plotting
x = np.linspace(20, 110, 500)
line = slope * x + intercept

# Original observations in the background
plt.scatter(x_data, y_data, s=7, alpha=0.25, c="gray")

# Best first-order fit on top
plt.plot(x, line, label='1st order', color="blue", lw=2)
plt.xlabel('Temperature (F)')
plt.ylabel('Power demand (GW)')
plt.title('Scatter Plot of COAST vs KHOU')

# The line carries a label, which is only drawn once a legend is requested
plt.legend()
# Render the figure explicitly so the cell does not echo the repr of the last call
plt.show()

In [None]:
def plot_polyfit(x_data, y_data, degree, x_min=20, x_max=110, n_points=500):
    """Fit a polynomial to the data and plot it over a scatter of the observations.

    Parameters
    ----------
    x_data, y_data : array-like
        Paired observations; passed unchanged to ``np.polyfit`` and ``plt.scatter``.
    degree : int
        Degree of the least-squares polynomial.
    x_min, x_max : float, optional
        End points of the grid on which the fitted curve is drawn. The defaults
        (20 and 110) are the values that were previously hard-coded.
    n_points : int, optional
        Number of evenly spaced grid points used to draw the curve (default 500).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Least-squares coefficients, wrapped in a callable polynomial
    coeffs = np.polyfit(x_data, y_data, degree)
    poly_func = np.poly1d(coeffs)

    # Evaluate the fitted polynomial on an evenly spaced grid
    x = np.linspace(x_min, x_max, n_points)
    y_fit = poly_func(x)

    # Observations in the background, fitted curve on top
    plt.scatter(x_data, y_data, s=7, alpha=0.25, c="gray")
    plt.plot(x, y_fit, label=f'{degree} degree fit', color="blue", lw=2)

    # Labels and title are set exactly once. Previously a second block of calls
    # overwrote the degree-specific title, so every figure carried the same title.
    plt.xlabel('Temperature (F)')
    plt.ylabel('Power demand (GW)')
    plt.title(f"Polynomial Fit of Degree {degree}")
    plt.legend()
    plt.show()

# Draw one figure per polynomial degree: linear (1), quadratic (2) and quintic (5)
for fit_degree in (1, 2, 5):
    plot_polyfit(x_data, y_data, fit_degree)

In [None]:
# Fraction of the observations held out for testing
test_ratio = 0.2
n_test = int(len(x_data) * test_ratio)

# Shuffle the row positions with a seeded generator so that the split (and every
# error metric computed from it) is identical on each Restart & Run All
rng = np.random.default_rng(42)
indices = rng.permutation(len(x_data))
train_indices = indices[n_test:]
test_indices = indices[:n_test]

# Split by position (.iloc) so the result does not depend on the index labels
x_train, y_train = x_data.iloc[train_indices], y_data.iloc[train_indices]
x_test, y_test = x_data.iloc[test_indices], y_data.iloc[test_indices]

# Fit a polynomial of the desired degree on the training data only.
# (The previous spline variant called splrep/splev, which are never imported in this
# notebook, asked for a spline degree of 10, and left poly_func undefined for the plot.)
degree = 10  # Adjust degree as desired
coeffs = np.polyfit(x_train, y_train, degree)
poly_func = np.poly1d(coeffs)  # Callable polynomial built from the coefficients

# Predictions on both splits
y_train_pred = poly_func(x_train)
y_test_pred = poly_func(x_test)

# Plot training data, test data, and the polynomial fit.
# The curve is evaluated on sorted x values so it is drawn left to right as one line.
x_train_sorted = np.sort(x_train)
plt.scatter(x_train, y_train, color="lightblue", s=7, alpha=0.5, label="Training Data")
plt.scatter(x_test, y_test, color="pink", s=7, alpha=0.5, label="Test Data")
plt.plot(x_train_sorted, poly_func(x_train_sorted), color="blue",
         label=f'{degree} degree fit on Train Data', lw=2)

# Same axis labels as the other figures in this notebook
plt.xlabel('Temperature (F)')
plt.ylabel('Power demand (GW)')
plt.title(f"Polynomial Fit of Degree {degree} with Train-Test Split")
plt.legend()
plt.show()

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error (RMSE) between two sequences of values.

    Both inputs are converted to float NumPy arrays before subtracting, so the
    values are compared element by element in positional order. This makes the
    function work for plain lists and prevents pandas objects from being
    re-aligned by index label during the subtraction.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Element-wise squared differences
    squared_errors = (y_true - y_pred) ** 2

    # Square root of the mean squared difference
    return np.sqrt(np.mean(squared_errors))

In [None]:
# Degree of the polynomial fitted to the training split (tunable)
degree = 8

# Least-squares coefficients, wrapped in a callable polynomial
coeffs = np.polyfit(x_train, y_train, degree)
poly_func = np.poly1d(coeffs)

# Evaluate the fitted polynomial on both splits
y_train_pred, y_test_pred = poly_func(x_train), poly_func(x_test)

# Score the held-out predictions; note that the value stored in test_mse is an RMSE
test_mse = root_mean_squared_error(y_test, y_test_pred)
print("Root Mean Squared Error on Test Data:", test_mse)

In [None]:
# Polynomial degrees to evaluate: 4 through 15.
# These are the degrees that were actually being fitted before (range(12) with a
# "+4" offset inside the loop); the same values now also label the x-axis, which
# previously ran from 0 to 11 and was therefore shifted by 4.
degrees = range(4, 16)

# Test-set RMSE for each degree (the list keeps its historical "mse" name)
test_mse_list = []

for degree in degrees:
    # Fit a polynomial of the current degree on the training data
    coeffs = np.polyfit(x_train, y_train, degree)
    poly_func = np.poly1d(coeffs)

    # Predict on test data
    y_test_pred = poly_func(x_test)

    # Root mean squared error on the test data for this degree
    test_mse = root_mean_squared_error(y_test, y_test_pred)
    test_mse_list.append(test_mse)

# Plot RMSE vs. degree
plt.plot(degrees, test_mse_list, marker='o')
plt.xlabel("Polynomial Degree")
plt.ylabel("Root Mean Squared Error (Test Data)")
plt.title("RMSE on Test Data for Different Polynomial Degrees")
plt.show()