Part 1

In [80]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import kagglehub

from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

In [81]:
# ============================================================
# Load dataset
# ============================================================

DATA_PATH1 = "FuelEconomy.csv"
df_fuel = pd.read_csv(DATA_PATH1)

print("Shape:", df_fuel.shape)
print("\nColumns:")
print(df_fuel.columns.tolist())

display(df_fuel.head())

print("\nSummary statistics:")
display(df_fuel.describe(include="all"))

print("\nMissing values per column:")
display(df_fuel.isna().sum())

Shape: (100, 2)

Columns:
['Horse Power', 'Fuel Economy (MPG)']


Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739



Summary statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0



Missing values per column:


Unnamed: 0,0
Horse Power,0
Fuel Economy (MPG),0


In [82]:
def prepare_xy(df_in, in_col, out_col):
  # Create a clean DataFrame by dropping rows with any missing values
  df_clean = df_in.dropna().copy()
  X = df_clean[[out_col]]
  y = df_clean[in_col]
  return X, y

def split_data(X, y, test_size=0.3, random_state=42):
    # Split the data into training and testing sets
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def compute_metrics(y_true, y_pred):
    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_pred)
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)
    # Calculate R-squared (R^2)
    r2 = r2_score(y_true, y_pred)
    return {"MSE": mse, "MAE": mae, "R^2": r2}

def run_models_and_evaluate(df_in, in_col, out_col, degrees=(1,2,3,4),
                            test_size=0.3, random_state=42, top_k_terms=15):
  # Prepare features (X) and target (y) for modeling
  X, y = prepare_xy(df_in, in_col, out_col)
  # Split data into training and testing sets
  X_train, X_test, y_train, y_test = split_data(X, y, test_size, random_state)

  rows = []

  # Iterate through different polynomial degrees for model training
  for deg in degrees:
    if deg == 1:
      # For degree 1, use simple Linear Regression
      model = LinearRegression()
      model_name = "Linear Regression"
    else:
      # For higher degrees, use Polynomial Features within a Pipeline
      model = Pipeline([
          ('poly', PolynomialFeatures(degree=deg, include_bias=False)), # Create polynomial features
          ('linear', LinearRegression()) # Apply Linear Regression to polynomial features
      ])
      model_name = f"Polynomial Regression (degree={deg})"

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on training and test sets
    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    # Compute evaluation metrics for training and test predictions
    train_m = compute_metrics(y_train, yhat_train)
    test_m = compute_metrics(y_test, yhat_test)

    # Store results for the current model
    rows.append({
            "Model": model_name,
            "Train MSE": train_m["MSE"],
            "Train MAE": train_m["MAE"],
            "Train R^2": train_m["R^2"],
            "Test MSE": test_m["MSE"],
            "Test MAE": test_m["MAE"],
            "Test R^2": test_m["R^2"],
            "Train size": len(X_train),
            "Test size": len(X_test),
        })

  # Convert the list of results to a DataFrame and return
  return pd.DataFrame(rows)


In [83]:
results_A = run_models_and_evaluate(df_fuel, 'Fuel Economy (MPG)', 'Horse Power')
display(results_A)

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2,Train size,Test size
0,Linear Regression,2.115741,1.209978,0.90632,1.67495,1.031271,0.913315,70,30
1,Polynomial Regression (degree=2),2.11507,1.210303,0.90635,1.657031,1.025411,0.914243,70,30
2,Polynomial Regression (degree=3),2.06055,1.211527,0.908764,1.903743,1.087196,0.901475,70,30
3,Polynomial Regression (degree=4),1.917714,1.168259,0.915088,2.54846,1.203406,0.868108,70,30




*   The second degree polynomial regression performs the best in the test set because it has the highest $R^2$ value of 0.914243.

*   In this case, the second degree polynomial regression performs better than the linear regression since the $R^2$ value of 0.914243 < 0.913315, but the third degree performs worse than the linear regression, and the fourth degree performs the worst.

*   One reason the model performs poorly is because it's hard to accurately predict the horse power of a car only using the fuel efficiency. Another reason is due to overfitting, which can be seen as the $R^2$ value for the train set is much higher than the test set in the fourth degree regression. Higher degree regressions are more susceptible to overfitting since they are more flexible.


Part 2

In [84]:
# ============================================================
# Load dataset
# ============================================================

DATA_PATH2 = "electricity_consumption_based_weather_dataset.csv"
df_weather = pd.read_csv(DATA_PATH2)

print("Shape:", df_weather.shape)
print("\nColumns:")
print(df_weather.columns.tolist())

display(df_weather.head())

print("\nSummary statistics:")
display(df_weather.describe(include="all"))

print("\nMissing values per column:")
display(df_weather.isna().sum())

Shape: (1433, 6)

Columns:
['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption']


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
0,2006-12-16,2.5,0.0,10.6,5.0,1209.176
1,2006-12-17,2.6,0.0,13.3,5.6,3390.46
2,2006-12-18,2.4,0.0,15.0,6.7,2203.826
3,2006-12-19,2.4,0.0,7.2,2.2,1666.194
4,2006-12-20,2.4,0.0,7.2,1.1,2225.748



Summary statistics:


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
count,1433,1418.0,1433.0,1433.0,1433.0,1433.0
unique,1433,,,,,
top,2010-11-26,,,,,
freq,1,,,,,
mean,,2.642313,3.800488,17.187509,9.141242,1561.078061
std,,1.140021,10.973436,10.136415,9.028417,606.819667
min,,0.0,0.0,-8.9,-14.4,14.218
25%,,1.8,0.0,8.9,2.2,1165.7
50%,,2.4,0.0,17.8,9.4,1542.65
75%,,3.3,1.3,26.1,17.2,1893.608



Missing values per column:


Unnamed: 0,0
date,0
AWND,15
PRCP,0
TMAX,0
TMIN,0
daily_consumption,0


In [85]:
def prepare_xy2(df_in, target_col, exclude_col):
  # Create a clean DataFrame by dropping rows with any missing values
  df_clean = df_in.dropna().copy()
  # Extract features (X) by dropping the target and specified exclusion columns
  X = df_clean.drop(columns=[target_col, exclude_col])
  # Extract target variable (y)
  y = df_clean[target_col]
  return X, y

def run_models_and_evaluate2(df_in, target_col, exclude_col, degrees=(1,2,3,4),
                            test_size=0.3, random_state=42, top_k_terms=15):
  # Prepare features (X) and target (y)
  X, y = prepare_xy2(df_in, target_col, exclude_col)
  # Split data into training and testing sets
  X_train, X_test, y_train, y_test = split_data(X, y, test_size, random_state)

  rows = []

  # Iterate through different polynomial degrees for model training
  for deg in degrees:
    if deg == 1:
      # For degree 1, use simple Linear Regression
      model = LinearRegression()
      model_name = "Linear Regression"
    else:
      # For higher degrees, use Polynomial Features within a Pipeline
      model = Pipeline([
          ('poly', PolynomialFeatures(degree=deg, include_bias=False)), # Create polynomial features
          ('linear', LinearRegression()) # Apply Linear Regression to polynomial features
      ])
      model_name = f"Polynomial Regression (degree={deg})"

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on training and test sets
    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    # Compute evaluation metrics for training and test predictions
    train_m = compute_metrics(y_train, yhat_train)
    test_m = compute_metrics(y_test, yhat_test)

    # Store results for the current model
    rows.append({
            "Model": model_name,
            "Train MSE": train_m["MSE"],
            "Train MAE": train_m["MAE"],
            "Train R^2": train_m["R^2"],
            "Test MSE": test_m["MSE"],
            "Test MAE": test_m["MAE"],
            "Test R^2": test_m["R^2"],
            "Train size": len(X_train),
            "Test size": len(X_test),
        })

  # Convert the list of results to a DataFrame and return
  return pd.DataFrame(rows)

In [86]:
# Exclude the date column
results_B = run_models_and_evaluate2(df_weather, 'daily_consumption', 'date')
display(results_B)

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2,Train size,Test size
0,Linear Regression,272403.396174,384.465016,0.276,248125.8,375.404537,0.299333,992,426
1,Polynomial Regression (degree=2),264765.769932,379.648753,0.2963,255268.5,379.039083,0.279163,992,426
2,Polynomial Regression (degree=3),259249.53487,375.952901,0.310961,265623.7,385.235167,0.249922,992,426
3,Polynomial Regression (degree=4),251909.339001,372.116566,0.33047,12151490.0,578.642201,-33.313844,992,426




*   The linear regression model had the best performance in the test set ($R^2$ = 0.299333), which indicates the relationship between the weather and electricity consumption is weak since the $R^2$ value was so low, and weather alone is a poor predictor of electricity consumption.
*   Increasing the polynomial degree doesn't improve the test performance. It makes the performance worse, and at fourth degree the performance is extremely bad.
*   The higher degree polynomial regressions perform worse due to overfitting. The train $R^2$ value improves as the degree increases, but the test $R^2$ decreases as the degree increases, with fourth degree crashing to -33.313844. The train MSE decreases as the degree increases, while the test MSE increases, which shows the model was too complex.
*   None of the models performed good performance. One reason is that the weather data included in the dataset is too limited and is missing important pieces of information such as what season it is. Another reason is that there are other factors that contribute to electricity consumption that isn't accounted for just by the weather data such as building occupancy and daily activity.
