# Penalized parameter estimation

In [1]:
import os
import pyreadr
import time
import datetime
import folium
import numpy as np
import pandas as pd
import geopandas as gpd
import scipy.sparse as sp
from scipy.linalg import solve
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
import matplotlib.colors as mcolors
import statsmodels.api as sm
from folium.plugins import MarkerCluster
from adjustText import adjust_text
import matplotlib.dates as mdates
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
starttime = pd.Timestamp.now()
#Import the cleaned data
data = pd.read_csv('PM10_BFF.csv')
Y = data.drop('Date', axis=1).values.T
data = pd.read_csv('PM10_BFF.csv')
dates = pd.to_datetime(data['Date'])
Y = data.drop('Date', axis=1).values.T
#Y = Y[:, :100]
#########################################
Time = Y.shape[1]
print('The number of hourly timepoints are',Time)
n = Y.shape[0]
print('The number of monitoring stations are',n)
print('The shape of the Y:', Y.shape) # Create an array to represent the time index
time_index = np.arange(Time)

#Create the sine and cosine components for temporal patterns with additional frequencies
sine_func_yearly = np.sin((2 * np.pi / (365 * 24))*time_index)
sine_func_semi_yearly = np.sin((2 * np.pi / (365 * 24/2))*time_index)
sine_func_monthly = np.sin((2 * np.pi / (365 * 24/12))*time_index)
sine_func_daily = np.sin((2 * np.pi / 24)*time_index)

########################################################################
cosine_func_yearly = np.cos((2 * np.pi / (365 * 24))* time_index)
cosine_func_semi_yearly = np.cos((2 * np.pi / (365 * 24/2))* time_index)
cosine_func_monthly = np.cos((2 * np.pi / (365 * 24/12))* time_index)
cosine_func_daily = np.cos((2 * np.pi / 24)* time_index)

######################################################################
sine_mean_func = (sine_func_yearly+sine_func_semi_yearly+sine_func_monthly+sine_func_daily)/5
cosine_mean_func =(cosine_func_yearly+cosine_func_semi_yearly+cosine_func_monthly+cosine_func_daily)/5
##############################################################################################
# Create a design matrix X with sine, cosine, and constant terms
#X = np.column_stack((np.ones(Time), sine_mean_func, cosine_mean_func))
X = np.column_stack((np.ones(Time),np.linspace(0, 1, Time), sine_func_yearly, sine_func_semi_yearly,sine_func_monthly,sine_func_daily,cosine_func_yearly,cosine_func_semi_yearly,cosine_func_monthly,cosine_func_daily)) # Single-station design matrix
X = np.tile(X[np.newaxis, :, :], (n, 1, 1)) #All stations
k = X.shape[2]
print('The shape of the design matrix X:', X.shape) #### dimension n * Time * 3
print(f"The value of k is {k} ")
########################################################################################
######################################################################################
# # Define the Log-Likelihood function
# def LL(parameters, Y, X, k,n, Time, lambda1, lambda2,lambda3):
#     beta = parameters[:k]
#     phi = parameters[k:k+n]
#     W = np.zeros((n, n))
#     W[np.triu_indices(n, 1)] = parameters[int(k+n):int(k+n+(0.5*n*(n-1)))]
#     W[np.tril_indices(n, -1)] = parameters[int(k+n+(0.5*n*(n-1))):int(k+(n*n))]
#     sigma2_eps = parameters[int(k+(n*n))]
    
#      # Calculate sum of squares
#     residuals_est = np.zeros(Time - 1)
#     for t in range(1, Time):
#         u_t = Y[:, t] - W @ Y[:, t] - (phi * Y[:, t-1]) - X[:, t] @ beta
#         residuals_est[t-1] = np.sum(u_t**2 / sigma2_eps)

#     sum_of_squares = np.sum(residuals_est)

#     # Calculate log-likelihood
#     Constant = -0.5 * (Time - 1) * (np.log(2 * np.pi) + np.sum(n * np.log(sigma2_eps))) + (Time - 1) * np.linalg.slogdet(np.eye(n) - W)[1]
#     LogLik = Constant - (0.5 * sum_of_squares) - (lambda1 * np.sum(np.abs(beta)) + lambda2 * np.sum(np.abs(phi))+ (lambda3 * np.sum(np.abs(W))))
#     return -LogLik
# #################################################################################################
# #################################################################################################
# def constraint_func(parameters):
#     W = np.zeros((n, n))
#     W[np.triu_indices(n, k=1)] = parameters[int(k+n):int(k+n+0.5*n*(n-1))]
#     W[np.tril_indices(n, k=-1)] = parameters[int(k+n+0.5*n*(n-1)):int(k+n+n*(n-1))]
#     row_sums = np.sum(W, axis=1)
#     return 1 - row_sums# Constraint: row_sums <= 1
# #################################################################################################
# #Define the bounds for the optimization
# lb = np.concatenate(([-np.inf]*k, [0]*n, [0]*(int(n*(n-1))), [0.00001]))
# ub = np.concatenate(([np.inf]*k, [1]*n, [1]*(int(n*(n-1))), [np.inf]))
# bounds = list(zip(lb, ub))
# # initialize the parameters
# param = np.concatenate((np.repeat(0, k), np.repeat(0.01, n), np.repeat(0.001, int(n*(n-1))), [1]))
# print("Full Parameter Estimation has started")
# starttime = pd.Timestamp.now()
# # Function to find the best lambda combination
# result = minimize(LL, param, args=(Y, X, k,n, Time, 0, 0,0), bounds=list(zip(lb, ub)))
# endtime = pd.Timestamp.now()
# print("Full Parameter Estimation has ended in ", endtime - starttime)

# # Extract the optimized parameter values
# parameters_opt = result.x
# np.savetxt('optimised_parameter_X_10.txt', parameters_opt)

The number of hourly timepoints are 136923
The number of monitoring stations are 26
The shape of the Y: (26, 136923)
The shape of the design matrix X: (26, 136923, 10)
The value of k is 10 


# Unpenalized parameter Estimation

In [2]:
import os
import pyreadr
import time
import datetime
import folium
import numpy as np
import pandas as pd
import geopandas as gpd
import scipy.sparse as sp
from scipy.linalg import solve, inv
from scipy.optimize import minimize,approx_fprime 
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
import matplotlib.colors as mcolors
import statsmodels.api as sm
from folium.plugins import MarkerCluster
from adjustText import adjust_text
import matplotlib.dates as mdates
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
starttime = pd.Timestamp.now()
#Import the cleaned data
data = pd.read_csv('PM10_BFF.csv')
Y = data.drop('Date', axis=1).values.T
data = pd.read_csv('PM10_BFF.csv')
dates = pd.to_datetime(data['Date'])
Y = data.drop('Date', axis=1).values.T
#Y = Y[:, :100]
#########################################
Time = Y.shape[1]
print('The number of hourly timepoints are',Time)
n = Y.shape[0]
print('The number of monitoring stations are',n)
print('The shape of the Y:', Y.shape) # Create an array to represent the time index
time_index = np.arange(Time)

#Create the sine and cosine components for temporal patterns with additional frequencies
sine_func_yearly = np.sin((2 * np.pi / (365 * 24))*time_index)
sine_func_semi_yearly = np.sin((2 * np.pi / (365 * 24/2))*time_index)
sine_func_monthly = np.sin((2 * np.pi / (365 * 24/12))*time_index)
sine_func_daily = np.sin((2 * np.pi / 24)*time_index)

########################################################################
cosine_func_yearly = np.cos((2 * np.pi / (365 * 24))* time_index)
cosine_func_semi_yearly = np.cos((2 * np.pi / (365 * 24/2))* time_index)
cosine_func_monthly = np.cos((2 * np.pi / (365 * 24/12))* time_index)
cosine_func_daily = np.cos((2 * np.pi / 24)* time_index)

######################################################################
sine_mean_func = (sine_func_yearly+sine_func_semi_yearly+sine_func_monthly+sine_func_daily)/5
cosine_mean_func =(cosine_func_yearly+cosine_func_semi_yearly+cosine_func_monthly+cosine_func_daily)/5
##############################################################################################
# Create a design matrix X with sine, cosine, and constant terms
#X = np.column_stack((np.ones(Time), sine_mean_func, cosine_mean_func))
X = np.column_stack((np.ones(Time),np.linspace(0, 1, Time), sine_func_yearly, sine_func_semi_yearly,sine_func_monthly,sine_func_daily,cosine_func_yearly,cosine_func_semi_yearly,cosine_func_monthly,cosine_func_daily)) # Single-station design matrix
X = np.tile(X[np.newaxis, :, :], (n, 1, 1)) #All stations
k = X.shape[2]
print('The shape of the design matrix X:', X.shape) #### dimension n * Time * 3
print(f"The value of k is {k} ")

# Define Log-Likelihood function without penalization
def LL_unpenalized(parameters, Y, X,k,n, active_indices, Time):
    full_params = np.zeros(k + n + n*(n-1) + 1)
    full_params[active_indices] = parameters  # Restore only active parameters
    beta = full_params[:k]
    phi = full_params[k:k+n]
    W = np.zeros((n, n))
    W[np.triu_indices(n, 1)] = full_params[int(k+n):int(k+n+(0.5*n*(n-1)))]
    W[np.tril_indices(n, -1)] = full_params[int(k+n+(0.5*n*(n-1))):int(k+(n*n))]
    sigma2_eps = full_params[-1]
    
    residuals_est = np.zeros(Time - 1)
    for t in range(1, Time):
        u_t = Y[:, t] - W @ Y[:, t] - (phi * Y[:, t-1]) - X[:, t] @ beta
        residuals_est[t-1] = np.sum(u_t**2 / sigma2_eps)
    
    sum_of_squares = np.sum(residuals_est)
    # Calculate log-likelihood
    Constant = -0.5 * (Time - 1) * (np.log(2 * np.pi) + np.sum(n * np.log(sigma2_eps))) + (Time - 1) * np.linalg.slogdet(np.eye(n) - W)[1]
    LogLik = Constant - (0.5 * sum_of_squares)
    return -LogLik  # Negative for minimization

# Load LASSO results
parameters_opt = np.loadtxt('optimised_parameter_X_10.txt')
zero_indices = np.where(parameters_opt == 0)[0]
active_indices = np.where(parameters_opt != 0)[0]

# Optimize only for nonzero parameters
lb = np.concatenate(([-np.inf]*k, [0]*n, [0]*(int(n*(n-1))), [0.00001]))
ub = np.concatenate(([np.inf]*k, [1]*n, [1]*(int(n*(n-1))), [np.inf]))
bounds = list(zip(lb, ub))
active_bounds = [bounds[i] for i in active_indices] # Select bounds for only active parameters
init_params = parameters_opt[active_indices]
print("Parameter Estimation for the unpenalized model has begun")
#result = minimize(LL_unpenalized, init_params, args=(Y, X,k,n, active_indices, Time), bounds=active_bounds)
result = minimize(LL_unpenalized, init_params, args=(Y, X, k, n, active_indices,Time), method='SLSQP', bounds=active_bounds)#, constraints={'type': 'ineq', 'fun': lambda x: constraint_func(x, n, k)})
final_params = np.zeros_like(parameters_opt)
final_params[active_indices] = result.x
print("Parameter Estimation for the unpenalized model has ended")
np.savetxt('unpenalized_parameters.txt', final_params)

# Convert hess_inv to a dense matrix
if isinstance(result.hess_inv, np.ndarray):
    inv_hess = result.hess_inv  # Use directly if already a NumPy array
else:
    inv_hess = result.hess_inv.todense()  # Convert L-BFGS inverse Hessian to a full matrix

# Compute standard errors
std_errors = np.sqrt(np.diag(inv_hess))

# Assign standard errors only to active parameters
final_std_errors = np.zeros_like(parameters_opt)
final_std_errors[active_indices] = std_errors

# Save results as a DataFrame
df_results = pd.DataFrame({'Parameter': final_params, 'Standard Error': final_std_errors})
df_results.to_csv('parameter_standard_errors.csv', index=False)

print("Optimization complete. Parameters and standard errors saved as a table.")

The number of hourly timepoints are 136923
The number of monitoring stations are 26
The shape of the Y: (26, 136923)
The shape of the design matrix X: (26, 136923, 10)
The value of k is 10 
Parameter Estimation for the unpenalized model has begun
Parameter Estimation for the unpenalized model has ended


AttributeError: hess_inv

In [None]:
import os
import pyreadr
import time
import datetime
import folium
import numpy as np
import pandas as pd
import geopandas as gpd
import scipy.sparse as sp
from scipy.linalg import solve, inv
from scipy.optimize import minimize,approx_fprime 
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
import matplotlib.colors as mcolors
import statsmodels.api as sm
from folium.plugins import MarkerCluster
from adjustText import adjust_text
import matplotlib.dates as mdates
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
starttime = pd.Timestamp.now()
#Import the cleaned data
data = pd.read_csv('PM10_BFF.csv')
Y = data.drop('Date', axis=1).values.T
data = pd.read_csv('PM10_BFF.csv')
dates = pd.to_datetime(data['Date'])
Y = data.drop('Date', axis=1).values.T
#Y = Y[:, :100]
#########################################
Time = Y.shape[1]
print('The number of hourly timepoints are',Time)
n = Y.shape[0]
print('The number of monitoring stations are',n)
print('The shape of the Y:', Y.shape) # Create an array to represent the time index
time_index = np.arange(Time)

#Create the sine and cosine components for temporal patterns with additional frequencies
sine_func_yearly = np.sin((2 * np.pi / (365 * 24))*time_index)
sine_func_semi_yearly = np.sin((2 * np.pi / (365 * 24/2))*time_index)
sine_func_monthly = np.sin((2 * np.pi / (365 * 24/12))*time_index)
sine_func_daily = np.sin((2 * np.pi / 24)*time_index)

########################################################################
cosine_func_yearly = np.cos((2 * np.pi / (365 * 24))* time_index)
cosine_func_semi_yearly = np.cos((2 * np.pi / (365 * 24/2))* time_index)
cosine_func_monthly = np.cos((2 * np.pi / (365 * 24/12))* time_index)
cosine_func_daily = np.cos((2 * np.pi / 24)* time_index)

######################################################################
sine_mean_func = (sine_func_yearly+sine_func_semi_yearly+sine_func_monthly+sine_func_daily)/5
cosine_mean_func =(cosine_func_yearly+cosine_func_semi_yearly+cosine_func_monthly+cosine_func_daily)/5
##############################################################################################
# Create a design matrix X with sine, cosine, and constant terms
#X = np.column_stack((np.ones(Time), sine_mean_func, cosine_mean_func))
X = np.column_stack((np.ones(Time),np.linspace(0, 1, Time), sine_func_yearly, sine_func_semi_yearly,sine_func_monthly,sine_func_daily,cosine_func_yearly,cosine_func_semi_yearly,cosine_func_monthly,cosine_func_daily)) # Single-station design matrix
X = np.tile(X[np.newaxis, :, :], (n, 1, 1)) #All stations
k = X.shape[2]
print('The shape of the design matrix X:', X.shape) #### dimension n * Time * 3
print(f"The value of k is {k} ")

# Define Log-Likelihood function without penalization
def LL_unpenalized(parameters, Y, X,k,n, active_indices, Time):
    full_params = np.zeros(k + n + n*(n-1) + 1)
    full_params[active_indices] = parameters  # Restore only active parameters
    beta = full_params[:k]
    phi = full_params[k:k+n]
    W = np.zeros((n, n))
    W[np.triu_indices(n, 1)] = full_params[int(k+n):int(k+n+(0.5*n*(n-1)))]
    W[np.tril_indices(n, -1)] = full_params[int(k+n+(0.5*n*(n-1))):int(k+(n*n))]
    sigma2_eps = full_params[-1]
    
    residuals_est = np.zeros(Time - 1)
    for t in range(1, Time):
        u_t = Y[:, t] - W @ Y[:, t] - (phi * Y[:, t-1]) - X[:, t] @ beta
        residuals_est[t-1] = np.sum(u_t**2 / sigma2_eps)
    
    sum_of_squares = np.sum(residuals_est)
    # Calculate log-likelihood
    Constant = -0.5 * (Time - 1) * (np.log(2 * np.pi) + np.sum(n * np.log(sigma2_eps))) + (Time - 1) * np.linalg.slogdet(np.eye(n) - W)[1]
    LogLik = Constant - (0.5 * sum_of_squares)
    return -LogLik  # Negative for minimization

# Load LASSO results
parameters_opt = np.loadtxt('AICoptimised_parameter.txt')
zero_indices = np.where(parameters_opt == 0)[0]
active_indices = np.where(parameters_opt != 0)[0]

# Optimize only for nonzero parameters
lb = np.concatenate(([-np.inf]*k, [0]*n, [0]*(int(n*(n-1))), [0.00001]))
ub = np.concatenate(([np.inf]*k, [1]*n, [1]*(int(n*(n-1))), [np.inf]))
bounds = list(zip(lb, ub))
active_bounds = [bounds[i] for i in active_indices] # Select bounds for only active parameters
init_params = parameters_opt[active_indices]
print("Parameter Estimation for the unpenalized model has begun")
#result = minimize(LL_unpenalized, init_params, args=(Y, X,k,n, active_indices, Time), bounds=active_bounds)
result = minimize(LL_unpenalized, init_params, args=(Y, X, k, n, active_indices,Time), method='SLSQP', bounds=active_bounds)#, constraints={'type': 'ineq', 'fun': lambda x: constraint_func(x, n, k)})
final_params = np.zeros_like(parameters_opt)
final_params[active_indices] = result.x
print("Parameter Estimation for the unpenalized model has ended")
np.savetxt('unpenalized_parameters.txt', final_params)

# Compute numerical Hessian using finite differences
def compute_numerical_hessian(func, params, epsilon=1e-5, *args):
    n = len(params)
    hessian = np.zeros((n, n))
    perturb = np.eye(n) * epsilon
    
    for i in range(n):
        for j in range(n):
            f_pp = func(params + perturb[i] + perturb[j], *args)  # f(x_i + ε, x_j + ε)
            f_pm = func(params + perturb[i] - perturb[j], *args)  # f(x_i + ε, x_j - ε)
            f_mp = func(params - perturb[i] + perturb[j], *args)  # f(x_i - ε, x_j + ε)
            f_mm = func(params - perturb[i] - perturb[j], *args)  # f(x_i - ε, x_j - ε)
            
            hessian[i, j] = (f_pp - f_pm - f_mp + f_mm) / (4 * epsilon**2)

    return hessian
# Compute Hessian numerically at optimized parameters
hessian = compute_numerical_hessian(LL_unpenalized, result.x, 1e-5, Y, X, k, n, active_indices, Time)

# Compute inverse Hessian (use pseudo-inverse in case of singularity)
inv_hessian = pinv(hessian)

# Compute standard errors
std_errors = np.sqrt(np.diag(inv_hessian))

# Compute z-values
z_values = result.x / std_errors

# Store results in a DataFrame
df_results = pd.DataFrame({'Parameter': result.x, 'Standard Error': std_errors, 'Z-Value': z_values})

# Save to CSV
df_results.to_csv('parameter_standard_errors.csv', index=False)

print("Optimization complete. Parameters, standard errors, and z-values saved as a table.")

The number of hourly timepoints are 136923
The number of monitoring stations are 26
The shape of the Y: (26, 136923)
The shape of the design matrix X: (26, 136923, 10)
The value of k is 10 
Parameter Estimation for the unpenalized model has begun
