In [15]:
################################################################################
### Python port of help_functions.R
### https://github.com/cran/hdm/blob/master/R/help_functions.R
################################################################################

################################################################################
### 1: Load modules
################################################################################

# Standard Python modules
import numpy as np
from sklearn.linear_model import LinearRegression as lm

################################################################################
### 2: Define functions
################################################################################

################################################################################
### 2.1: Functions which are not in the original R package
###      These are generally helper functions to allow an implementation which
###      reads as closely to the original R code as possible, and to ease a
###      Python implementation
################################################################################


# Define a function which turns a list or vector-like object into a proper two
# dimensional column vector
def cvec(a):
    """ Coerce a list or vector-like object into a column-oriented 2-D array
    Input
    a: List or vector-like object, anything np.array() accepts
    Output
    vec: two dimensional NumPy array with at least as many rows as columns
         (so a vector-like input comes back as an n by 1 column vector)
    """
    # Promote the input to (at least) two dimensions; a flat input of length n
    # becomes a 1 by n array at this point
    arr = np.array(a, ndmin=2)
    n_rows, n_cols = arr.shape[:2]

    # A wide array (strictly more columns than rows) is transposed; anything
    # else is already column-oriented and is returned as is
    return arr.T if n_rows < n_cols else arr


# Define a function to mimic R's cor() function, which can take two matrices and
# return the correlation coefficients between the columns of the first and the
# columns of the second matrix
def cor(y, X):
    """ Correlate a single outcome column with every column of a matrix
    Inputs
    y: n by 1 NumPy array
    X: n by k NumPy array
    Outputs
    corr: NumPy array of length k, where the j-th element is the correlation
          coefficient between y and the j-th column of X
    """
    # Put y in front of X so that y is column zero of the stacked array
    stacked = np.concatenate((y, X), axis=1)

    # Full (k + 1) by (k + 1) correlation matrix, treating columns as variables
    corr_matrix = np.corrcoef(stacked, rowvar=False)

    # Row zero holds the correlations with y; dropping its first entry removes
    # the correlation of y with itself
    return corr_matrix[0, 1:]

In [16]:
################################################################################
### 2.2: Functions which are in the original R package
################################################################################


# Define a function which returns initial parameter guesses
def init_values(X, y, number=5, intercept=True):
    """ Return an initial parameter guess for a LASSO model

    Regresses y on the columns of X which are most correlated with it (in
    absolute value) and reports the fitted coefficients and residuals.

    Inputs
    X: n by k NumPy array, RHS variables
    y: n by 1 NumPy array (or vector-like object), outcome variable
    number: integer, maximum number of columns of X used in the regression
    intercept: boolean, whether the regression includes an intercept

    Outputs (returned as a dictionary)
    'residuals': n by 1 NumPy array, residuals of the initial regression
    'coefficients': k by 1 NumPy array, initial coefficient values; zero for
                    every column of X which was not used in the regression
    'index': NumPy array with the indices of the columns of X used in the
             regression, ordered from least to most correlated with y
    """
    # Make sure y is a proper column vector
    y = cvec(y)

    # Get the absolute value of correlations between y and X
    corr = np.abs(cor(y, X))

    # Columns without variation have an undefined (NaN) correlation. Since
    # .argsort() puts NaNs last, they would otherwise be picked first as the
    # "most correlated" columns; rank them below every valid correlation
    # instead (R's order() also puts missing values last)
    corr = np.where(np.isnan(corr), -np.inf, corr)

    # Get the number of columns of X
    kx = X.shape[1]

    # Make an index selecting the columns of X which are most correlated with
    # y (since .argsort() always sorts in increasing order, selecting from the
    # back gets the most highly correlated columns)
    index = corr.argsort()[-np.amin([number, kx]):]

    # Set up an array of coefficient guesses
    coefficients = np.zeros(shape=(kx, 1))

    # Regress y on the selected columns of X, including an intercept if desired
    X_selected = X[:, index]
    reg = lm(fit_intercept=intercept).fit(X_selected, y)

    # Replace the guesses for the estimated coefficients (note that .coef_ does
    # not return the estimated intercept, if one was included in the model)
    coefficients[index, :] = reg.coef_.T

    # Replace any NaNs with zeros
    coefficients[np.isnan(coefficients)] = 0

    # Get the regression residuals
    residuals = y - reg.predict(X_selected)

    # Return the residuals, coefficients, and selected column indices
    return {'residuals': residuals, 'coefficients': coefficients, 'index': index }

In [17]:
################################################################################
### Python port of LassoShooting.fit.R
### https://github.com/cran/hdm/blob/master/R/LassoShooting.fit.R
################################################################################

################################################################################
### 1: Load modules
################################################################################

# Standard Python modules
import numpy as np

# Other parts of hdmpy
# from hdmpy.help_functions import cvec, init_values

################################################################################
### 2: Define function
################################################################################

# Define shooting LASSO with variable dependent penalty terms
def LassoShooting_fit(x, y, lmbda, maxIter=1000, optTol=10**(-5),
                      zeroThreshold=10**(-6), XX=None, Xy=None,
                      beta_start=None):
    """ Shooting LASSO algorithm with variable dependent penalty weights

    Inputs
    x: n by p NumPy array, RHS variables
    y: n by 1 NumPy array (or vector-like object), outcome variable
    lmbda: p by 1 NumPy array (or vector-like object), variable dependent
           penalty terms. The j-th element is the penalty term for the j-th
           RHS variable.
    maxIter: integer, upper bound for the iteration counter (which starts at 1)
    optTol: scalar, algorithm terminates once the sum of absolute differences
            between the updated and current weights is below optTol
    zeroThreshold: scalar, final weights which are smaller than zeroThreshold
                   in absolute value are set to zero
    XX: p by p NumPy array, pre-calculated version of x'x
    Xy: p by 1 NumPy array (or vector-like object), pre-calculated version
        of x'y
    beta_start: p by 1 NumPy array (or vector-like object), initial weights.
                The input is copied and never modified.

    Outputs (returned as a dictionary)
    'coefficients': p by 1 NumPy array, final weights
    'coef.list': NumPy array with p rows. History of weight updates, starting
                 with the initial weights, with one further column per
                 completed pass over all variables.
    'num.it': integer, value of the iteration counter at termination
    """
    # Make sure that y and lmbda are proper column vectors
    y = cvec(y)
    lmbda = cvec(lmbda)

    # Get the number of variables p
    p = x.shape[1]

    # Check whether XX and Xy were provided, calculate them if not
    if XX is None:
        XX = x.T @ x
    if Xy is None:
        Xy = x.T @ y
    else:
        # A provided x'y may be flat, but is indexed as a column vector below
        Xy = cvec(Xy)

    # Check whether initial weights were provided
    if beta_start is None:
        # If not, use init_values from help_functions, which will return
        # regression estimates for the five variables in x which are most
        # correlated with y, and initialize all other coefficients as zero
        beta = init_values(x, y, intercept=False)['coefficients']
    else:
        # Otherwise, use a floating point column vector copy of the provided
        # weights. The weights are updated in place below, which must not
        # leak back into the caller's array (and must not be truncated if the
        # caller passed integers).
        beta = cvec(beta_start).astype(float)

    # Set up a history of weights over time, starting with the initial ones
    # (copies are stored, since beta keeps changing in place)
    history = [beta.copy()]

    # Keep track of the number of iterations
    m = 1

    # Create versions of XX and Xy which are just those matrices times two
    XX2 = XX * 2
    Xy2 = Xy * 2

    # Go through all iterations
    while m < maxIter:
        # Save the last set of weights (the .copy() is important, otherwise
        # beta_old will be updated every time beta is changed during the
        # following loop)
        beta_old = beta.copy()

        # Go through all parameters
        for j in range(p):
            # Calculate the shoot (as a scalar)
            S0 = (XX2[j, :] @ beta)[0] - XX2[j, j] * beta[j, 0] - Xy2[j, 0]

            # Penalty term for the current variable
            lam = lmbda[j, 0]

            # Update the weights
            if np.isnan(S0):
                beta[j, 0] = 0
            elif S0 > lam:
                beta[j, 0] = (lam - S0) / XX2[j, j]
            elif S0 < -lam:
                beta[j, 0] = (-lam - S0) / XX2[j, j]
            elif np.abs(S0) <= lam:
                beta[j, 0] = 0

        # Add the updated weights to the history of weights
        history.append(beta.copy())

        # Check whether the weights are within tolerance. NaN differences are
        # ignored (as sum(..., na.rm = TRUE) does in the R original), since a
        # NaN sum would never compare as below optTol.
        if np.nansum(np.abs(beta - beta_old)) < optTol:
            # If so, break the while loop
            break

        # Increase the iteration counter
        m = m + 1

    # Combine the history of weights into a single array, one column per entry
    wp = np.concatenate(history, axis=1)

    # Set the final weights to the last updated weights
    w = beta

    # Set weights which are within zeroThreshold to zero
    w[np.abs(w) < zeroThreshold] = 0

    # Return the weights, history of weights, and iteration counter
    return {'coefficients': w, 'coef.list': wp, 'num.it': m}

# Test 1 

In [36]:
import numpy as np
import random
import statsmodels.api as sm
# Set seeds. The data below are drawn with NumPy's random number generator,
# so seeding only Python's `random` module would leave them irreproducible.
random.seed(10)
np.random.seed(10)
print(random.random())

# Simulate a small problem with n observations and p RHS variables
n = 10
p = n//2
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1,n)

# Weights and penalty terms have one entry per RHS variable (p, not n), and
# penalty terms have to be non-negative
beta = np.random.normal(0, 1, p)
lmbda = np.abs(np.random.normal(0, 1, p))

0.5714025946899135


In [37]:
# Run the shooting LASSO on the simulated data from Test 1 and display the
# final weights
LassoShooting_fit(X, Y, lmbda)["coefficients"]

array([[ 0.77361731],
       [-0.95770003],
       [ 0.        ],
       [-0.93329414],
       [ 0.        ]])

# Test 2 

In [74]:
# Import relevant packages
import pandas as pd
import numpy as np
import pyreadr
# Read the .Rdata file and pull out the object stored under the key 'data'
rdata_read = pyreadr.read_r("../../data/wage2015_subsample_inference.Rdata")
data = rdata_read[ 'data' ]

# Display the dimensions of the data (rows, columns)
data.shape

(5150, 20)

In [75]:
# Outcome variable for Test 2: the 'lwage' column as a NumPy array
Y_2 = data['lwage'].to_numpy()

In [76]:
# Outcome variable for Test 2: the 'lwage' column as a NumPy array
Y_2 = data['lwage'].to_numpy()

# Number of observations of this data set (taken from Y_2, not from the
# simulated Y of Test 1)
n = len(Y_2)

# RHS variables: every column except 'wage', 'lwage' and 'Unnamed: 0'
X = data.loc[:, ~data.columns.isin(['wage', 'lwage','Unnamed: 0'])].to_numpy(dtype = float)

# Keep only the first five RHS variables for this test
X_2 = X[:, 0:5]

# One penalty term per column of X_2
list1 = [0.1, 0.2, 0.3, 0.4, 0.5]
lmbda_2 = np.array(list1)

X_2

array([[1., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [77]:
# Display the penalty terms used in Test 2 (lmbda_2; plain `lmbda` holds the
# penalty terms of Test 1)
lmbda_2

array([0.1, 0.2, 0.3, 0.4, 0.5])

In [78]:
# Run the shooting LASSO on the first five RHS variables of the wage data and
# display the final weights
LassoShooting_fit(X_2, Y_2, lmbda_2)["coefficients"]

array([[0.82252356],
       [2.43022504],
       [2.45580171],
       [2.47384323],
       [2.73381929]])

In [49]:
# Display which columns of X are picked for the initial regression. X holds
# the RHS variables of the wage data, so the matching outcome is Y_2 (Y is the
# simulated outcome of Test 1 and has a different number of observations).
init_values(X, Y_2, number=5, intercept=True)["index"]

array([ 3,  4,  2,  5, 15], dtype=int64)