In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Show every row when a DataFrame is displayed instead of truncating long output.
pd.set_option('display.max_rows', None)

In [None]:
# utility functions
def process_res_df(df):
    """Add derived comparison columns to a benchmark-results frame.

    The frame is modified in place and also returned, so the usual
    ``df = process_res_df(df)`` pattern keeps working.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``runtime.glmnet``, ``runtime.ncvreg``,
        ``runtime.lasso``, ``n``, ``p`` and ``p0``.

    Returns
    -------
    pandas.DataFrame
        The same object with these columns added:

        * ``fastest_alg`` -- ``'glmnet'``, ``'ncvreg'`` or ``'lasso'``,
          whichever has the smallest runtime in the row (ties go to the
          first in that order).  Missing (NaN) runtimes are never picked;
          a row whose three runtimes are all missing gets ``None``.
        * ``lasso_glmnet_runtime_ratio`` -- ``runtime.lasso / runtime.glmnet``.
        * ``n_p_ratio`` -- ``n / p``.
        * ``p0_p_ratio`` -- ``p0 / p``.
        * ``runtime_acceptable`` -- True when the runtime ratio is below 1.5
          or ``runtime.lasso`` is below 2.
    """
    runtime_cols = ['runtime.glmnet', 'runtime.ncvreg', 'runtime.lasso']
    # Labels in the same order as ``runtime_cols`` (and as in ``fastest_alg``).
    alg_names = np.array(['glmnet', 'ncvreg', 'lasso'], dtype=object)

    # Vectorised arg-min over the three runtime columns instead of a Python
    # lambda applied row by row.
    runtimes = df[runtime_cols].to_numpy(dtype=float)
    missing = np.isnan(runtimes)
    # Treat a missing runtime as +inf so it can never be reported as fastest.
    fastest = alg_names[np.where(missing, np.inf, runtimes).argmin(axis=1)]
    # No runtime available at all: there is no meaningful winner.
    fastest[missing.all(axis=1)] = None
    df['fastest_alg'] = fastest

    df['lasso_glmnet_runtime_ratio'] = df['runtime.lasso'] / df['runtime.glmnet']
    df['n_p_ratio'] = df['n'] / df['p']
    df['p0_p_ratio'] = df['p0'] / df['p']
    df['runtime_acceptable'] = (df['lasso_glmnet_runtime_ratio'] < 1.5) | (df['runtime.lasso'] < 2)
    return df
    
def fastest_alg(t_triple):
    """Name the algorithm with the smallest entry in ``t_triple``.

    ``t_triple`` is an iterable of three runtimes ordered as
    (glmnet, ncvreg, lasso).  On ties the earliest position wins.
    """
    names = ('glmnet', 'ncvreg', 'lasso')
    times = list(t_triple)
    winner = times.index(min(times))
    return names[winner]

def analyze_measure_by_factor(df, measure, factor, plot=True):
    """Print the per-group mean and standard deviation of ``measure`` by ``factor``.

    Both ``measure`` and ``factor`` must be columns of ``df`` (checked with an
    ``assert``).  When ``plot`` is true, a histogram of ``measure`` coloured by
    ``factor`` is shown as well; if the measure's name contains ``"ratio"`` a
    red vertical line is drawn at 1.
    """
    assert measure in df.columns and factor in df.columns

    by_factor = df.groupby([factor])[measure]
    print(f"Mean of {measure} by {factor}: \n{by_factor.mean()}")
    print(f"\nStd of {measure} by {factor}: \n{by_factor.std()}")

    if not plot:
        return

    ax = sns.histplot(data=df, x=measure, hue=factor, palette="Set2")
    if "ratio" in measure:
        ax.axvline(1, color="red")
    plt.show()

# Tests 10

In [None]:
# Load the tests10 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests10.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
df

In [None]:
# Histogram of the lasso/glmnet runtime ratio; the red line marks a ratio of 1.
ax = sns.histplot(df, x="lasso_glmnet_runtime_ratio", color="gray")
ax.axvline(x=1, color="red")
plt.show()

In [None]:
# Summarise `runtime_acceptable` against each experimental factor (no plots).
for factor in ("snr", "n_p_ratio", "p0_p_ratio"):
    analyze_measure_by_factor(df, "runtime_acceptable", factor, plot=False)

In [None]:
# Runtime ratio first, then the lasso objective, each grouped by snr and n/p (no plots).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("snr", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor, plot=False)

In [None]:

# Count rows whose objective-difference columns exceed the threshold.
obj_threshold = 1e-4
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold

print(len(df))
print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

In [None]:

# Same counts as with the 1e-4 tolerance, but with a strict threshold of 0.
obj_threshold = 0
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold

print(len(df))
print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

In [None]:

# Restrict to rows whose `obj.lasso-obj.glmnet` value is strictly positive.
obj_threshold = 0
temp_df = df.loc[df['obj.lasso-obj.glmnet'] > obj_threshold]

print(len(temp_df))
print(Counter(temp_df['fastest_alg']))
print(f"Max difference: {max(temp_df['obj.lasso-obj.glmnet'])}")
# Of those rows, display the ones where glmnet had the smallest runtime.
temp_df.loc[temp_df['fastest_alg'] == 'glmnet']

# Tests 6

- tol = 1e-4
- run on the server, using the package built for the server
- The seed is set to 5451. The goal is to check whether the results are similar to those in Test 5.

In [None]:
# Load the tests6 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests6.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
# The full frame is intentionally not displayed in this cell.

In [None]:
# Histogram of the lasso/glmnet runtime ratio; the red line marks a ratio of 1.
ax = sns.histplot(df, x="lasso_glmnet_runtime_ratio", color="gray")
ax.axvline(x=1, color="red")
plt.show()

In [None]:
# Summarise `runtime_acceptable` against each experimental factor (no plots).
for factor in ("snr", "n_p_ratio"):
    analyze_measure_by_factor(df, "runtime_acceptable", factor, plot=False)

In [None]:
# Runtime ratio first, then the lasso objective, each grouped by snr and n/p (no plots).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("snr", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor, plot=False)

In [None]:

# Count rows whose objective-difference columns exceed the threshold.
obj_threshold = 1e-4
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold

print(len(df))
print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

In [None]:

# Same counts as with the 1e-4 tolerance, but with a strict threshold of 0.
obj_threshold = 0
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold

print(len(df))
print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

In [None]:

# Repeat the threshold counts on the subset of rows with n_p_ratio >= 1.
obj_threshold = 0
temp_df = df[df['n_p_ratio'] >= 1]
above_glmnet = temp_df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = temp_df['obj.lasso-obj.ncvreg'] > obj_threshold

print(len(temp_df))
print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

# Tests 5

- tol = 1e-4
- run on the server, using the package built for the server

In [None]:
# Load the tests5 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests5.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
df

In [None]:
# Histogram of the lasso/glmnet runtime ratio; the red line marks a ratio of 1.
ax = sns.histplot(df, x="lasso_glmnet_runtime_ratio", color="gray")
ax.axvline(x=1, color="red")
plt.show()

In [None]:
# Summarise `runtime_acceptable` against each experimental factor (no plots).
for factor in ("snr", "n_p_ratio"):
    analyze_measure_by_factor(df, "runtime_acceptable", factor, plot=False)

In [None]:
# Runtime ratio first, then the lasso objective, each grouped by snr and n/p (no plots).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("snr", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor, plot=False)

In [None]:

# Count rows whose objective-difference columns exceed the threshold.
obj_threshold = 1e-4
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold

print(f"Num of obj.lasso - obj.glmnet > threshold: {above_glmnet.sum()}")
print(f"Num of obj.lasso - obj.ncvreg > threshold: {above_ncvreg.sum()}")
# Joint count: both differences must exceed the threshold. The previous version
# AND-ed the ncvreg condition with itself and so merely repeated the ncvreg count.
print(f"Num of obj.lasso - obj.ncvreg > threshold && obj.lasso - obj.glmnet > threshold: {(above_ncvreg & above_glmnet).sum()}")

# Tests 3

- tol = 1e-4
- run on the server
- Note that the parameter settings are a bit different from those in test 2 and test 4.

In [None]:
# Load the tests3 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests3.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
# The full frame is intentionally not displayed in this cell.

In [None]:
# Histogram of the lasso/glmnet runtime ratio; the red line marks a ratio of 1.
ax = sns.histplot(df, x="lasso_glmnet_runtime_ratio", color="gray")
ax.axvline(x=1, color="red")
plt.show()

In [None]:
# This file names the signal-to-noise column `sig2noise.ratio`.
# Plots are enabled here (the function's default).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("sig2noise.ratio", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor)

In [None]:
obj_threshold = 1e-4
# Filter on the glmnet difference only; the ncvreg difference is not part of this filter.
df_obj = df.loc[df['obj.lasso-obj.glmnet'] > obj_threshold]
print(len(df_obj))
df_obj

## Tests 2

- tol = 1e-5
- run on Yu's MacBook
- Since the tolerance is much smaller, lasso runs slower, and the accuracy is much higher, with only one case slightly worse than glmnet. 
- So for this test result, our major focus is how the two measures change along with the factors.
- Note that there is a slight change in variable names.

In [None]:
# Load the tests2 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests2.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
df

In [None]:
# Runtime ratio first, then the lasso objective, each grouped by snr and n/p (with plots).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("snr", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor)

In [None]:
# Keep rows where either objective-difference column is strictly positive.
obj_threshold = 0
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold
df_obj = df.loc[above_glmnet | above_ncvreg]
print(len(df_obj))
df_obj

## Tests 4

- tol = 1e-5
- run on the DAGS server
- Since the tolerance is much smaller, lasso runs slower, and the accuracy is much higher, with only one case slightly worse than glmnet. 
- So for this test result, our major focus is how the two measures change along with the factors.

In [None]:
# Load the tests4 results and attach the derived comparison columns.
df = process_res_df(pd.read_csv("tests4.csv"))
# Tally of which algorithm had the smallest runtime in each row.
print(Counter(df['fastest_alg']))
df

In [None]:
# Runtime ratio first, then the lasso objective, each grouped by snr and n/p (with plots).
for measure in ("lasso_glmnet_runtime_ratio", "obj.lasso"):
    for factor in ("snr", "n_p_ratio"):
        analyze_measure_by_factor(df, measure, factor)

In [None]:
# Keep rows where either objective-difference column is strictly positive.
obj_threshold = 0
above_glmnet = df['obj.lasso-obj.glmnet'] > obj_threshold
above_ncvreg = df['obj.lasso-obj.ncvreg'] > obj_threshold
df_obj = df.loc[above_glmnet | above_ncvreg]
print(len(df_obj))
df_obj