## CMSC 35300 Final Project: Lasso Models
Shweta Kamath <br>
Nivedita Vatsa <br>
Carolyn Vilter

#### Setup

Source: https://gist.github.com/agramfort/ac52a57dc6551138e89b

In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pltb
import seaborn as sns
from itertools import compress

In [187]:
import time
from math import sqrt
from scipy import linalg # used to calculate norm

In [188]:
import scipy.io as sio

In [189]:
# Import data
df = pd.read_csv("data/all_data_standardized.csv")

In [190]:
# Separate out Xs
X = df.loc[:, ~df.columns.isin(["child_id", "mother_id", "treat_alike_scale", "treat_alike_binary"])]
X = X.to_numpy()

# add columns of 1s
A = np.hstack((np.ones((len(X), 1)), X))
A_names = df.columns[(~df.columns.isin(["child_id", "mother_id", "treat_alike_scale", "treat_alike_binary"]))]

# Separate out two prospective ys
y_scale = df.loc[:, df.columns == "treat_alike_scale"]
y_scale = y_scale.to_numpy()

y_binary = df.loc[:, df.columns == "treat_alike_binary"]
y_binary = y_binary.to_numpy()

In [191]:
# matlab_data_file = sio.loadmat('face_emotion_data.mat')
# X = matlab_data_file['X']
# y_scale = matlab_data_file['y']

In [192]:
# # load data, make sure ‘fisheriris.mat‘ is in your working directory
# data = sio.loadmat("fisheriris.mat")

# # training data
# X = data['meas']
# y_scale = data['species']

In [193]:
df

Unnamed: 0,child_id,mother_id,treat_alike_scale,treat_alike_binary,year_at_y,gap_at_y,self_worth,yob_child,father_present,religion_freq,...,highest_expected_grade_GET MORE THAN 4 YEARS OF COLLEGE,highest_expected_grade_SOMETHING ELSE,mother_race_BLACK,"mother_race_NON-BLACK, NON-HISPANIC",mother_relig_raised_protestant,mother_relig_raised_catholic,mother_relig_raised_other,mother_relig_current_protestant,mother_relig_current_catholic,mother_relig_current_other
0,301,3,2,1,0.0,0.5,0.555556,0.045455,1.000000,0.2,...,0,0,0,1,0,1,0,0,1,0
1,302,3,1,1,0.1,0.5,1.000000,0.136364,1.000000,0.5,...,0,0,0,1,0,1,0,0,1,0
2,303,3,1,1,0.3,1.0,0.888889,0.272727,1.000000,0.8,...,0,0,0,1,0,1,0,0,0,1
3,803,8,1,1,0.1,1.0,0.611111,0.090909,0.363636,0.2,...,0,0,0,1,0,0,1,1,0,0
4,1601,16,1,1,0.5,1.0,0.666667,0.454545,1.000000,0.6,...,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5156,1255801,12558,2,1,0.4,0.5,1.000000,0.409091,0.000000,1.0,...,0,0,1,0,0,0,1,0,0,1
5157,1256602,12566,2,1,0.2,0.5,0.888889,0.227273,1.000000,1.0,...,0,0,1,0,1,0,0,1,0,0
5158,1256603,12566,1,1,0.4,1.0,0.888889,0.363636,0.888889,1.0,...,1,0,1,0,1,0,0,1,0,0
5159,1256604,12566,1,1,0.5,0.0,0.944444,0.545455,0.714286,1.0,...,1,0,1,0,1,0,0,1,0,0


### Lasso Regression
Predict repeatedly using cross validation; plot test error.

In [194]:
def soft_thresh(x, l):
    return np.sign(x) * np.maximum(np.abs(x) - l, 0.)


def ista(A, b, l, maxit):
    x = np.zeros(A.shape[1])
    pobj = []
    L = linalg.norm(A) ** 2  # Lipschitz constant
    time0 = time.time()
    for _ in range(maxit):
        x = soft_thresh(x + np.dot(A.T, b - A.dot(x)) / L, l / L)
        this_pobj = 0.5 * linalg.norm(A.dot(x) - b) ** 2 + l * linalg.norm(x, 1)
        pobj.append((time.time() - time0, this_pobj))

    times, pobj = map(np.array, zip(*pobj))
    return x, pobj, times


def fista(A, b, l, maxit):
    x = np.zeros(A.shape[1])
    pobj = []
    t = 1
    z = x.copy()
    L = linalg.norm(A) ** 2
    time0 = time.time()
    for _ in range(maxit):
        xold = x.copy()
        z = z + A.T.dot(b - A.dot(z)) / L
        x = soft_thresh(z, l / L)
        t0 = t
        t = (1. + sqrt(1. + 4. * t ** 2)) / 2.
        z = x + ((t0 - 1.) / t) * (x - xold)
        this_pobj = 0.5 * linalg.norm(A.dot(x) - b) ** 2 + l * linalg.norm(x, 1)
        pobj.append((time.time() - time0, this_pobj))

    times, pobj = map(np.array, zip(*pobj))
    return x, pobj, times


def predict_scale_labels(y_scale_pred):
    '''
    Assign label 1 through 4
    
    arg:
    - y_scale_pred: vector of predicted y values
    
    Returns: vector of predicted y labels (1 through 4)
    '''
    # generate nx4 matrix where each column is 1,2,3,4
    m = y_scale_pred.shape[0]
    y_scale_pred = y_scale_pred.reshape(m, 1)
    mat = np.tile(np.arange(4)+1, (m,1))
    
    # find lowest absolute distance
    return np.argmin(abs(mat - y_scale_pred), axis=1).reshape(m, 1)


def get_error_rate(y_label, y_label_pred):
    '''
    '''
    return 100*np.sum(y_label != y_label_pred)/len(y_label_pred)

## Attempt 1

In [195]:
# setup
rng = np.random.RandomState(42)
m, n = A.shape

#x0 = rng.rand(n)
#x0[x0 < 0.9] = 0
x0 = np.zeros(44)+0.5
b = np.dot(A, x0)
l = 0  # regularization parameter

maxit = 1000

In [196]:
x_ista,  pobj_ista,  times_ista  =  ista(A, b, l, maxit) # ista
y_scale_pred_ista  = predict_scale_labels(A.dot(x_ista))

x_fista, pobj_fista, times_fista = fista(A, b, l, maxit) # fista
y_scale_pred_fista = predict_scale_labels(A.dot(x_fista))

In [197]:
print("=============ISTA=============")

# errors
print("Error rate:", get_error_rate(y_scale, y_scale_pred_ista))

# values of non-zero parameters
filt_ista_zero = np.round(x_ista[1:], 4)==0.0

print("----------------non-zero coefficients----------------")
for feat, coef in zip(list(compress(A_names, ~filt_ista_zero)),
                      np.round(x_ista[1:], 3)[~filt_ista_zero]):
    print(f"{feat}: {coef}")
print("----------------zero coefficients----------------")
for feat, coef in zip(list(compress(A_names, filt_ista_zero)),
                      np.round(x_ista[1:], 3)[filt_ista_zero]):
    print(f"{feat}: {coef}")

Error rate: 96.0666537492734
----------------non-zero coefficients----------------
year_at_y: 0.498
gap_at_y: 0.482
self_worth: 0.491
yob_child: 0.505
father_present: 0.477
religion_freq: 0.509
hgc: 0.223
woman_place_in_the_home_birth: 0.466
woman_place_in_the_home_y: 0.484
wife_w_fam_no_tm_o_emp_birth: 0.461
wife_w_fam_no_tm_o_emp_y: 0.475
wrkng_sp_feel_more_useful_birth: 0.474
wrkng_sp_feel_more_useful_y: 0.493
emp_wife_lead_juv_delin_birth: 0.464
emp_wife_lead_juv_delin_y: 0.499
infl_neces_emp_2_parents_birth: 0.45
infl_neces_emp_2_parents_y: 0.492
trade_husband_wife_roles_birth: 0.461
trade_husband_wife_roles_y: 0.491
men_should_share_housework_birth: 0.371
men_should_share_housework_y: 0.394
women_happier_trade_roles_birth: 0.471
women_happier_trade_roles_y: 0.505
hours_works_at_y: 0.396
num_jobs_ever_at_y: 0.438
avg_poverty_status: 0.425
avg_fam_inc: 0.27
race_child_BLACK: 0.493
race_child_NON-BLACK, NON-HISPANIC: 0.496
sex_child_FEMALE: 0.496
highest_expected_grade_GRADUATE FROM

In [198]:
print("=============FISTA=============")

# errors
print("Error rate:", get_error_rate(y_scale, y_scale_pred_fista))

# values of non-zero parameters
filt_fista_zero = np.round(x_fista[1:], 4)==0.0

print("----------------non-zero coefficients----------------")
for feat, coef in zip(list(compress(A_names, ~filt_fista_zero)),
                      np.round(x_fista[1:], 3)[~filt_fista_zero]):
    print(f"{feat}: {coef}")
print("----------------zero coefficients----------------")
for feat, coef in zip(list(compress(A_names, filt_fista_zero)),
                      np.round(x_fista[1:], 3)[filt_fista_zero]):
    print(f"{feat}: {coef}")

Error rate: 96.0666537492734
----------------non-zero coefficients----------------
year_at_y: 0.522
gap_at_y: 0.498
self_worth: 0.5
yob_child: 0.476
father_present: 0.5
religion_freq: 0.5
hgc: 0.499
woman_place_in_the_home_birth: 0.501
woman_place_in_the_home_y: 0.498
wife_w_fam_no_tm_o_emp_birth: 0.499
wife_w_fam_no_tm_o_emp_y: 0.502
wrkng_sp_feel_more_useful_birth: 0.501
wrkng_sp_feel_more_useful_y: 0.499
emp_wife_lead_juv_delin_birth: 0.499
emp_wife_lead_juv_delin_y: 0.5
infl_neces_emp_2_parents_birth: 0.499
infl_neces_emp_2_parents_y: 0.5
trade_husband_wife_roles_birth: 0.5
trade_husband_wife_roles_y: 0.499
men_should_share_housework_birth: 0.501
men_should_share_housework_y: 0.499
women_happier_trade_roles_birth: 0.5
women_happier_trade_roles_y: 0.5
hours_works_at_y: 0.5
num_jobs_ever_at_y: 0.5
avg_poverty_status: 0.5
avg_fam_inc: 0.504
race_child_BLACK: 0.5
race_child_NON-BLACK, NON-HISPANIC: 0.5
sex_child_FEMALE: 0.5
highest_expected_grade_GRADUATE FROM HIGH SCHOOL: 0.492
highes

In [199]:
# import matplotlib.pyplot as plt
# plt.close('all')

# plt.figure()
# plt.stem(x0, markerfmt='go')
# plt.stem(x_ista, markerfmt='bo')
# plt.stem(x_fista, markerfmt='ro')

# plt.figure()
# plt.plot(times_ista, pobj_ista, label='ista')
# plt.plot(times_fista, pobj_fista, label='fista')
# plt.xlabel('Time')
# plt.ylabel('Primal')
# plt.legend()
# plt.show()