In [8]:
# import modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import copy
import pylab
sns.set()

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [12]:
#####################################

def load_yelp():
    # Load a csv of floats:
    X = np.genfromtxt("upvote_data.csv", delimiter=",")
    # Load a text file of integers:
    y = np.loadtxt("upvote_labels.txt", dtype=np.int)
    # Load a text file of strings:
    featureNames = open("upvote_features.txt").read().splitlines()
    
    # demean!
    X = pylab.demean(X,axis=0)
    return X,y,featureNames

def split(data,labels):
    n_train = 4000
    n_valid = 1000
    n_test = 1000
    
    train_d = data[0:n_train,:]
    train_l = labels[0:n_train]
    
    valid_d = data[n_train:n_train+n_valid,:]
    valid_l = labels[n_train:n_train+n_valid]
    
    test_d = data[n_valid::,:]
    test_l = labels[n_valid::]
    
    return train_d,valid_d,test_d,train_l,valid_l,test_l

### lasso!

def lambda_max_func(x,y):
    k = np.shape(x)[1]
    n = np.shape(x)[0]
    lambda_vec = []
    sum_y = (1/n)*np.sum(y[:])
#     for k_ind in np.arange(k):

#         lambda_val = 2*np.abs(np.dot(x[:,k_ind].T,(y-sum_y)))
#         lambda_vec.append(lambda_val)
#     lambda_max = np.max(lambda_vec)
    k_range = np.arange(k)
    lambda_max = np.max([2*np.abs(np.dot(x[:,k_ind].T,(y-sum_y))) for k_ind in k_range])
    return lambda_max

def lasso_CD(x,y,lambda_val,w_init):
    ### initialize conditions 
    delta = 1e-2
    n = np.shape(x)[0]
    d = np.shape(x)[1]

    stop_cond = 100
    w = w_init
    a = np.zeros((d,))
    c = np.zeros((d,))
    stop_cond_vec = []
    w_old = np.zeros((d,))

    while(stop_cond>delta):
        diff_y_x = y - np.dot(x,w)
        w_0 = (1/n)*np.sum(diff_y_x)

        for k in np.arange(d):
            a[k] = 2*np.dot(x[:,k].T,x[:,k])

            # subtract version
            
            diff_y_x_w0 = y - w_0 - np.dot(x,w) + np.dot(x[:,k],w[k])
            
            # bool version
#             mask = np.ones(d,dtype=bool)
#             mask[k] = 0
#             x_temp = x[:,mask]
#             w_temp = w[mask]

#             diff_y_x_w0 = y - w_0 - np.dot(x_temp,w_temp)
            
            c[k] = 2*np.dot(x[:,k].T,diff_y_x_w0)

            if c[k] < - lambda_val:
                w[k] = ((c[k]+lambda_val))/a[k]
            elif (-lambda_val < c[k] < lambda_val):
                w[k] = 0
            elif c[k] > lambda_val:
                w[k] = ((c[k]-lambda_val))/a[k]

        weight_diff = w - w_old
        w_old = np.array(w)

        stop_cond = np.max(weight_diff)
        print('value of lasso run {}'.format(stop_cond))
        #stop_cond_vec.append(stop_cond)
    
    return w

def lasso_loop(x_t,y_t,x_v,y_v):
    w_vec = []
    num_non_zero_vec = []
    lambda_val = lambda_max_func(x_t,y_t)
    lambda_vec = []
    valid_error_vec=[]
    train_error_vec = []
    valid_error = 100
    
    w_init = np.random.random(np.shape(x_t)[1])
    
    while (valid_error>(0.1)):
        print('the value of lambda is {}'.format(lambda_val))
        w = lasso_CD(x_t,y_t,lambda_val,w_init)
        w = lasso_CD(x_t,y_t,lambda_val,w_init)

        # total non zero 
        num_non_zero = sum((np.abs(x)>0 for x in w))
        
        w_init = np.array(w)
        w_vec.append(w)
        
        train_error = error_calc(x_t,y_t,w)
        valid_error = error_calc(x_v,y_v,w)
        num_non_zero_vec.append(num_non_zero)
        lambda_vec.append(lambda_val)
        valid_error_vec.append(valid_error)
        
        
        ratio = 1/5
        
        lambda_val = lambda_val*ratio
        print('The validation error was {}'.format(valid_error))
    
    return w_vec,num_non_zero_vec,lambda_vec,valid_error_vec,train_error_vec

def error_calc(x,y,w):
    n = np.shape(y)[0]
    error = (1/n)*np.sum((y-np.dot(x,w))**2)
    return error

def plot_valid_train(train_error_vec,valid_error_vec,num_non_zero_vec,lambda_vec):
   
    ax1= plt.figure(figsize=(4, 4), dpi=600)
    plt.plot(lambda_vec,train_error_vec,label='training error')
    plt.plot(lambda_vec,valid_error_vec,label='training error')
    plt.xlabel(r'$ \lambda $')
    plt.title('Training and Validation error for various values of Labmda ')
    plt.savefig('hw2_valid_train_error_kaggle.png')
    # plot recall 
    ax2 = plt.figure(figsize=(4, 4), dpi=600)
    plt.plot(lambda_vec,num_non_zero_vec)
    plt.xlabel(r'$ \lambda $')
    plt.ylable('Number of non zero entries')
    plt.title('Number of non zero entries for various values of lambda')
    plt.savefig('hw2_nonzero_kaggle.png')
    
def plot_lasso(w,stop_cond_vec,w_star):
    ax1 = sns.distplot(w, rug=True,hist=True,bins=100, kde=False, rug_kws={"color": "g"},hist_kws={"histtype": "step", "linewidth": 3, "alpha": 1, "color": "g"})

    plt.figure()
    ax3 = plt.bar(np.arange(len(w_star)),w_star,label='w*')
    ax2 = plt.bar(np.arange(len(w)),1.1*w,label='computed weight matrix')
    plt.legend()
    plt.ylim([-10,10])

    plt.figure()
    ax3 = plt.plot(np.arange(len(stop_cond_vec)),stop_cond_vec)
    plt.ylim([0,0.05])

### part a

In [10]:
# load data
X,y,featureNames = load_yelp()

# split data
train_d,valid_d,test_d,train_l,valid_l,test_l = split(X,y)


In [13]:
w_vec,num_non_zero_vec,lambda_vec,valid_error_vec,train_error_vec = lasso_loop(train_d,train_l,valid_d,valid_l)

plot_valid_train(train_error_vec,valid_error_vec,num_non_zero_vec,lambda_vec)


the value of lambda is 145.92594572640883
value of lasso run 10.07835148923803
value of lasso run 193.70816979941088
value of lasso run 1.126362950887687e-05
value of lasso run 8.11718172553599e-11
The validation error was 7.259999999995968
the value of lambda is 29.185189145281768
value of lasso run 72.01394827620652
value of lasso run 11.347650975490152
value of lasso run 11.580385962907336
value of lasso run 10.310089305737044
value of lasso run 8.98532047558377
value of lasso run 7.604538895147961
value of lasso run 6.193898364798329
value of lasso run 4.496516294931496
value of lasso run 2.7962195231260125
value of lasso run 1.1490778453293613
value of lasso run 1.0433641004981524
value of lasso run 0.9225823904777766
value of lasso run 0.8145497629972933
value of lasso run 0.7175357122610393
value of lasso run 0.6300880907665487
value of lasso run 0.5514315375138228
value of lasso run 0.4102758323532214
value of lasso run 0.39017100247649505
value of lasso run 0.4235682805065739


KeyboardInterrupt: 

### part b

In [33]:
# lambda optimal
lambda_opt = ...
w_init = w_vec[:,...]
w_best,stop_cond_vec = lasso_CD(test_d,test_l,lambda_opt,w_init)

test_error = error_calc(test_d,test_l,w_best)
print('The test error is {}'.format(test_error))


### part c

In [44]:
inds_top10 = np.argpartition(w, -10)[-10:]

top10_weight = w[inds_top10]
top10_featfeaturesNames[inds_top10]

['sq(UserFunnyVotes*BusinessNumStars)',
 'ReviewNumCharacters/BusinessLongitude',
 'sq(ReviewNumWords*InMesa)',
 'log(IsRestaurant/UserAverageStars)',
 'sqrt(IsAutomotive*BusinessNumStars)',
 'ReviewNumLineBreaks*ReviewInSpring',
 'sqrt(IsAutomotive/UserFunnyVotes)',
 'log(IsShopping/BusinessNumStars)',
 'sqrt(IsRestaurant/InGilbert)',
 'sqrt(ReviewInSummer/UserUsefulVotes)']

In [31]:
b = np.array([[1,1,1],[2,2,2],[3,3,3]])
np.shape(b)
pylab.demean(b,axis=0)
b
(0.708+0.58+0.368)/3
0.0535-0.55

(3, 3)

array([[-1., -1., -1.],
       [ 0.,  0.,  0.],
       [ 1.,  1.,  1.]])

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])

0.5519999999999999

-0.49650000000000005

In [32]:
X = np.genfromtxt("upvote_data.csv", delimiter=",")


In [None]:
np.shape(X)

In [40]:
w_vec

NameError: name 'w_vec' is not defined