In [57]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.preprocessing import OneHotEncoder
from scipy import hstack
from sklearn import metrics
import time

## Cross-validation

In [4]:
df = pd.read_csv('./student-mat.csv', sep=';')

In [6]:
df.shape

(395, 33)

In [12]:
indexes = shuffle(range(395), random_state=2019425)

In [14]:
395/5

79.0

In [18]:
test = df.iloc[indexes[:79], :]

In [22]:
test.to_csv("./test_set.csv", index=False)

In [23]:
other_indexes = indexes[79:]

In [26]:
len(other_indexes) / 5

63.2

In [27]:
cv_1 = other_indexes[:63]
cv_2 = other_indexes[63:63*2]
cv_3 = other_indexes[63*2:63*3]
cv_4 = other_indexes[63*3:63*4]
cv_5 = other_indexes[63*4:]

In [35]:
df_1 = df.iloc[cv_1, :]
df_2 = df.iloc[cv_2, :]
df_3 = df.iloc[cv_3, :]
df_4 = df.iloc[cv_4, :]
df_5 = df.iloc[cv_5, :]

In [37]:
df_1.to_csv("./cv_1.csv", index=False)
df_2.to_csv("./cv_2.csv", index=False)
df_3.to_csv("./cv_3.csv", index=False)
df_4.to_csv("./cv_4.csv", index=False)
df_5.to_csv("./cv_5.csv", index=False)

## Elastic Net 

In [39]:
df_1.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
178,GP,M,16,R,GT3,T,4,2,teacher,services,...,4,3,3,3,4,3,10,10,8,9
35,GP,F,15,U,GT3,T,2,3,other,other,...,3,5,1,1,1,5,0,8,7,6
136,GP,M,17,R,GT3,T,3,4,at_home,other,...,5,4,5,2,4,5,0,10,0,0
273,GP,M,17,R,GT3,T,1,2,at_home,at_home,...,3,5,2,2,2,1,2,15,14,14
31,GP,M,15,U,GT3,T,4,4,services,services,...,4,3,1,1,1,5,0,17,16,17


In [199]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G3', 'age'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [200]:
train_df = pd.concat([df_1, df_2, df_3, df_4])
train_y = train_df['G3'].values
train_age = train_df['age'].values.reshape(-1,1)
train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G3', 'age'], axis=1)

train_x = train_df_dropped.values
train_x = enc.transform(train_x).toarray()
train_x = np.hstack((train_x, train_age))

In [202]:
vali_df = df_5
vali_y = vali_df['G3'].values
vali_age = vali_df['age'].values.reshape(-1,1)
vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G3', 'age'], axis=1)

vali_x = vali_df_dropped.values
vali_x = enc.transform(vali_x).toarray()
vali_x = np.hstack((vali_x, vali_age))

In [203]:
model = ElasticNet(alpha=0.0001,
                   l1_ratio=0.5,
                   fit_intercept=True,
                   normalize=False,
                   max_iter=1000000)

In [204]:
model.fit(train_x, train_y)

ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [205]:
model.predict(vali_x)

array([12.22110492,  9.81054282,  6.24177337, 10.86259956,  7.41785403,
        2.95522143,  4.00034319,  1.7683242 ,  9.88424141,  1.68458363,
        9.36314234, 11.47442383, 11.87496734,  9.38796563,  4.40629454,
        8.60780055, 11.29986719,  6.3779329 ,  7.83398395,  8.55770356,
       13.75772127, 17.18309051, 15.27944306, 11.20389637,  8.36358143,
       12.49709544, 15.74385738, 15.38951099,  9.39458522, 19.48920212,
       15.65057032, 17.2457531 ,  7.25061365,  8.23857145, 11.71813551,
       16.77352179, 13.56904588,  9.69327651,  6.14495702, 14.45839539,
       16.4837604 ,  7.97059553,  9.80078867, 13.77118199, 12.17071586,
       16.28786841,  7.99371857,  9.65123163, 10.62324109,  9.2066112 ,
        1.45361915, 22.31517268, 10.51332729,  5.25718884,  7.09831589,
       16.77967787,  9.40148589, 13.2870069 , 12.95590306,  5.32648298,
       13.08454616,  4.37068906,  9.43302179,  8.02412121])

In [206]:
metrics.r2_score(vali_y, model.predict(vali_x))

0.7967092734169612

In [197]:
vali_y - model.predict(vali_x)

array([ -1.50495115,  -3.62269254,  -7.22074182,   0.33206069,
        -4.30856569,  -6.18751209,  -5.31187799, -11.06254263,
         0.81699557, -10.8592637 ,  -4.9978621 ,   0.09900007,
         7.92374535,   3.12904313,  -2.77095466,   0.73896963,
         1.59241186,   0.97085375,  -1.41745857,  -1.48418929,
         4.25853104,  -0.29046539,   1.78598716,   5.09983871,
        -5.01398082,   2.13439982,   2.74227264,  -3.45303783,
         8.12504586,   2.15096553,   1.01309351,   3.62929165,
       -12.01109608,  -5.76093819,   2.18607845,   3.75330656,
        -2.35045977,   0.66096781,   1.08773496,   1.78614447,
         2.34126652,  -1.83117569,  -1.15093038,  -2.73985006,
        -1.67674071,   0.77623743,  -1.61224662,   2.08633606,
         1.2232216 ,  -9.28191647,  -4.74202887,   3.74962122,
        -1.61825116,  -2.19250961,   4.65056703,   1.73294951,
         0.59187049,   1.61837623,  -0.3715606 ,   4.66644934,
         5.06523211, -10.68127746,  -2.67182892,  -2.35

## Cross Validation

In [2]:
df = pd.read_csv('./student-mat.csv', sep=';')

df_1 = pd.read_csv("./cv_1.csv")
df_2 = pd.read_csv("./cv_2.csv")
df_3 = pd.read_csv("./cv_3.csv")
df_4 = pd.read_csv("./cv_4.csv")
df_5 = pd.read_csv("./cv_5.csv")

In [117]:
cv_dict = {
    1: {
        'train': [df_1, df_2, df_3, df_4],
        'vali': df_5
    },
    2: {
        'train': [df_1, df_2, df_3, df_5],
        'vali': df_4
    },
    3: {
        'train': [df_1, df_2, df_5, df_4],
        'vali': df_3
    },
    4: {
        'train': [df_1, df_5, df_3, df_4],
        'vali': df_2
    },
    5: {
        'train': [df_5, df_2, df_3, df_4],
        'vali': df_1
    }
}

In [118]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [119]:
for cv in cv_dict:
    # Training set
    train_df = pd.concat(cv_dict[cv]['train'])
    train_y = train_df['G3'].values
    train_age = train_df['age'].values.reshape(-1,1)
    train_g1 = train_df['G1'].values.reshape(-1,1)
    train_g2 = train_df['G2'].values.reshape(-1,1)
    train_abs = train_df['absences'].values.reshape(-1,1)
    train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

    train_x = train_df_dropped.values
    train_x = enc.transform(train_x).toarray()
    train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))
    np.savez('./cv_npz/cv_{}_train.npz'.format(cv), train_x=train_x, train_y=train_y)

    # Validation set
    vali_df = cv_dict[cv]['vali']
    vali_y = vali_df['G3'].values
    vali_age = vali_df['age'].values.reshape(-1,1)
    vali_g1 = vali_df['G1'].values.reshape(-1,1)
    vali_g2 = vali_df['G2'].values.reshape(-1,1)
    vali_abs = vali_df['absences'].values.reshape(-1,1)
    vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

    vali_x = vali_df_dropped.values
    vali_x = enc.transform(vali_x).toarray()
    vali_x = np.hstack((vali_x, vali_age, vali_abs, vali_g1, vali_g2))
    np.savez('./cv_npz/cv_{}_vali.npz'.format(cv), vali_x=vali_x, vali_y=vali_y)

### Elastic Net

In [50]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)


alphas = [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
blends = [i/10 for i in range(1, 10)]
keys = [(i, j) for i in alphas for j in blends]
en_results = dict(zip(keys, [[0, 0,] for i in keys]))

for alpha, blend in en_results:
    cur_mse = []
    cur_r2 = []
    
    for cv in cv_dict:
        # Training set
        train_df = pd.concat(cv_dict[cv]['train'])
        train_y = train_df['G3'].values
        train_age = train_df['age'].values.reshape(-1,1)
        train_g1 = train_df['G1'].values.reshape(-1,1)
        train_g2 = train_df['G2'].values.reshape(-1,1)
        train_abs = train_df['absences'].values.reshape(-1,1)
        train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        train_x = train_df_dropped.values
        train_x = enc.transform(train_x).toarray()
        train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

        # Validation set
        vali_df = cv_dict[cv]['vali']
        vali_y = vali_df['G3'].values
        vali_age = vali_df['age'].values.reshape(-1,1)
        vali_g1 = vali_df['G1'].values.reshape(-1,1)
        vali_g2 = vali_df['G2'].values.reshape(-1,1)
        vali_abs = vali_df['absences'].values.reshape(-1,1)
        vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        vali_x = vali_df_dropped.values
        vali_x = enc.transform(vali_x).toarray()
        vali_x = np.hstack((vali_x, vali_age, vali_abs, vali_g1, vali_g2))

        # Train the model
        model = ElasticNet(alpha=alpha,
                       l1_ratio=blend,
                       fit_intercept=True,
                       normalize=False,
                       max_iter=1000000)
        model.fit(train_x, train_y)

        # Test on the validation set
        vali_predict = model.predict(vali_x)

        # Track performance
        cur_mse.append(metrics.mean_squared_error(vali_y, vali_predict))
        cur_r2.append(metrics.r2_score(vali_y, vali_predict))
        
    en_results[(alpha, blend)] = [np.mean(cur_mse), np.mean(cur_r2)]
    
en_results

{(1000, 0.1): [21.37455452727478, -0.05116282882839753],
 (1000, 0.2): [21.37455452727478, -0.05116282882839753],
 (1000, 0.3): [21.37455452727478, -0.05116282882839753],
 (1000, 0.4): [21.37455452727478, -0.05116282882839753],
 (1000, 0.5): [21.37455452727478, -0.05116282882839753],
 (1000, 0.6): [21.37455452727478, -0.05116282882839753],
 (1000, 0.7): [21.37455452727478, -0.05116282882839753],
 (1000, 0.8): [21.37455452727478, -0.05116282882839753],
 (1000, 0.9): [21.37455452727478, -0.05116282882839753],
 (100, 0.1): [19.546031366636306, 0.04259275723928502],
 (100, 0.2): [21.37455452727478, -0.05116282882839753],
 (100, 0.3): [21.37455452727478, -0.05116282882839753],
 (100, 0.4): [21.37455452727478, -0.05116282882839753],
 (100, 0.5): [21.37455452727478, -0.05116282882839753],
 (100, 0.6): [21.37455452727478, -0.05116282882839753],
 (100, 0.7): [21.37455452727478, -0.05116282882839753],
 (100, 0.8): [21.37455452727478, -0.05116282882839753],
 (100, 0.9): [21.37455452727478, -0.051

### Lasso

In [29]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

lasso_results = dict(zip([1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001], [[0, 0] for _ in range(7)]))

for alpha in en_results:
    cur_mse = []
    cur_r2 = []
    
    for cv in cv_dict:
        # Training set
        train_df = pd.concat(cv_dict[cv]['train'])
        train_y = train_df['G3'].values
        train_age = train_df['age'].values.reshape(-1,1)
        train_g1 = train_df['G1'].values.reshape(-1,1)
        train_g2 = train_df['G2'].values.reshape(-1,1)
        train_abs = train_df['absences'].values.reshape(-1,1)
        train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        train_x = train_df_dropped.values
        train_x = enc.transform(train_x).toarray()
        train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

        # Validation set
        vali_df = cv_dict[cv]['vali']
        vali_y = vali_df['G3'].values
        vali_age = vali_df['age'].values.reshape(-1,1)
        vali_g1 = vali_df['G1'].values.reshape(-1,1)
        vali_g2 = vali_df['G2'].values.reshape(-1,1)
        vali_abs = vali_df['absences'].values.reshape(-1,1)
        vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        vali_x = vali_df_dropped.values
        vali_x = enc.transform(vali_x).toarray()
        vali_x = np.hstack((vali_x, vali_age, vali_abs, vali_g1, vali_g2))

        # Train the model
        model = Lasso(alpha=alpha,
                       fit_intercept=True,
                       normalize=False,
                       max_iter=1000000)
        model.fit(train_x, train_y)

        # Test on the validation set
        vali_predict = model.predict(vali_x)

        # Track performance
        cur_mse.append(metrics.mean_squared_error(vali_y, vali_predict))
        cur_r2.append(metrics.r2_score(vali_y, vali_predict))
        
    lasso_results[alpha] = [np.mean(cur_mse), np.mean(cur_r2)]
    
lasso_results

{1000: [21.37455452727478, -0.05116282882839753],
 100: [21.37455452727478, -0.05116282882839753],
 10: [11.533374623015773, 0.4490276107684511],
 1: [4.09560842243497, 0.8132837057566938],
 0.1: [3.998478043934745, 0.8153912770967733],
 0.01: [4.61010027696618, 0.7787058345711817],
 0.001: [5.164607930655503, 0.7487194171398851]}

### Ridge

In [30]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

ridge_results = dict(zip([1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001], [[0, 0] for _ in range(7)]))

for alpha in en_results:
    cur_mse = []
    cur_r2 = []
    
    for cv in cv_dict:
        # Training set
        train_df = pd.concat(cv_dict[cv]['train'])
        train_y = train_df['G3'].values
        train_age = train_df['age'].values.reshape(-1,1)
        train_g1 = train_df['G1'].values.reshape(-1,1)
        train_g2 = train_df['G2'].values.reshape(-1,1)
        train_abs = train_df['absences'].values.reshape(-1,1)
        train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        train_x = train_df_dropped.values
        train_x = enc.transform(train_x).toarray()
        train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

        # Validation set
        vali_df = cv_dict[cv]['vali']
        vali_y = vali_df['G3'].values
        vali_age = vali_df['age'].values.reshape(-1,1)
        vali_g1 = vali_df['G1'].values.reshape(-1,1)
        vali_g2 = vali_df['G2'].values.reshape(-1,1)
        vali_abs = vali_df['absences'].values.reshape(-1,1)
        vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        vali_x = vali_df_dropped.values
        vali_x = enc.transform(vali_x).toarray()
        vali_x = np.hstack((vali_x, vali_age, vali_abs, vali_g1, vali_g2))

        # Train the model
        model = Ridge(alpha=alpha,
                       fit_intercept=True,
                       normalize=False,
                       max_iter=1000000)
        model.fit(train_x, train_y)

        # Test on the validation set
        vali_predict = model.predict(vali_x)

        # Track performance
        cur_mse.append(metrics.mean_squared_error(vali_y, vali_predict))
        cur_r2.append(metrics.r2_score(vali_y, vali_predict))
        
    ridge_results[alpha] = [np.mean(cur_mse), np.mean(cur_r2)]
    
ridge_results

{1000: [4.536146433241242, 0.7900682962275586],
 100: [3.958281058688877, 0.8165398991364698],
 10: [4.45327771043412, 0.7883942624486295],
 1: [5.064399274295008, 0.7543514625844503],
 0.1: [5.227506558402414, 0.7454936053710959],
 0.01: [5.248344000188128, 0.7443709184222074],
 0.001: [5.25050204198714, 0.7442547695099357]}

## Test

In [120]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

# Training set
train_df = pd.concat([df_1, df_2, df_3, df_4, df_5])
train_y = train_df['G3'].values
train_age = train_df['age'].values.reshape(-1,1)
train_g1 = train_df['G1'].values.reshape(-1,1)
train_g2 = train_df['G2'].values.reshape(-1,1)
train_abs = train_df['absences'].values.reshape(-1,1)
train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

train_x = train_df_dropped.values
train_x = enc.transform(train_x).toarray()
train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

# Test set
test_df = pd.read_csv('./test_set.csv')
test_y = test_df['G3'].values
test_age = test_df['age'].values.reshape(-1,1)
test_g1 = test_df['G1'].values.reshape(-1,1)
test_g2 = test_df['G2'].values.reshape(-1,1)
test_abs = test_df['absences'].values.reshape(-1,1)
test_df_dropped = test_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

test_x = test_df_dropped.values
test_x = enc.transform(test_x).toarray()
test_x = np.hstack((test_x, test_age, test_abs, test_g1, test_g2))

np.savez('./cv_npz/test_train.npz'.format(cv), train_x=train_x, train_y=train_y)
np.savez('./cv_npz/test_test.npz'.format(cv), test_x=test_x, test_y=test_y)

In [35]:
test_result = {'lasso': [], 'ridge': [], 'en': [], 'no_reg': []}
best_params = {'lasso': 0.1, 'ridge': 100, 'en': [0.1, 0.7]}

In [37]:
model = ElasticNet(alpha=best_params['en'][0],
               l1_ratio=best_params['en'][1],
               fit_intercept=True,
               normalize=False,
               max_iter=1000000)
model.fit(train_x, train_y)

# Test on the validation set
test_predict = model.predict(test_x)

# Track performance
test_result['en'] = [metrics.mean_squared_error(test_y, test_predict),
                     metrics.r2_score(test_y, test_predict)]

model = Lasso(alpha=best_params['lasso'],
               fit_intercept=True,
               normalize=False,
               max_iter=1000000)
model.fit(train_x, train_y)

# Test on the validation set
test_predict = model.predict(test_x)

# Track performance
test_result['lasso'] = [metrics.mean_squared_error(test_y, test_predict),
                        metrics.r2_score(test_y, test_predict)]

model = Ridge(alpha=best_params['ridge'],
               fit_intercept=True,
               normalize=False,
               max_iter=1000000)
model.fit(train_x, train_y)


# Test on the validation set
test_predict = model.predict(test_x)

# Track performance
test_result['ridge'] = [metrics.mean_squared_error(test_y, test_predict),
                        metrics.r2_score(test_y, test_predict)]

model = LinearRegression(fit_intercept=True, normalize=False)
model.fit(train_x, train_y)


# Test on the validation set
test_predict = model.predict(test_x)

# Track performance
test_result['no_reg'] = [metrics.mean_squared_error(test_y, test_predict),
                         metrics.r2_score(test_y, test_predict)]

In [38]:
test_result

{'lasso': [2.8231742775443682, 0.8614040127575797],
 'ridge': [2.8536100412196164, 0.8599098525324742],
 'en': [2.7708078834745207, 0.8639747970489233],
 'no_reg': [3.480643846476052, 0.829127350026296]}

### Closed Form Regression

$$\hat{\beta}=(X^{T}X)^{-1}X^{T}y$$

In [102]:
l_1 = np.linalg.inv(np.matmul(train_x.transpose(), train_x))
l_2 = train_x.transpose()
l_3 = train_y

In [110]:
beta = np.matmul(np.matmul(l_1, l_2), l_3)

(89,)

## Elastic Net Implementation

In [6]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

# Training set
train_df = pd.concat([df_1, df_2, df_3, df_4, df_5])
train_y = train_df['G3'].values
train_age = train_df['age'].values.reshape(-1,1)
train_g1 = train_df['G1'].values.reshape(-1,1)
train_g2 = train_df['G2'].values.reshape(-1,1)
train_abs = train_df['absences'].values.reshape(-1,1)
train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

train_x = train_df_dropped.values
train_x = enc.transform(train_x).toarray()
train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

# Test set
test_df = pd.read_csv('./test_set.csv')
test_y = test_df['G3'].values
test_age = test_df['age'].values.reshape(-1,1)
test_g1 = test_df['G1'].values.reshape(-1,1)
test_g2 = test_df['G2'].values.reshape(-1,1)
test_abs = test_df['absences'].values.reshape(-1,1)
test_df_dropped = test_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

test_x = test_df_dropped.values
test_x = enc.transform(test_x).toarray()
test_x = np.hstack((test_x, test_age, test_abs, test_g1, test_g2))

In [130]:
def en_train(x, y, alpha=0.5, lamb=0.1, max_iter=1000, epsilon=1e-6):
    
    coef_track = []
    
    # Add a column for intercept coef
    train_x = np.column_stack((np.ones(len(x)), x))

    # Just initialize all coefs to 0
    weights = np.zeros(train_x.shape[1])
    
    # Compute c_1, c_2 (it is same for every j)
    c_1 = 2 * lamb * (1 - alpha)
    c_2 = lamb * alpha

    # No converge stopping
    for iter in range(max_iter):
        old_weights = weights.copy()

        # Iterate through all coefficients
        for j in range(len(weights)):
            
            # Exclude the contribution of current coefficient
            cur_weights = weights.copy()
            cur_weights[j] = 0
            
            # Compute p_j (1): residuals (without contribution of w_j)
            predict = np.dot(train_x, cur_weights)
            residual = y - predict
            
            # Compute p_j (2): weight residuals by x_j and take sum (dot product)
            p_j = np.dot(train_x[:, j], residual)
            
            # Compute z_j
            z_j = np.sum(train_x[:, j] ** 2)
            
            # Compare p_j with alpha * lambda and update weights
            if p_j < -c_2:
                weights[j] = (p_j + c_2) / (z_j + c_1)
            elif p_j > c_2:
                weights[j] = (p_j - c_2) / (z_j + c_1)
            else:
                weights[j] = 0
                
        # Need to specially handle the intercept (best intercept)
        best_intercepts = y - np.dot(train_x[:, 1:], weights[1:])
        weights[0] = np.sum(best_intercepts) / (train_x.shape[0])
        
        coef_track.append(weights.tolist())
        
        # If the update is too small, we stop the iteration
        max_update = np.max(np.abs(weights - old_weights))
        if max_update < epsilon:
            # print("Early stop: at iteration {}".format(iter))
            return (weights, coef_track)
    
    return (weights, coef_track)


def en_predict(x, coef):
    x_padding = np.column_stack((np.ones(len(x)), x))
    y_predict = np.dot(x_padding, coefs)
    return y_predict

In [50]:
coefs = en_train(train_x, train_y, alpha=0.7, lamb=0.1, max_iter=10000)

In [44]:
predict(test_x, test_y, coefs)

[3.4641526432063547, 0.8299369403573496]

In [58]:
start = time.time()

df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

alphas = [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
blends = [i/10 for i in range(1, 10)]
keys = [(i, j) for i in alphas for j in blends]
en_results = dict(zip(keys, [[0, 0,] for i in keys]))
count = 0

for alpha, blend in en_results:
    count += 1
    print(count)
    
    cur_mse = []
    cur_r2 = []
    cur_ar2 = []
    
    for cv in cv_dict:
        # Training set
        train_df = pd.concat(cv_dict[cv]['train'])
        train_y = train_df['G3'].values
        train_age = train_df['age'].values.reshape(-1,1)
        train_g1 = train_df['G1'].values.reshape(-1,1)
        train_g2 = train_df['G2'].values.reshape(-1,1)
        train_abs = train_df['absences'].values.reshape(-1,1)
        train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        train_x = train_df_dropped.values
        train_x = enc.transform(train_x).toarray()
        train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

        # Validation set
        vali_df = cv_dict[cv]['vali']
        vali_y = vali_df['G3'].values
        vali_age = vali_df['age'].values.reshape(-1,1)
        vali_g1 = vali_df['G1'].values.reshape(-1,1)
        vali_g2 = vali_df['G2'].values.reshape(-1,1)
        vali_abs = vali_df['absences'].values.reshape(-1,1)
        vali_df_dropped = vali_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

        vali_x = vali_df_dropped.values
        vali_x = enc.transform(vali_x).toarray()
        vali_x = np.hstack((vali_x, vali_age, vali_abs, vali_g1, vali_g2))

        # Train the model
        coefs = en_train(train_x, train_y, alpha=blend, lamb=alpha, max_iter=1000)

        # Test on the validation set
        vali_predict = en_predict(vali_x, coefs)

        # Track performance
        cur_mse.append(metrics.mean_squared_error(vali_y, vali_predict))
        cur_r2.append(metrics.r2_score(vali_y, vali_predict))
        cur_ar2.append(metrics.r2_score)
        
    en_results[(alpha, blend)] = [np.mean(cur_mse), np.mean(cur_r2), np.mean(cur_arw)]

print("Used {} seconds".format(time.time() - start))
en_results

Used 637.6046061515808 seconds


{(1000, 0.1): [5.1898305502762145, 0.7586019007947982],
 (1000, 0.2): [5.105914220510753, 0.7627299959476231],
 (1000, 0.3): [5.024102872294314, 0.7667646828035627],
 (1000, 0.4): [4.953429297465515, 0.7703185616662538],
 (1000, 0.5): [4.891185673341892, 0.7736051249593453],
 (1000, 0.6): [4.833446267945211, 0.7768344963341587],
 (1000, 0.7): [4.758433210623304, 0.7808786522764268],
 (1000, 0.8): [4.661771911494286, 0.7860082579032402],
 (1000, 0.9): [4.544985472281258, 0.7921566499035165],
 (100, 0.1): [3.9528397950774647, 0.8171214292971956],
 (100, 0.2): [3.9699583282328192, 0.8165608684261318],
 (100, 0.3): [3.974569171571474, 0.8164010100154775],
 (100, 0.4): [3.966127889009273, 0.8169503125629249],
 (100, 0.5): [3.962602742983728, 0.8172942595863845],
 (100, 0.6): [3.9600923731631115, 0.8175944395849587],
 (100, 0.7): [3.9588479866542103, 0.8178387593855666],
 (100, 0.8): [3.9591899737606733, 0.8180117249493281],
 (100, 0.9): [3.961523924044492, 0.8180936127991492],
 (10, 0.1): [

In [59]:
df_dropped = df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)
enc = OneHotEncoder(handle_unknown='error')
df_array = df_dropped.values
enc.fit(df_array)

# Training set
train_df = pd.concat([df_1, df_2, df_3, df_4, df_5])
train_y = train_df['G3'].values
train_age = train_df['age'].values.reshape(-1,1)
train_g1 = train_df['G1'].values.reshape(-1,1)
train_g2 = train_df['G2'].values.reshape(-1,1)
train_abs = train_df['absences'].values.reshape(-1,1)
train_df_dropped = train_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

train_x = train_df_dropped.values
train_x = enc.transform(train_x).toarray()
train_x = np.hstack((train_x, train_age, train_abs, train_g1, train_g2))

# Test set
test_df = pd.read_csv('./test_set.csv')
test_y = test_df['G3'].values
test_age = test_df['age'].values.reshape(-1,1)
test_g1 = test_df['G1'].values.reshape(-1,1)
test_g2 = test_df['G2'].values.reshape(-1,1)
test_abs = test_df['absences'].values.reshape(-1,1)
test_df_dropped = test_df.drop(['Mjob', 'Fjob', 'G1', 'G2', 'G3', 'age', 'absences'], axis=1)

test_x = test_df_dropped.values
test_x = enc.transform(test_x).toarray()
test_x = np.hstack((test_x, test_age, test_abs, test_g1, test_g2))

In [131]:
(coefs, coef_track) = en_train(train_x, train_y, alpha=0.8, lamb=100, max_iter=1000)
predict_y = en_predict(test_x, coefs)
print(metrics.mean_squared_error(test_y, predict_y),
      metrics.r2_score(test_y, predict_y))

2.7910014431501344 0.862983449698729


In [138]:
my_dict = dict(enumerate(coef_track))

In [140]:
ddf = pd.DataFrame(my_dict)

In [142]:
ddf.to_csv("./en_coef_change.csv", index=False)

In [100]:
df.head(8)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,10,15,15,15
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,12,12,11
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,6,6,5,6


In [109]:
en_results

{(1000, 0.1): [5.1898305502762145, 0.7586019007947982],
 (1000, 0.2): [5.105914220510753, 0.7627299959476231],
 (1000, 0.3): [5.024102872294314, 0.7667646828035627],
 (1000, 0.4): [4.953429297465515, 0.7703185616662538],
 (1000, 0.5): [4.891185673341892, 0.7736051249593453],
 (1000, 0.6): [4.833446267945211, 0.7768344963341587],
 (1000, 0.7): [4.758433210623304, 0.7808786522764268],
 (1000, 0.8): [4.661771911494286, 0.7860082579032402],
 (1000, 0.9): [4.544985472281258, 0.7921566499035165],
 (100, 0.1): [3.9528397950774647, 0.8171214292971956],
 (100, 0.2): [3.9699583282328192, 0.8165608684261318],
 (100, 0.3): [3.974569171571474, 0.8164010100154775],
 (100, 0.4): [3.966127889009273, 0.8169503125629249],
 (100, 0.5): [3.962602742983728, 0.8172942595863845],
 (100, 0.6): [3.9600923731631115, 0.8175944395849587],
 (100, 0.7): [3.9588479866542103, 0.8178387593855666],
 (100, 0.8): [3.9591899737606733, 0.8180117249493281],
 (100, 0.9): [3.961523924044492, 0.8180936127991492],
 (10, 0.1): [

In [121]:
row_1, row_2, row_3, row_4, row_5 = [], [], [], [], []
for key in en_results:
    row_1.append(key[0])
    row_2.append(key[1])
    row_3.append(en_results[key][0])
    row_4.append(en_results[key][1])
    r2 = en_results[key][1]
    ar2 = 1 - (1 - r2) * (79 - 1) / (79 - 89 - 1)
    row_5.append(ar2)

ddf = pd.DataFrame({'lambda': row_1, 'alpha': row_2, 'mse': row_3, 'r2': row_4, 'ar2': row_5})

In [113]:
ddf.to_csv('./en_cv_result.csv', index=False)

In [116]:
test_x.shape

(79, 89)