In [1]:
import numpy as np
import scipy
from sklearn.model_selection import train_test_split

In [2]:
def rms(preds, y_val):
    """Return the root-mean-square error between predictions and targets.

    Parameters
    ----------
    preds : array-like or scalar
        Predicted values.
    y_val : array-like or scalar
        Reference values; must support ``preds - y_val``.

    Returns
    -------
    float
        Square root of the mean squared difference.
    """
    residuals = preds - y_val
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

In [3]:
class RegularizedRegression:
    """Ridge (L2-regularized) linear regression solved in closed form.

    The weights solve ``(X^T X + lambda * I) w = X^T y``. No intercept term
    is fitted.

    Parameters
    ----------
    regularization_params : float or sequence of float, default 1e-2
        Either a single penalty ``lambda`` or a collection of candidate
        penalties. With a collection, ``train`` holds out 10% of the training
        data and keeps the candidate with the lowest RMS error on it.

    Attributes
    ----------
    weights : ndarray
        Fitted coefficients; set by ``train``.
    regularization_param_best : float
        Penalty that produced ``weights``; set by ``train``.
    """

    def __init__(self, regularization_params=1e-2):
        self.regularization_param = regularization_params

    @staticmethod
    def _ridge_weights(X, y, penalty):
        """Solve the ridge normal equations for a single penalty value."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        # The penalty belongs on the diagonal only (lambda * I); adding the
        # bare scalar would shift every entry of the Gram matrix.
        gram = X.T.dot(X) + penalty * np.eye(X.shape[1])
        # Solving the linear system directly avoids forming an explicit inverse.
        return np.linalg.solve(gram, X.T.dot(y))

    def train(self, X_train, y_train):
        """Fit the weights, selecting the penalty first if candidates were given.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If the collection of candidate penalties is empty.
        """
        candidates = self.regularization_param
        if np.ndim(candidates) > 0:
            if len(candidates) == 0:
                raise ValueError('regularization_params must contain at least one candidate')
            # The hold-out split is only needed when there is a choice to make.
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42
            )
            best_score = np.inf
            for penalty in candidates:
                weights = self._ridge_weights(X_fit, y_fit, penalty)
                curr_err = rms(X_val.dot(weights), y_val)
                if curr_err < best_score:
                    best_score = curr_err
                    self.regularization_param_best = penalty
                    # As before, the kept weights come from the 90% fit split.
                    self.weights = weights
        else:
            self.weights = self._ridge_weights(X_train, y_train, candidates)
            self.regularization_param_best = candidates

    def predict(self, X_test):
        """Return ``X_test . weights``; ``train`` must have been called first."""
        return X_test.dot(self.weights)

    def get_score(self, X_val, y_val):
        """Return the RMS error of the predictions for ``X_val`` against ``y_val``."""
        preds = self.predict(X_val)
        return rms(preds, y_val)

In [4]:
import pandas as pd
# Load the CASP table; 'RMSD' is the regression target, all other columns are features.
data = pd.read_csv('./data/CASP.csv')
# train_size=330 keeps exactly 330 rows for training; the remainder becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='RMSD'),
    data['RMSD'],
    train_size=330,
    random_state=42,
)

In [5]:
# A list of penalties makes train() pick the one with the lowest hold-out RMS error.
model = RegularizedRegression(regularization_params=[10, 1, 0.1, 0.01, 0.001, 0.0001])
model.train(X_train, y_train)
ridge_test_rmse = model.get_score(X_test, y_test)
print('test error for regularized regression is {}'.format(ridge_test_rmse))

test error for regularized regression is 5.2597138992835735


In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
# scikit-learn least-squares baseline, scored with the same rms() helper for comparison.
model = LinearRegression()
model.fit(X_train, y_train)
ols_test_rmse = rms(model.predict(X_test), y_test)
print('test error for sklearn regression is {}'.format(ols_test_rmse))

test error for sklearn regression is 5.256953156344602


In [8]:
# Load the slice-localization table; 'reference' is the target, all other columns are features.
data1 = pd.read_csv('./data/slice_localization_data.csv')
# Again only 330 rows are used for training; everything else is held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    data1.drop(columns='reference'),
    data1['reference'],
    train_size=330,
    random_state=69,
)

In [9]:
# Same penalty grid as before, now fitted on the slice-localization split.
model = RegularizedRegression(regularization_params=[10, 1, 0.1, 0.01, 0.001, 0.0001])
model.train(X_train, y_train)
ridge_test_rmse = model.get_score(X_test, y_test)
print('test error for regularized regression is {}'.format(ridge_test_rmse))

test error for regularized regression is 10.604838465191966


In [10]:
# scikit-learn least-squares baseline on the slice-localization split.
model = LinearRegression()
model.fit(X_train, y_train)
ols_test_rmse = rms(model.predict(X_test), y_test)
print('test error for sklearn regression is {}'.format(ols_test_rmse))

test error for sklearn regression is 45.923593525667386
