# Regression - Problem 1

In [None]:
import numpy as np

In [None]:
# Read the training inputs, training targets and test inputs from disk
# (in that order).
x_train, y_train, x_test = (
    np.load(filename)
    for filename in (
        "Xtrain_Regression_Part1.npy",
        "Ytrain_Regression_Part1.npy",
        "Xtest_Regression_Part1.npy",
    )
)

## Study Implementation

## Linear Predictor

In [None]:
# Design matrix: every row is [1, x_i], i.e. a column of ones is placed in
# front of the features so the first coefficient acts as the intercept.
ones = np.ones((1, x_train.shape[0]))
X = np.concatenate((ones.T, x_train), axis=1)

In [None]:
# Ordinary least squares via the normal equations:
#     (X.T X) Beta = X.T y      <=>      Beta = (X.T X)^-1 X.T y
# The linear system is solved directly instead of forming the inverse
# explicitly: same solution, but cheaper and with less round-off error.
Beta_hat = np.linalg.solve(np.matmul(X.T, X), np.matmul(X.T, y_train))

# print("Beta_hat", Beta_hat)

## Sum Squared Errors

In [None]:
# Residuals of the fitted model on the training data.
residual = y_train - X @ Beta_hat
# Sum of squared errors: r.T r
SSE = residual.T @ residual
print("SSE", SSE)
# Mean squared error: SSE divided by the number of training targets.
print("MSE", SSE / len(y_train))

## Model Class

In [None]:
class LinearPredictor():
    """Ordinary least-squares linear model with an intercept term.

    Attributes created by the methods:
        Beta_hat: fitted coefficients, intercept first (set by ``train``).
        SSE: sum of squared errors of the latest ``perf`` call.
    """

    @staticmethod
    def _design_matrix(x):
        """Return the 2-D sample matrix ``x`` with a leading column of ones."""
        x = np.asarray(x)
        return np.hstack((np.ones((len(x), 1)), x))

    def train(self, x_train, y_train):
        """
        Fit the model by least squares and set ``Beta_hat``.

        Args:
            x_train: 2-D array of samples, one row per sample.
            y_train: targets, one row (or entry) per sample.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            numpy.linalg.LinAlgError: if ``X.T X`` is singular.
        """
        X = self._design_matrix(x_train)

        # Normal equations (X.T X) Beta = X.T y, solved as a linear system
        # rather than through an explicit inverse: same result, but cheaper
        # and numerically more accurate.
        self.Beta_hat = np.linalg.solve(
            np.matmul(X.T, X),
            np.matmul(X.T, y_train),
        )

        return self

    def perf(self, x_test, y_test):
        """
        Compute the sum of squared errors on a test set and store it in ``SSE``.

        Args:
            x_test: 2-D array of samples, one row per sample.
            y_test: targets matching ``x_test``.

        Returns:
            ``residual.T @ residual`` for ``residual = y_test - X @ Beta_hat``
            (a 1x1 array when the targets are a column vector).
        """
        X = self._design_matrix(x_test)

        residual = y_test - np.matmul(X, self.Beta_hat)
        self.SSE = np.matmul(residual.T, residual)
        return self.SSE

    def predict(self, x_0):
        """
        Predict the target for one sample or for a batch of samples.

        Args:
            x_0: a single sample (1-D array of features) or a 2-D array with
                one sample per row.

        Returns:
            ``Beta_hat.T @ [1, x_0]`` for a single sample, or
            ``[1 X] @ Beta_hat`` (one row per sample) for a batch.
        """
        x_0 = np.asarray(x_0)
        if x_0.ndim <= 1:
            # Single sample: prepend the constant 1 for the intercept.
            X_0 = np.hstack((np.ones(1), x_0))
            return np.matmul(self.Beta_hat.T, X_0)

        # Batch of samples: same model applied row by row.
        return np.matmul(self._design_matrix(x_0), self.Beta_hat)

    def __call__(self, x_0):
        """Shorthand for ``predict(x_0)``."""
        return self.predict(x_0)

## Cross Validation

In [None]:
# K-fold cross-validation of LinearPredictor, scored by the validation SSE.
n_folds = 10
p = np.zeros(n_folds)

x = x_train.copy()
y = y_train.copy()

# Derive the fold size from the data instead of assuming exactly 100 samples.
fold_size = len(x) // n_folds
if fold_size == 0:
    raise ValueError(
        "need at least {} samples for {}-fold cross-validation".format(n_folds, n_folds)
    )

for n in range(n_folds):
    # Rotate the samples by one fold so that a different slice ends up last.
    x = np.roll(x, fold_size, axis=0)
    y = np.roll(y, fold_size, axis=0)

    # Everything but the last fold is used for fitting ...
    x_train_n = x[:-fold_size]
    y_train_n = y[:-fold_size]

    # ... and the last fold is held out for validation.
    x_validation_n = x[-fold_size:]
    y_validation_n = y[-fold_size:]

    f_n = LinearPredictor()
    f_n.train(x_train_n, y_train_n)
    # perf may return a 1x1 array; extract the scalar explicitly because
    # implicit conversion of size-1 arrays to scalars is deprecated in NumPy.
    p[n] = np.asarray(f_n.perf(x_validation_n, y_validation_n)).item()
    print("P_", n ,p[n])

print("avg(P)", np.average(p))

# Final model: refit on the complete training set and predict one test sample.
f = LinearPredictor()
f.train(x_train, y_train)
print("f.Beta_hat", f.Beta_hat)
y_0 = f.predict(x_test[0])
print("f(x_test[0])", y_0)

## Scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
# Fit scikit-learn's ordinary least-squares model on the full training set.
lin = LinearRegression()
lin.fit(x_train, y_train)

### Linear

In [None]:
# Fit the linear model and report its score (R^2) on the same data it was
# fitted to.
lin = LinearRegression()
lin.fit(x_train, y_train)
s = lin.score(x_train, y_train)
print("Linear", s)

### Ridge

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Sweep the ridge penalty over the integers 0..99 and print the score of each
# fitted model on the training data.
for a in range(100):
    rid = Ridge(alpha=a)
    rid.fit(x_train, y_train)
    s = rid.score(x_train, y_train)
    print("Ridge", a, s)

### Lasso

In [None]:
from sklearn.linear_model import Lasso

# Sweep the lasso penalty over the integers 0..99 and print the score of each
# fitted model on the training data.
for a in range(100):
    if a == 0:
        # alpha=0 is plain least squares. scikit-learn advises against
        # Lasso(alpha=0) for numerical reasons (it also emits warnings), so
        # the unpenalised point is fitted with LinearRegression instead.
        lass = LinearRegression().fit(x_train, y_train)
    else:
        lass = Lasso(alpha=a).fit(x_train, y_train)
    s = lass.score(x_train, y_train)
    print("Lasso", a, s)

# Validation Metrics

## Root mean squared error

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

# print("The data shape of X is {}".format(x_train))
# print("The number of feature in this data is {}".format(x_train.shape[1]))

# Hold out 20% of the training data (fixed seed for reproducibility) to
# evaluate the linear model on samples it was not fitted on.
train_x, test_x, train_y, test_y = train_test_split(x_train,
                                                    y_train,
                                                    test_size=0.2,
                                                    random_state=23)

lr = LinearRegression()
lr.fit(train_x, train_y)

pred_y = lr.predict(test_x)

# Root mean squared error on the held-out split. It used to be computed but
# never shown; nrme keeps the negated value for any later use.
rmse = np.sqrt(metrics.mean_squared_error(test_y, pred_y))
nrme = -1*rmse
print("Linear -->","RMSE:", rmse)

r2_score = metrics.r2_score(test_y, pred_y)
print("Linear -->","R2 score:", r2_score)

# Export
# lr.coef_

## R2 score

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Unpenalised baseline: it does not depend on alpha, so it is cross-validated
# once here instead of being recomputed on every iteration of the sweep.
linear = linear_model.LinearRegression()
lin_scores = cross_val_score(linear, x_train, y_train, cv=10,
                    scoring='r2')

# Compare 10-fold cross-validated R2 of ridge and lasso for ten penalties
# evenly spaced between 0.01 and 0.4, next to the linear baseline.
for a in np.linspace(0.01,0.4,10):
    ridge = linear_model.Ridge(alpha=a)
    lasso = linear_model.Lasso(alpha=a)
    # Alternative: models on normalized (scaled) values
    # linear = make_pipeline(StandardScaler(with_mean=False),linear_model.LinearRegression())
    # ridge = make_pipeline(StandardScaler(with_mean=False), linear_model.Ridge(alpha=a))
    # lasso = make_pipeline(StandardScaler(with_mean=False), linear_model.Lasso(alpha=a))

    ridge_scores = cross_val_score(ridge, x_train, y_train, cv=10,
                        scoring='r2')
    lasso_scores = cross_val_score(lasso, x_train, y_train, cv=10,
                        scoring='r2')
    print("Linear -->      ", "      ","R2 score:", np.average(lin_scores))
    print("Ridge  --> Alfa:", "{:.4f}".format(a),"R2 score:", np.average(ridge_scores))
    print("Lasso  --> Alfa:", "{:.4f}".format(a),"R2 score:", np.average(lasso_scores))
