In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
X_train = pd.read_csv('data/preprocessed/solTrainXtrans_py.csv').values
X_test = pd.read_csv('data/preprocessed/solTestXtrans_py.csv').values

y_train = pd.read_csv('data/original/solTrainY.txt', sep='\s+').values.flatten()
y_test  = pd.read_csv('data/original/solTestY.txt', sep='\s+').values.flatten()


Implementando o OLS

In [13]:
def OLS(X_train, y_train):
    Beta = np.linalg.pinv(X_train.T @ X_train) @ X_train.T @ y_train

    return Beta

In [14]:
# Para Calcular os valores preditores:
X_train_new = np.c_[np.ones((X_train.shape[0], 1)), X_train]

X_test_new = np.c_[np.ones((X_test.shape[0], 1)), X_test]

B_Chapeu = OLS(X_train_new, y_train)

y_predict = X_test_new @ B_Chapeu

y_predict

array([ 1.13656004e+00,  2.63763210e-02, -3.87234499e-01,  1.02979072e+00,
       -2.16771084e-01,  1.53759578e+00,  7.07452677e-01,  8.99405217e-01,
        2.89154031e-01, -5.28659122e-01, -3.62597414e-01, -9.70555804e-01,
        2.04087676e-02, -2.02037547e-01, -6.79368216e-01, -4.17821307e-01,
       -1.49623629e-01,  3.75887724e-01,  3.87078009e-01, -6.44964236e-01,
        5.68256183e-01,  2.89179819e-01, -7.00929390e-01,  2.40269031e-02,
       -9.88190200e-01,  1.21503324e-01, -7.42279340e-01,  1.26012108e+00,
       -1.98603201e+00, -1.10581023e+00, -2.58709443e+00, -9.67921572e-01,
       -6.76775481e-01, -1.44209682e-01, -8.45623408e-02, -1.27108641e+00,
        2.58724161e-01, -5.87408608e-01, -5.69394172e-03, -6.18591773e-01,
       -8.01166735e-01, -2.17929875e+00, -1.09172890e+00, -1.59081463e-01,
       -1.67209854e+00, -7.99278674e-01, -7.11684861e-01, -8.72236632e-01,
       -1.19216887e+00, -9.56905714e-01,  8.51590146e-01, -1.68812277e+00,
       -1.45826064e+00, -

Usando a função do scikit-learn que calcula o OLS para comparar com o método feito do zero 

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [16]:
linear_reg_scikitlearn = LinearRegression()
linear_reg_scikitlearn.fit(X_train, y_train)
y_predict_SL = linear_reg_scikitlearn.predict(X_test)

y_predict_SL

array([ 1.13656005e+00,  2.63763233e-02, -3.87234498e-01,  1.02979072e+00,
       -2.16771083e-01,  1.53759578e+00,  7.07452677e-01,  8.99405219e-01,
        2.89154032e-01, -5.28659120e-01, -3.62597414e-01, -9.70555805e-01,
        2.04087691e-02, -2.02037546e-01, -6.79368214e-01, -4.17821304e-01,
       -1.49623628e-01,  3.75887725e-01,  3.87078013e-01, -6.44964234e-01,
        5.68256185e-01,  2.89179822e-01, -7.00929389e-01,  2.40269030e-02,
       -9.88190198e-01,  1.21503325e-01, -7.42279339e-01,  1.26012108e+00,
       -1.98603201e+00, -1.10581023e+00, -2.58709442e+00, -9.67921570e-01,
       -6.76775479e-01, -1.44209680e-01, -8.45623386e-02, -1.27108641e+00,
        2.58724160e-01, -5.87408605e-01, -5.69394008e-03, -6.18591772e-01,
       -8.01166732e-01, -2.17929875e+00, -1.09172890e+00, -1.59081459e-01,
       -1.67209854e+00, -7.99278671e-01, -7.11684860e-01, -8.72236630e-01,
       -1.19216887e+00, -9.56905713e-01,  8.51590148e-01, -1.68812277e+00,
       -1.45826064e+00, -

Criando o cross validation

In [17]:
def cross_validation(K, X_train, y_train):
    erros_rmse = []
    r2_fold = []
    tam_passo = len(X_train) // K

    for i in range(K):
        # Organizando os índices da fatia
        inicio = i * tam_passo
        fim = (i+1) * tam_passo

        X_test_fold = X_train[inicio:fim]
        y_test_fold = y_train[inicio:fim]

        # Salvando os índices onde o conjunto de testes está, para depois remover os índices do conjunto e criar o conjunto de treinamento
        indices = np.arange(inicio, fim)

        # Criando os conjuntos de treinamento
        X_train_fold = np.delete(X_train, indices, axis=0)
        y_train_fold = np.delete(y_train, indices, axis=0)

        X_train_fold_new = np.c_[np.ones((X_train_fold.shape[0], 1)), X_train_fold]
        betas_fold = OLS(X_train_fold_new, y_train_fold)

        X_test_fold_new = np.c_[np.ones((X_test_fold.shape[0], 1)), X_test_fold]

        y_pred = X_test_fold_new @ betas_fold

        # Calculando o RMSE

        erro = y_test_fold - y_pred
        mse = np.mean(erro ** 2)
        rmse = np.sqrt(mse)

        erros_rmse.append(rmse)

        # Calculando o R^2 (Coeficiente de Determinação)

        sqr = np.sum((y_test_fold - y_pred) ** 2)
        sqt = np.sum((y_test_fold - np.mean(y_test_fold)) ** 2)
        r2 = 1 - (sqr / sqt)

        r2_fold.append(r2)
        
    
    media_RMSE = np.mean(erros_rmse)
    media_Rsquared = np.mean(r2_fold)

    return media_RMSE, media_Rsquared


In [18]:
media_rmse, media_Rsquared = cross_validation(5, X_train, y_train)

print(media_rmse)

print(media_Rsquared)

0.8251970936348603
0.25611450215590076


Cross-Validation com scikit-learn

RMSE

In [19]:
# Cross-Validation com k = 5 (Calculando o RMSE)
cross_validation_RMSE_sk_5 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

media_rmse_sklearn_5 = -1 * cross_validation_RMSE_sk_5.mean() # sklearn devolve o valor negativo

print(media_rmse_sklearn_5)

# Cross-Validation com k = 10 (Calculando o RMSE)
cross_validation_RMSE_sk_10 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')

media_rmse_sklearn_10 = -1 * cross_validation_RMSE_sk_10.mean() # sklearn devolve o valor negativo

print(media_rmse_sklearn_10)


0.825707982087688
0.7618310852432757


R-SQUARED

In [20]:
# Cross-Validation com k = 5 (Calculando o R-squared)
cross_validation_Rsquared_sk_5 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=5, scoring='r2')

media_r2_sklearn = cross_validation_Rsquared_sk_5.mean()

media_r2_sklearn



0.25641884167318574