In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
X_train = pd.read_csv('data/preprocessed/solTrainX_pca.csv').values
X_test = pd.read_csv('data/preprocessed/solTestX_pca.csv').values

y_train = pd.read_csv('data/original/solTrainY.txt', sep='\s+').values.flatten()
y_test  = pd.read_csv('data/original/solTestY.txt', sep='\s+').values.flatten()


Implementando o OLS

In [3]:
def OLS(X_train, y_train):
    Beta = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train

    return Beta

In [4]:
# Para Calcular os valores preditores:
X_train_new = np.c_[np.ones((X_train.shape[0], 1)), X_train]

X_test_new = np.c_[np.ones((X_test.shape[0], 1)), X_test]

B_Chapeu = OLS(X_train_new, y_train)

y_predict = X_test_new @ B_Chapeu

y_predict

array([ 0.43372061,  0.88729311, -0.9003554 ,  0.03437835,  0.07487596,
        1.1945748 ,  0.32212048,  0.48791248, -0.22404799, -1.21068466,
       -0.93576584, -1.85699062, -0.88396554,  0.13526097, -1.10577009,
       -1.90287596, -0.5703451 , -0.65655252,  0.67718718, -1.96975814,
       -0.48438324, -0.47694639, -0.47220582, -1.01272235, -0.23176852,
       -0.8985383 , -0.32932554,  0.46350344, -1.47164352, -1.20644387,
       -2.49391165, -0.61492414, -0.66660167, -0.09640863, -0.21336185,
       -0.30058296, -0.21321166, -0.89792965,  0.14306593, -1.5886409 ,
       -0.13136757, -2.73658481, -1.08590779,  0.2143639 , -2.1615218 ,
       -0.68652681, -0.82841448, -0.52192381, -1.14585564, -0.46467139,
        0.12170532, -1.42247862, -1.22527829, -0.90830638, -1.22003436,
       -1.56973643, -1.71165087, -1.10103523, -2.29968167, -3.04071989,
       -0.68254828, -1.44844867, -2.63219522, -0.96133472, -1.2308566 ,
       -2.51317285, -2.00996549, -1.91666139, -0.95911455, -2.77

Usando a função do scikit-learn que calcula o OLS para comparar com o método feito do zero 

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [6]:
linear_reg_scikitlearn = LinearRegression()
linear_reg_scikitlearn.fit(X_train, y_train)
y_predict_SL = linear_reg_scikitlearn.predict(X_test)

y_predict_SL

array([ 0.43372061,  0.88729311, -0.9003554 ,  0.03437835,  0.07487596,
        1.1945748 ,  0.32212048,  0.48791248, -0.22404799, -1.21068466,
       -0.93576584, -1.85699062, -0.88396554,  0.13526097, -1.10577009,
       -1.90287596, -0.5703451 , -0.65655252,  0.67718718, -1.96975814,
       -0.48438324, -0.47694639, -0.47220582, -1.01272235, -0.23176852,
       -0.8985383 , -0.32932554,  0.46350344, -1.47164352, -1.20644387,
       -2.49391165, -0.61492414, -0.66660167, -0.09640863, -0.21336185,
       -0.30058296, -0.21321166, -0.89792965,  0.14306593, -1.5886409 ,
       -0.13136757, -2.73658481, -1.08590779,  0.2143639 , -2.1615218 ,
       -0.68652681, -0.82841448, -0.52192381, -1.14585564, -0.46467139,
        0.12170532, -1.42247862, -1.22527829, -0.90830638, -1.22003436,
       -1.56973643, -1.71165087, -1.10103523, -2.29968167, -3.04071989,
       -0.68254828, -1.44844867, -2.63219522, -0.96133472, -1.2308566 ,
       -2.51317285, -2.00996549, -1.91666139, -0.95911455, -2.77

Criando o cross validation

In [7]:
def cross_validation(K, X_train, y_train):
    erros_rmse = []
    r2_fold = []
    tam_passo = len(X_train) // K

    for i in range(K):
        # Organizando os índices da fatia
        inicio = i * tam_passo
        fim = (i+1) * tam_passo

        X_test_fold = X_train[inicio:fim]
        y_test_fold = y_train[inicio:fim]

        # Salvando os índices onde o conjunto de testes está, para depois remover os índices do conjunto e criar o conjunto de treinamento
        indices = np.arange(inicio, fim)

        # Criando os conjuntos de treinamento
        X_train_fold = np.delete(X_train, indices, axis=0)
        y_train_fold = np.delete(y_train, indices, axis=0)

        X_train_fold_new = np.c_[np.ones((X_train_fold.shape[0], 1)), X_train_fold]
        betas_fold = OLS(X_train_fold_new, y_train_fold)

        X_test_fold_new = np.c_[np.ones((X_test_fold.shape[0], 1)), X_test_fold]

        y_pred = X_test_fold_new @ betas_fold

        # Calculando o RMSE

        erro = y_test_fold - y_pred
        mse = np.mean(erro ** 2)
        rmse = np.sqrt(mse)

        erros_rmse.append(rmse)

        # Calculando o R^2 (Coeficiente de Determinação)

        sqr = np.sum((y_test_fold - y_pred) ** 2)
        sqt = np.sum((y_test_fold - np.mean(y_test_fold)) ** 2)
        r2 = 1 - (sqr / sqt)

        r2_fold.append(r2)
        
    
    media_RMSE = np.mean(erros_rmse)
    media_Rsquared = np.mean(r2_fold)

    return media_RMSE, media_Rsquared


In [8]:
media_rmse, media_Rsquared = cross_validation(5, X_train, y_train)

print(media_rmse)

print(media_Rsquared)

0.8630967260992216
0.14450144464384987


Cross-Validation com scikit-learn

RMSE

In [9]:
# Cross-Validation com k = 5 (Calculando o RMSE)
cross_validation_RMSE_sk_5 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

media_rmse_sklearn_5 = -1 * cross_validation_RMSE_sk_5.mean() # sklearn devolve o valor negativo

print(media_rmse_sklearn_5)

# Cross-Validation com k = 10 (Calculando o RMSE)
cross_validation_RMSE_sk_10 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')

media_rmse_sklearn_10 = -1 * cross_validation_RMSE_sk_10.mean() # sklearn devolve o valor negativo

print(media_rmse_sklearn_10)


0.8634468602724776
0.8386446503599071


R-SQUARED

In [10]:
# Cross-Validation com k = 5 (Calculando o R-squared)
cross_validation_Rsquared_sk_5 = cross_val_score(linear_reg_scikitlearn, X_train, y_train, cv=5, scoring='r2')

media_r2_sklearn = cross_validation_Rsquared_sk_5.mean()

media_r2_sklearn



0.14459989067752632