# Regressão Linear
* Inclui código para implementar LASSO, RIDGE e ELASTICNET

In [7]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut, cross_val_predict, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn import datasets
import math

In [8]:
# Load the Boston housing data into a DataFrame whose columns are the feature
# names plus a 'PRICE' column holding the target.
# NOTE(review): `datasets.load_boston` is deprecated since scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned scikit-learn version before running.
boston = datasets.load_boston()

dados = pd.DataFrame(boston.data, columns=boston.feature_names)
dados['PRICE'] = boston.target

# Target series and feature matrix (everything except the target column).
Y = dados['PRICE']
X = dados.drop(columns='PRICE')

X.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
# Reading the data
folderName = '../data/' 
fileName   = 'yacht_hydrodynamics.data'

#Attribute Information:

# Variations concern hull geometry coefficients and the Froude number: 

# 1. Longitudinal position of the center of buoyancy, adimensional. 
# 2. Prismatic coefficient, adimensional. 
# 3. Length-displacement ratio, adimensional. 
# 4. Beam-draught ratio, adimensional. 
# 5. Length-beam ratio, adimensional. 
# 6. Froude number, adimensional. 

# The measured variable is the residuary resistance per unit weight of displacement: 

# 7. Residuary resistance per unit weight of displacement, adimensional. 

# The file has no header row, so the seven columns are named V_0 .. V_6
# (presumably in the attribute order listed above -- verify against the file).
header = ['V_{}'.format(i) for i in range(7)]

# The separator is a regular expression (one or more whitespace characters).
# It is written as a raw string because '\s' inside a plain string literal is
# an invalid escape sequence, which Python reports with a warning.
df = pd.read_csv(folderName+fileName,sep=r'\s+',names=header)

# Printing some info about the data
print("[INFO]\nNrow: {}\nNcol: {}".format(df.shape[0],df.shape[1]))
df.head(3)

[INFO]
Nrow: 308
Ncol: 7


Unnamed: 0,V_0,V_1,V_2,V_3,V_4,V_5,V_6
0,-2.3,0.568,4.78,3.99,3.17,0.125,0.11
1,-2.3,0.568,4.78,3.99,3.17,0.15,0.27
2,-2.3,0.568,4.78,3.99,3.17,0.175,0.47


In [10]:
# Split the frame into target and features: the last column of `df` is the
# target (as a NumPy array), all remaining columns are the features.
cols = df.columns

Y = df[cols[-1]].values
X = df.drop(columns=cols[-1])

In [11]:
# Ordinary least squares fitted and scored on the same data (X, Y), so the
# metrics printed below are resubstitution (training) errors.
#
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# Its default was False, which is the value this cell used to pass.
LM = linear_model.LinearRegression(fit_intercept=True, 
                                   copy_X=True, 
                                   n_jobs=1)


LM.fit(X,Y)
y_pred = LM.predict(X)

MSE = mean_squared_error(Y, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

#print("Coeficientes:",LM.coef_)
#print("Intercepto: {:.2f}.".format(LM.intercept_))
print("MSE: {:.4f}.".format(MSE))
print("RMSE: {:.4f}.".format(RMSE))
print("MAE: {:.4f}.".format(MAE))
print("R2: {:.4f}.".format(score))

MSE: 78.4501.
RMSE: 8.8572.
MAE: 7.1603.
R2: 0.6576.


In [12]:
# Holdout evaluation: 70% of the rows train the current estimator `LM`, the
# remaining 30% are used for testing (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

LM.fit(X_train, y_train)

pred_train, pred_test = LM.predict(X_train), LM.predict(X_test)

# Training-set metrics.
mse_treino = mean_squared_error(y_train, pred_train)
rmse_treino = math.sqrt(mse_treino)
mae_treino = mean_absolute_error(y_train, pred_train)
r2_treino = r2_score(y_train, pred_train)

# Test-set metrics.
mse_teste = mean_squared_error(y_test, pred_test)
rmse_teste = math.sqrt(mse_teste)
mae_teste = mean_absolute_error(y_test, pred_test)
r2_teste = r2_score(y_test, pred_test)

In [13]:
# Print the metrics, one per line, train before test for each metric.
# Each value goes through np.array(...).mean(), which leaves a scalar
# unchanged and averages a list of per-fold values.
holdout_report = (
    ('MSE treino: %.4f', mse_treino),
    ('MSE teste: %.4f', mse_teste),
    ('RMSE treino: %.4f', rmse_treino),
    ('RMSE teste: %.4f', rmse_teste),
    ('MAE treino: %.4f', mae_treino),
    ('MAE teste: %.4f', mae_teste),
    ('R2 treino: %.4f', r2_treino),
    ('R2 teste: %.4f', r2_teste),
)

for template, value in holdout_report:
    print(template % np.array(value).mean())

MSE treino: 79.5220
MSE teste: 77.7556
RMSE treino: 8.9175
RMSE teste: 8.8179
MAE treino: 7.3619
MAE teste: 7.2731
R2 treino: 0.6774
R2 teste: 0.5716


In [19]:
# Cross-Validation or Leave-One-Out
# Evaluates the current estimator `LM` fold by fold and collects train/test
# metrics per fold.
# To use leave-one-out, replace the splitter with LeaveOneOut(). The pooled
# test RMSE divides by the number of folds actually run, so `k` does not have
# to be kept in sync with the splitter by hand.
 
k = 10
cv = KFold(k)

#cv = LeaveOneOut()

mse_treino = []
mse_teste = []
rmse_treino = []
rmse_teste = []
mae_treino = []
mae_teste = []
r2_treino = []
r2_teste = []
mse_sum = 0

for train_index, test_index in cv.split(X):
    # X is indexed positionally (DataFrame); Y is indexed directly with the
    # same integer positions.
    X_fold_train = X.iloc[train_index]
    X_fold_test = X.iloc[test_index]
    y_train = Y[train_index]
    y_test = Y[test_index]

    # fit() returns the estimator itself, so `model` is `LM` refitted on
    # this fold's training rows.
    model = LM.fit(X_fold_train, y_train)

    pred_train = model.predict(X_fold_train)
    pred_test = model.predict(X_fold_test)

    # Compute each fold MSE once and reuse it for the RMSE values.
    mse_train = mean_squared_error(y_train, pred_train)
    mse = mean_squared_error(y_test, pred_test)

    mse_treino.append(mse_train)
    mse_teste.append(mse)

    rmse_treino.append(math.sqrt(mse_train))
    # Per-fold test RMSE, kept alongside the pooled value computed below.
    rmse_teste.append(math.sqrt(mse))
    mse_sum += mse

    mae_treino.append(mean_absolute_error(y_train, pred_train))
    mae_teste.append(mean_absolute_error(y_test, pred_test))

    r2_treino.append(r2_score(y_train, pred_train))
    r2_teste.append(r2_score(y_test, pred_test))


# Pooled test RMSE: square root of the mean of the per-fold test MSEs.
rmse_teste2 = math.sqrt(mse_sum / len(mse_teste))

In [20]:
# Print the cross-validation metrics averaged over the folds.
# The test RMSE is the pooled value `rmse_teste2` (square root of the mean
# fold MSE), not the mean of the per-fold RMSE values.
cv_report = (
    ('MSE treino: %.4f', np.mean(mse_treino)),
    ('MSE teste: %.4f', np.mean(mse_teste)),
    ('RMSE treino: %.4f', np.mean(rmse_treino)),
    ('RMSE teste: %.4f', rmse_teste2),
    ('MAE treino: %.4f', np.mean(mae_treino)),
    ('MAE teste: %.4f', np.mean(mae_teste)),
    ('R2 treino: %.4f', np.mean(r2_treino)),
    ('R2 teste: %.4f', np.mean(r2_teste)),
)

for template, value in cv_report:
    print(template % value)

MSE treino: 277.6223
MSE teste: 278.4681
RMSE treino: 16.6619
RMSE teste: 16.6874
MAE treino: 9.2071
MAE teste: 9.2545
R2 treino: -0.2118
R2 teste: 0.0000


In [8]:
# LASSO
# L1-regularised linear regression, fitted and scored on the same data
# (X, Y), so the metrics printed below are resubstitution (training) errors.
#
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# Its default was False, which is the value this cell used to pass.

LM = linear_model.Lasso(alpha=1.2, 
                        fit_intercept=True, 
                        precompute=False, 
                        copy_X=True, 
                        max_iter=1000, 
                        tol=0.0001, 
                        warm_start=False, 
                        positive=False, 
                        random_state=42, 
                        selection='cyclic')


LM.fit(X,Y)
y_pred = LM.predict(X)

MSE = mean_squared_error(Y, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

#print("Coeficientes:",LM.coef_)
#print("Intercepto: {:.2f}.".format(LM.intercept_))
print("MSE: {:.4f}.".format(MSE))
print("RMSE: {:.4f}.".format(RMSE))
print("MAE: {:.4f}.".format(MAE))
print("R2: {:.4f}.".format(score))

MSE: 28.3518.
RMSE: 5.3246.
MAE: 3.7217.
R2: 0.6642.


In [9]:
# RIDGE
# L2-regularised linear regression, fitted and scored on the same data
# (X, Y), so the metrics printed below are resubstitution (training) errors.
#
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# Its default was False, which is the value this cell used to pass.

LM = linear_model.Ridge(alpha=1.2, 
                        fit_intercept=True, 
                        copy_X=True,
                        max_iter=None, 
                        tol=0.001, 
                        solver='auto', 
                        random_state=42)


LM.fit(X,Y)
y_pred = LM.predict(X)

MSE = mean_squared_error(Y, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

#print("Coeficientes:",LM.coef_)
#print("Intercepto: {:.2f}.".format(LM.intercept_))
print("MSE: {:.4f}.".format(MSE))
print("RMSE: {:.4f}.".format(RMSE))
print("MAE: {:.4f}.".format(MAE))
print("R2: {:.4f}.".format(score))

MSE: 22.0837.
RMSE: 4.6993.
MAE: 3.2685.
R2: 0.7384.


In [10]:
# ELASTIC-NET
# Linear regression with a combined L1/L2 penalty (l1_ratio=0.5), fitted and
# scored on the same data (X, Y), so the metrics printed below are
# resubstitution (training) errors.
#
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# Its default was False, which is the value this cell used to pass.

LM = ElasticNet(alpha=1.0, 
                l1_ratio=0.5, 
                fit_intercept=True, 
                precompute=False, 
                max_iter=1000, 
                copy_X=True, 
                tol=0.0001, 
                warm_start=False, 
                positive=False, 
                random_state=42, 
                selection='cyclic')


LM.fit(X,Y)
y_pred = LM.predict(X)

MSE = mean_squared_error(Y, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

#print("Coeficientes:",LM.coef_)
#print("Intercepto: {:.2f}.".format(LM.intercept_))
print("MSE: {:.4f}.".format(MSE))
print("RMSE: {:.4f}.".format(RMSE))
print("MAE: {:.4f}.".format(MAE))
print("R2: {:.4f}.".format(score))

MSE: 26.4998.
RMSE: 5.1478.
MAE: 3.5929.
R2: 0.6861.
