### Caracterização e visualização de dados

In [26]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder
from matplotlib import pyplot as plt
from category_encoders.one_hot import OrdinalEncoder
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [27]:
newDataframe = pd.read_csv("test_categorical_variables.csv")

#y = newDataframe['SalePrice']
y = pd.DataFrame(newDataframe['SalePrice'])
X = newDataframe.drop(['SalePrice'], axis=1)

y[["SalePrice"]] = y[["SalePrice"]].apply(np.log)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Validação cruzada

In [28]:
def avalia_classificador(clf, kf, X, y, f_metrica):
    metrica_valid = []
    metrica_train = []
    r2_valid = []
    r2_train = []
    
    y_preds = np.zeros(X.shape[0])
    
    for train, valid in kf.split(X,y):
        x_train = X.iloc[train] 
        y_train = y.iloc[train]
        x_valid = X.iloc[valid] 
        y_valid = y.iloc[valid]
        clf.fit(x_train, y_train) 
        y_pred_train = clf.predict(x_train)
        y_pred_valid = clf.predict(x_valid)
        y_preds[valid] = y_pred_valid[0] 
        
        metrica_valid.append(f_metrica(y_valid, y_pred_valid)) 
        metrica_train.append(f_metrica(y_train, y_pred_train))
        r2_valid.append(r2_score(y_valid, y_pred_valid))
        r2_train.append(r2_score(y_train, y_pred_train))
          
    return y_preds, np.array(metrica_valid).mean(), np.array(metrica_train).mean(), np.array(r2_valid).mean(), np.array(r2_train).mean()

### Regressão Linear

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import KFold

regressor_linear = LinearRegression(fit_intercept = True)

regressor_linear = regressor_linear.fit(X_train, y_train)

y_resposta_treino = regressor_linear.predict(X_train)
y_resposta_teste  = regressor_linear.predict(X_test)

print(' ')
print(' REGRESSOR LINEAR:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(regressor_linear, kf, X, y, f_rmse)


print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

 
 REGRESSOR LINEAR:
 
 Métrica         test             treino    
 -------  -----------------  ---------------
    rmse             0.0082           0.0109
      r2             0.9376           0.8980
 
 VALIDAÇÃO CRUZADA:
 
 Métrica         test             treino    
 -------  -----------------  ---------------
    rmse             0.0083           0.0123
      r2             0.9371           0.8500


### Regressão Logística

In [30]:
#Não se aplica

### Regressão Bayesiana

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.linear_model import BayesianRidge, LinearRegression


bayesian_ridge = BayesianRidge(compute_score=True)

bayesian_ridge = bayesian_ridge.fit(X_train, y_train.values.ravel())

y_resposta_treino = bayesian_ridge.predict(X_train)
y_resposta_teste  = bayesian_ridge.predict(X_test)

print(' ')
print(' REGRESSÃO BAYESIANA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)
#MAPE_in = np.mean(np.abs((y_train - y_resposta_treino) / y_train)) * 100
RMSLE_in = mean_squared_log_error(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)
#MAPE_out = np.mean(np.abs((y_test - y_resposta_teste) / y_test)) * 100
RMSLE_out = mean_squared_log_error(y_test,y_resposta_teste)

mape_test = MAPE_score(y_resposta_teste, y_test)
mape_train = MAPE_score(y_resposta_treino, y_train)

print(' %7s  %17.4f  %15.4f' % (  'mse' ,  mse_in ,  mse_out ) )
print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )
print(' %7s  %17.4f  %15.4f' % (   'RMSLE' ,   RMSLE_in ,   RMSLE_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(bayesian_ridge, kf, X, y, f_rmse)

print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

 
 REGRESSÃO BAYESIANA:
 
 Métrica         test             treino    
 -------  -----------------  ---------------


NameError: name 'check_arrays' is not defined

### Árvores de decisão

In [32]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
#from sklearn.metrics import mean_squared_error, r2_score 
from sklearn import metrics
#import math


# Create Decision Tree classifer object
clf = DecisionTreeRegressor()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_resposta_treino = clf.predict(X_train)
y_resposta_teste = clf.predict(X_test)

print(' ')
print(' ARVORE DE DECISÃO:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(clf, kf, X, y, f_rmse)

print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

 
 ARVORE DE DECISÃO:
 
 Métrica         test             treino    
 -------  -----------------  ---------------
     mse             0.0000           0.0003
    rmse             0.0000           0.0168
      r2             1.0000           0.7593
   RMSLE             0.0000           0.0000
 
 VALIDAÇÃO CRUZADA:
 
 Métrica         test             treino    
 -------  -----------------  ---------------
    rmse             0.0000           0.0170
      r2             1.0000           0.7275


### Random Forest

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
#import math

# Create Decision Tree classifer object
clf = RandomForestRegressor()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train.values.ravel())

#Predict the response for test dataset
y_resposta_treino = clf.predict(X_train)
y_resposta_teste = clf.predict(X_test)

print(' ')
print(' RANDOM FOREST:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(clf, kf, X, y, f_rmse)

print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

 
 RANDOM FOREST:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
     mse             0.0000           0.0001
    rmse             0.0048           0.0118
      r2             0.9788           0.8811


  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)
  clf.fit(x_train, y_train)


 
 VALIDAÇÃO CRUZADA:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
    rmse             0.0045           0.0119
      r2             0.9817           0.8677


### Gradient Boosting

In [30]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
#import math

# Create Decision Tree classifer object
clf = GradientBoostingRegressor(random_state=0)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train.values.ravel())

#Predict the response for test dataset
y_resposta_treino = clf.predict(X_train)
y_resposta_teste = clf.predict(X_test)

print(' ')
print(' GRADIENT BOOSTING:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(clf, kf, X, y, f_rmse)

print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

 
 GRADIENT BOOSTING:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
     mse             0.0000           0.0001
    rmse             0.0061           0.0107
      r2             0.9652           0.9020


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


 
 VALIDAÇÃO CRUZADA:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
    rmse             0.0067           0.0107
      r2             0.9598           0.8917


### Support Vector Machines

In [100]:
from sklearn.svm import SVR
from sklearn import metrics
#import math

# Fit regression model
svr_rbf = SVR(kernel="rbf")
svr_lin = SVR(kernel="linear")
svr_poly = SVR(kernel="poly")

# Create Decision Tree classifer object
#clf = GradientBoostingRegressor(random_state=0)

# Train Decision Tree Classifer
svr_rbf = svr_rbf.fit(X_train,y_train.values.ravel())

#Predict the response for test dataset
y_resposta_treino = svr_rbf.predict(X_train)
y_resposta_teste = svr_rbf.predict(X_test)

print(' ')
print(' SUPPORT VECTOR MACHINES RBF KERNEL:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

kf = KFold(n_splits=9, shuffle=True, random_state=7)

preds, rmse_val, rmse_train, r2test, r2train = avalia_classificador(svr_rbf, kf, X, y, f_rmse)

print(' ')
print(' VALIDAÇÃO CRUZADA:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_train , rmse_val ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2train ,   r2test ) )

#******************************************************************

# Train Decision Tree Classifer
svr_lin = svr_lin.fit(X_train,y_train.values.ravel())

#Predict the response for test dataset
y_resposta_treino = svr_lin.predict(X_train)
y_resposta_teste = svr_lin.predict(X_test)

print(' ')
print(' SUPPORT VECTOR MACHINES LINEAR KERNEL:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

#***************************************************************

# Train Decision Tree Classifer
svr_poly = svr_poly.fit(X_train,y_train.values.ravel())

#Predict the response for test dataset
y_resposta_treino = svr_poly.predict(X_train)
y_resposta_teste = svr_poly.predict(X_test)

print(' ')
print(' SUPPORT VECTOR MACHINES POLYNOMIAL KERNEL:')
print(' ')

print(' Métrica         test             treino    ')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


 
 SUPPORT VECTOR MACHINES RBF KERNEL:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
     mse             0.0011           0.0012
    rmse             0.0334           0.0339
      r2            -0.0398           0.0189
 
 VALIDAÇÃO CRUZADA:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
    rmse             0.0348           0.0347
      r2            -0.0997          -0.1159
 
 SUPPORT VECTOR MACHINES LINEAR KERNEL:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
     mse             0.0011           0.0012
    rmse             0.0335           0.0340
      r2            -0.0455           0.0132
 
 SUPPORT VECTOR MACHINES POLYNOMIAL KERNEL:
 
 Métrica  DENTRO da amostra  FORA da amostra
 -------  -----------------  ---------------
     mse             0.0014           0.0015
    rmse             0.0380           0.0381
      r2            -0.3448      

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [96]:
rbf = SVC(kernel='rbf', C=1.0)

rbf.fit(X_train, y_train)

y_pred_rbf1 = rbf.predict(X_test)

print('Acurácia de SVM RBF e C=1.0: {0:0.4f}'.format(accuracy_score(y_test, y_pred_rbf1)))

NameError: name 'SVC' is not defined

### Redes Neurais

In [38]:


from sklearn.model_selection import cross_validate, KFold

def validacao_cruzada(modelo):
  
  kf = KFold(n_splits=10)
  scores = cross_validate(modelo, X_train, y_train, cv=kf, scoring='r2', return_train_score=True)
  return scores


from sklearn.neural_network import MLPRegressor
#regr = MLPRegressor(random_state=64, max_iter=50000).fit(X_train, y_train.values.ravel())

regr = MLPRegressor(random_state=1234, solver='adam', activation='relu')

score1 = validacao_cruzada(regr)
#Predict the response for test dataset
y_resposta_treino = regr.predict(X_train)
y_resposta_teste = regr.predict(X_test)

print(' ')
print(' NEURAL NETWORK:')
print(' ')

print(' Métrica  DENTRO da amostra  FORA da amostra')
print(' -------  -----------------  ---------------')

mse_in  = mean_squared_error(y_train,y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in   = r2_score(y_train,y_resposta_treino)

mse_out  = mean_squared_error(y_test,y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out   = r2_score(y_test,y_resposta_teste)

print(' %7s  %17.4f  %15.4f' % (  'mse' ,  mse_in ,  mse_out ) )
print(' %7s  %17.4f  %15.4f' % ( 'rmse' , rmse_in , rmse_out ) )
print(' %7s  %17.4f  %15.4f' % (   'r2' ,   r2_in ,   r2_out ) )

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


NotFittedError: This MLPRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [26]:
# Redes Neurais