<a href="https://colab.research.google.com/github/eutiagovski/projetos-cursos/blob/main/datascience-mentorama/11_implementando_Modelos_exercicio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_friedman1, make_classification
from sklearn.base import BaseEstimator

In [63]:
# Funções dos exercícios

def getData():
  X, y = make_friedman1(n_samples=10000, n_features=5, noise=0.5, random_state=0)
  return X, y

def getData2():
  X, y = make_classification(n_classes=2, n_features=5, n_samples=10000, random_state=0)
  return X, y

# Classe regressão linear criada em aula

class regLinear(BaseEstimator):
  def __init__(self, learning_rate, num_steps):
    self.learning_rate = learning_rate
    self.num_steps = num_steps

  def fit(self, X, y):
    y = y.reshape(-1, 1)
    m = X.shape[0]
    k = X.shape[1]
    theta = np.random.randn(k + 1, 1)
    X_b = np.c_[np.ones((m, 1)), X]

    for step in range(self.num_steps):
      gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
      theta = theta - self.learning_rate * gradients

    self.final_theta = theta
    print('Model trained')

  def predict(self, X):
    m = X.shape[0]
    X_b = np.c_[np.ones((m, 1)), X]
    preds = X_b.dot(self.final_theta)
    return preds.reshape(-1,)

## Exercício 1: Regressão Linear:

###Parte 1

1- Usando a função getData(), carregue os dados disponibilizados.

2- Separe parte dos dados para o dataset de teste.

3- Usando a metodologia de validação cruzada, teste diferentes parâmetros da regLinear - diferentes learning_rates e num_steps - para escolher a melhor combinação de parâmetros.

4- Implemente a regressão linear do scikit-learn e compare os resultados obtidos.


In [None]:
# Carregando os dados do exercicio

X, y = getData()
X.shape, y.shape

((10000, 5), (10000,))

In [None]:
# Seprando os dados em treino e teste

from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [None]:
Xtrain.shape, Xtest.shape

((7500, 5), (2500, 5))

In [None]:
## teste de caso:

lin_reg = regLinear(num_steps=1, learning_rate=0.25)
lin_reg.fit(Xtrain, ytrain)

ypred = lin_reg.predict(Xtest)
ypred, ytest

Model trained


(array([18.00688701, 18.85258444, 13.10094613, ..., 19.95638332,
        17.21678979, 12.38755867]),
 array([15.45130146, 17.53678595, 11.80968997, ..., 21.49622573,
        23.10941602,  6.7219856 ]))

In [None]:
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(ytest, ypred)
lin_mse

16.197130150685894

In [None]:
steps = [1, 3, 5, 10, 100, 200]
rates = [0.0025, 0.025, 0.5, 0.75, 1, 1.25]

for rate in rates:
 for step in steps:
    lin_reg = regLinear(num_steps=step, learning_rate=rate)
    lin_reg.fit(Xtrain, ytrain)

    ypred = lin_reg.predict(Xtrain)
    lin_mse = mean_squared_error(ytrain, ypred)

    print(f'Step: {step}')
    print(f'Learning Rate: {rate}')
    print(f'Mse: {mean_squared_error(ytrain, ypred)}')
    print()        

Model trained
Step: 1
Learning Rate: 0.0025
Mse: 309.4838759888438

Model trained
Step: 3
Learning Rate: 0.0025
Mse: 205.49146380994694

Model trained
Step: 5
Learning Rate: 0.0025
Mse: 245.8049687641348

Model trained
Step: 10
Learning Rate: 0.0025
Mse: 165.6926847456548

Model trained
Step: 100
Learning Rate: 0.0025
Mse: 29.898416271347134

Model trained
Step: 200
Learning Rate: 0.0025
Mse: 12.732022328216072

Model trained
Step: 1
Learning Rate: 0.025
Mse: 141.47931055576552

Model trained
Step: 3
Learning Rate: 0.025
Mse: 96.20595140201569

Model trained
Step: 5
Learning Rate: 0.025
Mse: 79.44128557388879

Model trained
Step: 10
Learning Rate: 0.025
Mse: 32.64070114057634

Model trained
Step: 100
Learning Rate: 0.025
Mse: 10.162098142399824

Model trained
Step: 200
Learning Rate: 0.025
Mse: 8.307114916190125

Model trained
Step: 1
Learning Rate: 0.5
Mse: 367.48462134149895

Model trained
Step: 3
Learning Rate: 0.5
Mse: 1293.9941828909894

Model trained
Step: 5
Learning Rate: 0.5
Ms

In [None]:
## Melhores parâmetros para o modelo

lin_reg = regLinear(num_steps=200, learning_rate=0.75)

lin_reg.fit(Xtrain, ytrain)
ypred = lin_reg.predict(Xtrain)

print(f'MSE: {mean_squared_error(ytrain, ypred)}')


Model trained
MSE: 1.2916552867579406e+157


In [None]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
  scores = np.sqrt(-scores)
  print('\nCross Val Scores: \n')
  print(f'Mean: {scores.mean()}')
  print(f'Std Deriv: {scores.std()}')
  print(f'Scores: {scores}')


lin_reg_score = cross_val_score(lin_reg, Xtrain, ytrain, scoring='neg_mean_squared_error', cv=10)
lin_reg_score = (display_scores(lin_reg_score))
lin_reg_score

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained

Cross Val Scores: 

Mean: 3.280193110058267e+78
Std Deriv: 1.0810859544322886e+78
Scores: [3.84986333e+78 3.02129004e+78 3.06390867e+78 5.60377658e+78
 2.91782018e+78 3.93272849e+78 3.75498654e+78 1.56090776e+78
 1.82709515e+78 3.26955435e+78]


In [None]:
# Comparando com o modelo do sklearn

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(Xtrain, ytrain)

ypred = lin_reg.predict(Xtrain)

print(f'MSE: {mean_squared_error(ytrain, ypred)}')


MSE: 6.029690122877917


In [None]:
lin_reg_score_sk = cross_val_score(lin_reg, Xtrain, ytrain, scoring='neg_mean_squared_error', cv=10)
lin_reg_score_sk = (display_scores(lin_reg_score_sk))
lin_reg_score_sk


Cross Val Scores: 

Mean: 2.456534270989073
Std Deriv: 0.06368699377445314
Scores: [2.47657636 2.37781191 2.55254157 2.53673673 2.37051872 2.43215304
 2.46349076 2.37140731 2.5078694  2.47623693]


In [None]:
# Validando no conjunto de testes:

lin_reg = regLinear(num_steps=200, learning_rate=0.75)

lin_reg.fit(Xtest, ytest)
ypred = lin_reg.predict(Xtest)

print(f'MSE: {mean_squared_error(ytest, ypred)}')


Model trained
MSE: 3.2433479842039236e+155


In [None]:
lin_reg_score_ts = cross_val_score(lin_reg, Xtrain, ytrain, scoring='neg_mean_squared_error', cv=10)
lin_reg_score_ts = (display_scores(lin_reg_score_ts))
lin_reg_score_ts

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained

Cross Val Scores: 

Mean: 3.367501170044973e+78
Std Deriv: 1.2458606442298073e+78
Scores: [3.52099453e+78 4.08938651e+78 3.16751243e+78 6.24124646e+78
 3.09738265e+78 3.82007450e+78 3.03484790e+78 1.57866326e+78
 1.61534926e+78 3.50955422e+78]


###Parte 2
Introdução__

Para cada variável explicativa $X_1, .., X_5$, crie outras variáveis usando o __quadrado__ de cada um delas. Desta forma, o conjunto final será de 10 variáveis, em que:

$X_6 = (X_1)^{2}$, $X_7 = (X_2)^{2}$, $X_8 = (X_3)^{2}$, $X_9 = (X_4)^{2}$, $X_{10} = (X_5)^{2}$.

Ao treinarmos uma regressão linear com essas 10 variáveis, a predição é da forma:

$y_{pred} = \theta_0 + \theta_1 \cdot X_1 + .. + \theta_5 \cdot X_5 + \theta_6 \cdot (X_1)^{2} + .. + \theta_{10} \cdot (X_5)^{2}$

Como estamos usando o quadrado das variáveis explicativas, dizemos que temos um __modelo de regressão polinomial de grau 2__. Podemos ter variações deste modelo:

-Podemos aumentar o grau: basta mudar a potência que elevamos as variáveis. Por exemplo, podemos incluir o __cubo__ das variáveis e termos um modelo polinomial de ordem 3.

-Podemos ter __interações__ entre as variáveis: multiplicações entre as variáveis.

Exemplo:

$y_{pred} = \theta_0 + \theta_1 \cdot X_1 + .. + \theta_5 \cdot X_5 + \theta_6 \cdot (X_1)^{2} + .. + \theta_{10} \cdot (X_5)^{2} + \theta_{11} \cdot (X_1)^{3} + \theta_{12} \cdot V1 + \theta_{13} \cdot V2$,

onde

$V_1 = X_1 \cdot X_2$ e $V_2 = (X_2)^{2} \cdot X_4$

Exercício__

1- Estude o link:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

em que é discutido como criar modelos polinomiais com o scikit-learn de forma detalhada.

2- Repita os passos da primeira parte, mas agora considerando polinômios de graus 2 ou mais.

3- Inclua regularização Ridge e Lasso nas análises e teste os resultados para diferentes parâmetros $\alpha$.

<br>

In [64]:
# realizando o quadrado das variáveis X

from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(Xtrain)
X_poly.shape


(7500, 20)

In [65]:
X_poly[:10]

array([[1.31297196e-01, 4.85914587e-01, 2.85904707e-01, 8.73865682e-01,
        4.58547122e-02, 1.72389537e-02, 6.37992229e-02, 3.75384865e-02,
        1.14736114e-01, 6.02059515e-03, 2.36112986e-01, 1.38925268e-01,
        4.24624082e-01, 2.22814736e-02, 8.17415017e-02, 2.49842312e-01,
        1.31100781e-02, 7.63641230e-01, 4.00708594e-02, 2.10265463e-03],
       [9.61526016e-01, 1.13272056e-01, 7.64526683e-01, 4.05853991e-01,
        3.78209558e-02, 9.24532279e-01, 1.08914029e-01, 7.35112295e-01,
        3.90239171e-01, 3.63658330e-02, 1.28305586e-02, 8.65995091e-02,
        4.59719160e-02, 4.28405742e-03, 5.84501049e-01, 3.10286206e-01,
        2.89151299e-02, 1.64717462e-01, 1.53497859e-02, 1.43042470e-03],
       [6.59939163e-01, 2.37834967e-01, 7.90825285e-01, 4.10507602e-01,
        6.90591370e-01, 4.35519699e-01, 1.56956609e-01, 5.21896577e-01,
        2.70910043e-01, 4.55748291e-01, 5.65654716e-02, 1.88085906e-01,
        9.76330620e-02, 1.64246776e-01, 6.25404632e-01, 3.2463

In [66]:
# testando no modelo de rgressão do sklearn

poly_fit = LinearRegression()
poly_fit.fit(X_poly, ytrain)

y_new = poly_fit.predict(X_poly)

print(f'MSE: {mean_squared_error(ytrain, y_new)}')


MSE: 2.002282162883313


In [67]:
poly_reg_score = cross_val_score(poly_fit, X_poly, ytrain, scoring='neg_mean_squared_error', cv=10)
poly_reg_score = (display_scores(poly_reg_score))
poly_reg_score


Cross Val Scores: 

Mean: 1.4193233715978346
Std Deriv: 0.03591524474879628
Scores: [1.36414742 1.3896057  1.4553139  1.44598499 1.45234123 1.36845277
 1.38782802 1.43485399 1.46289158 1.43181413]


In [68]:
# testando no modelo consolidado de regressão linear

poly_fit = regLinear(num_steps=200, learning_rate=0.75)
poly_fit.fit(X_poly, ytrain)

y_new = poly_fit.predict(X_poly)

print(f'MSE: {mean_squared_error(ytrain, y_new)}')


Model trained
MSE: 1.0733365390934909e+262


In [69]:
poly_reg_score = cross_val_score(poly_fit, X_poly, ytrain, scoring='neg_mean_squared_error', cv=10)
poly_reg_score = (display_scores(poly_reg_score))
poly_reg_score

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained

Cross Val Scores: 

Mean: 1.1131105396972681e+131
Std Deriv: 5.213320645481129e+130
Scores: [1.32539853e+131 1.49814882e+131 7.98534513e+130 2.32035136e+131
 9.75998888e+130 1.07622446e+131 1.04120657e+131 4.32419889e+130
 4.30922183e+130 1.23190018e+131]


In [73]:
## procurando o melhor parâmetro

steps = [1, 3, 5, 10, 100, 200]
rates = [0.0025, 0.025, 0.5, 0.75, 1, 1.25]

for rate in rates:
  for step in steps:
      lin_reg = regLinear(num_steps=step, learning_rate=rate)
      lin_reg.fit(X_poly, ytrain)

      ypred = lin_reg.predict(X_poly)
      lin_mse = mean_squared_error(ytrain, ypred)

      print(f'Step: {step}')
      print(f'Learning Rate: {rate}')
      print(f'Mse: {mean_squared_error(ytrain, ypred)}')
      print()

Model trained
Step: 1
Learning Rate: 0.0025
Mse: 263.24369053972697

Model trained
Step: 3
Learning Rate: 0.0025
Mse: 211.52700937062687

Model trained
Step: 5
Learning Rate: 0.0025
Mse: 286.59470220776956

Model trained
Step: 10
Learning Rate: 0.0025
Mse: 104.021656326064

Model trained
Step: 100
Learning Rate: 0.0025
Mse: 14.266364096259215

Model trained
Step: 200
Learning Rate: 0.0025
Mse: 10.41600669400927

Model trained
Step: 1
Learning Rate: 0.025
Mse: 119.39338814585622

Model trained
Step: 3
Learning Rate: 0.025
Mse: 59.43690552281012

Model trained
Step: 5
Learning Rate: 0.025
Mse: 35.5904947019083

Model trained
Step: 10
Learning Rate: 0.025
Mse: 15.02040940671429

Model trained
Step: 100
Learning Rate: 0.025
Mse: 7.138380614275067

Model trained
Step: 200
Learning Rate: 0.025
Mse: 6.272269653344616

Model trained
Step: 1
Learning Rate: 0.5
Mse: 754.515063001814

Model trained
Step: 3
Learning Rate: 0.5
Mse: 60258.65466092153

Model trained
Step: 5
Learning Rate: 0.5
Mse: 41

  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)


In [74]:
# Consolidando os melhores parâmetros: 

lin_reg_poly_best = regLinear(learning_rate=0.5, num_steps=100)
lin_reg_poly_best.fit(X_poly, ytrain)

y_new = lin_reg_poly_best.predict(X_poly)

print(f'MSE: {mean_squared_error(ytrain, y_new)}')


Model trained
MSE: 4.0148074059523346e+86


In [75]:
poly_reg_score = cross_val_score(lin_reg_poly_best, X_poly, ytrain, scoring='neg_mean_squared_error', cv=10)
poly_reg_score = (display_scores(poly_reg_score))
poly_reg_score

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained

Cross Val Scores: 

Mean: 2.0630565969996695e+43
Std Deriv: 5.292274891405337e+42
Scores: [2.01566053e+43 2.37721533e+43 2.07463496e+43 2.97137075e+43
 2.21702267e+43 2.54589889e+43 2.15741950e+43 1.20722563e+43
 1.13671334e+43 1.92740437e+43]


In [76]:
# Validando no conjunto de testes:

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly_test = poly_features.fit_transform(Xtest)

lin_reg_poly_best = regLinear(learning_rate=0.5, num_steps=100)
lin_reg_poly_best.fit(X_poly_test, ytest)

y_new = lin_reg_poly_best.predict(X_poly_test)

print(f'MSE: {mean_squared_error(ytest, y_new)}')


Model trained
MSE: 1.691610819784281e+85


In [77]:
ply_reg_test_score = cross_val_score(lin_reg_poly_best, X_poly_test, ytest, scoring='neg_mean_squared_error', cv=10)
ply_reg_test_score = (display_scores(ply_reg_test_score))
ply_reg_test_score

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained

Cross Val Scores: 

Mean: 4.2484327893225244e+42
Std Deriv: 2.107908393154258e+42
Scores: [4.77306777e+42 2.47063995e+42 6.03591994e+42 2.92457218e+42
 1.36446259e+42 6.98577308e+42 2.66395901e+42 3.89432892e+42
 8.28108130e+42 3.09052315e+42]


In [78]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def polyFit(X, y, grau):
  poylbig_features = PolynomialFeatures(degree=grau, include_bias=False)
  std_scaler = StandardScaler()
  lin_reg = regLinear(learning_rate=0.5, num_steps=100)

  polynomial_regressor = Pipeline([
                                   ('poly_features', poylbig_features),
                                   ('std_scaler', std_scaler),
                                   ('lin_reg', lin_reg),
  ])

  polynomial_regressor.fit(X, y)
  return polynomial_regressor

In [79]:
for grau in [1,2,3,4]:
  print()

  polyfit = polyFit(Xtrain, ytrain, grau)

  ypoly = polyfit.predict(Xtrain)

  print(f'Grau: {grau}')
  print(f'MSE: {mean_squared_error(ytrain, ypoly)}')
  print('-' * 60)


Model trained
Grau: 1
MSE: 6.029690122877917
------------------------------------------------------------

Model trained
Grau: 2
MSE: 1.9929608322759157e+131
------------------------------------------------------------

Model trained
Grau: 3
MSE: 2.17292460452633e+245
------------------------------------------------------------

Model trained
Grau: 4
MSE: inf
------------------------------------------------------------


  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)


In [80]:
# Regressão poly Ridge

from sklearn.linear_model import Ridge, Lasso

def polyFitReg(X, y, grau, base_model, base_model_name):
  poylbig_features = PolynomialFeatures(degree=grau, include_bias=False)
  std_scaler = StandardScaler()
  basemodel = base_model

  polynomial_regressor = Pipeline([
                                   ('poly_features', poylbig_features),
                                   ('std_scaler', std_scaler),
                                   (base_model_name, basemodel),
  ])

  polynomial_regressor.fit(X, y)
  return polynomial_regressor

grau = 3

for alpha in [0, 0.001, 0.01, 1, 10, 100, 10000]:
  model_name = 'Ridge_alpha: ' + str(alpha)
  polyfit = polyFitReg(Xtrain,
                       ytrain, 
                       grau, 
                       base_model = Ridge(alpha=alpha),
                       base_model_name = model_name)
  
  ypoly_novo = polyfit.predict(Xtest)

  print(model_name)

  train_error = mean_squared_error(ytrain, polyfit.predict(Xtrain))
  test_error = mean_squared_error(ytest, polyfit.predict(Xtest))

  print(f'MSE (treino): {train_error}')
  print(f'MSE (teste): {test_error}')

  print(f'{train_error-test_error*-1}')

  print('-' * 60)

Ridge_alpha: 0
MSE (treino): 0.3383871229994836
MSE (teste): 0.34448180034016745
0.682868923339651
------------------------------------------------------------
Ridge_alpha: 0.001
MSE (treino): 0.33838713035803825
MSE (teste): 0.3444802568647313
0.6828673872227695
------------------------------------------------------------
Ridge_alpha: 0.01
MSE (treino): 0.33838785693252776
MSE (teste): 0.3444669954653998
0.6828548523979275
------------------------------------------------------------
Ridge_alpha: 1
MSE (treino): 0.3440682641862712
MSE (teste): 0.34836651154433657
0.6924347757306077
------------------------------------------------------------
Ridge_alpha: 10
MSE (treino): 0.510568120906315
MSE (teste): 0.4992457998982876
1.0098139208046026
------------------------------------------------------------
Ridge_alpha: 100
MSE (treino): 1.3433908078430647
MSE (teste): 1.3226150165814976
2.6660058244245626
------------------------------------------------------------
Ridge_alpha: 10000
MSE (trei

In [81]:
# Regressão Poly Lasso

from sklearn.linear_model import Ridge, Lasso

def polyFitReg(X, y, grau, base_model, base_model_name):
  poylbig_features = PolynomialFeatures(degree=grau, include_bias=False)
  std_scaler = StandardScaler()
  basemodel = base_model

  polynomial_regressor = Pipeline([
                                   ('poly_features', poylbig_features),
                                   ('std_scaler', std_scaler),
                                   (base_model_name, basemodel),
  ])

  polynomial_regressor.fit(X, y)
  return polynomial_regressor

grau = 3

for alpha in [0.0001, 0.001, 0.01, 1, 10, 100, 10000]:
  model_name = 'Lasso_alpha: ' + str(alpha)
  polyfit = polyFitReg(Xtrain,
                       ytrain, 
                       grau, 
                       base_model = Lasso(alpha=alpha),
                       base_model_name = model_name)
  
  ypoly_novo = polyfit.predict(Xtrain)

  print(model_name)

  train_error = mean_squared_error(ytrain, polyfit.predict(Xtrain))
  test_error = mean_squared_error(ytest, polyfit.predict(Xtest))

  print(f'MSE (treino): {train_error}')
  print(f'MSE (teste): {test_error}')

  print(f'{train_error-test_error*-1}')

  print('-' * 60)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso_alpha: 0.0001
MSE (treino): 0.5306273409253665
MSE (teste): 0.5377499918989795
1.068377332824346
------------------------------------------------------------


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso_alpha: 0.001
MSE (treino): 0.4787696536311609
MSE (teste): 0.4859672102473506
0.9647368638785115
------------------------------------------------------------
Lasso_alpha: 0.01
MSE (treino): 0.5382080650645861
MSE (teste): 0.5334770412947463
1.0716851063593325
------------------------------------------------------------
Lasso_alpha: 1
MSE (treino): 9.158016773392355
MSE (teste): 9.795678048590501
18.953694821982857
------------------------------------------------------------
Lasso_alpha: 10
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------
Lasso_alpha: 100
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------
Lasso_alpha: 10000
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [82]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

def polyFitReg(X, y, grau, base_model, base_model_name):
  poylbig_features = PolynomialFeatures(degree=grau, include_bias=False)
  std_scaler = StandardScaler()
  basemodel = base_model

  polynomial_regressor = Pipeline([
                                   ('poly_features', poylbig_features),
                                   ('std_scaler', std_scaler),
                                   (base_model_name, basemodel),
  ])

  polynomial_regressor.fit(X, y)
  return polynomial_regressor

grau = 3

for alpha in [0.0001, 0.001, 0.01, 1, 10, 100, 10000]:
  model_name = 'Lasso_alpha: ' + str(alpha)
  polyfit = polyFitReg(Xtrain,
                       ytrain, 
                       grau, 
                       base_model = ElasticNet(alpha=alpha, l1_ratio=0.5),
                       base_model_name = model_name)
  
  ypoly_novo = polyfit.predict(Xtrain)

  print(model_name)

  train_error = mean_squared_error(ytrain, polyfit.predict(Xtrain))
  test_error = mean_squared_error(ytest, polyfit.predict(Xtest))

  print(f'MSE (treino): {train_error}')
  print(f'MSE (teste): {test_error}')

  print(f'{train_error-test_error*-1}')

  print('-' * 60)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso_alpha: 0.0001
MSE (treino): 0.5244579587791703
MSE (teste): 0.5305061683267995
1.0549641271059698
------------------------------------------------------------


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso_alpha: 0.001
MSE (treino): 0.46087149764200264
MSE (teste): 0.4595160928179032
0.9203875904599058
------------------------------------------------------------


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso_alpha: 0.01
MSE (treino): 0.9741795380317108
MSE (teste): 0.9602627957592007
1.9344423337909116
------------------------------------------------------------
Lasso_alpha: 1
MSE (treino): 7.850660796056347
MSE (teste): 8.499339930359003
16.35000072641535
------------------------------------------------------------
Lasso_alpha: 10
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------
Lasso_alpha: 100
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------
Lasso_alpha: 10000
MSE (treino): 24.24325135286504
MSE (teste): 24.383883943845145
48.627135296710186
------------------------------------------------------------


In [104]:
poly_scaler = Pipeline([
                        ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
                        ('std_scaler', StandardScaler()),
])

X_train_poly_scaled = poly_scaler.fit_transform(Xtest)


In [107]:
lin_reg = regLinear(learning_rate=0.5, num_steps=100)
lin_reg.fit(X_train_poly_scaled, ytest)

scores = cross_val_score(lin_reg, X_train_poly_scaled, ytest, cv=10, scoring='neg_mean_squared_error')
result = np.sqrt(-scores)
result.mean()

Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained
Model trained


1.8921509718723085e+121

In [108]:
lasso_reg = Lasso(alpha=0.001)


scores = cross_val_score(lasso_reg, X_train_poly_scaled, ytest, cv=10, scoring='neg_mean_squared_error')
result = np.sqrt(-scores)
result.mean()


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


0.7021425673585878

## Exercício 2: Regressão Logística:


###Parte 1

Crie uma classe regLogistica para treinar o modelo de regressão logística. Essa classe deve ser usada para problemas de classificação binária, cuja variável target assume os valores: 0 (classe negativa) e 1 (classe positiva).

O método construtor dessa classe deve possuir 3 parâmetros: learning_rate, num_steps e limiar.

Os outros médotos devem ser:

    - médoto fit: para treinar o modelo - usando gradient descent
    
    - médoto predict_proba: para retornar a probabilidade da classe 1
    
    - médoto predict: retornar a classe predita: 0 ou 1 - dependente do limiar
    


In [None]:
class regLogistica(BaseEstimator):
  def __init__(self, learning_rate=0.01, num_steps=10, limiar=0.8, info=False):
    self.learning_rate = learning_rate
    self.num_steps = num_steps
    self.limiar = limiar
    self.info = info

  def fit(self, X, y):
    y = y.reshape(-1, 1)
    # treinar o modelo segundo o método gradient descent
    
    # primeiro reshape os dados para o formato adequado
    X_b = np.c_[np.ones(X.shape[0]), X]
    theta = np.random.randn(X_b.shape[1], 1)

    for step in range(self.num_steps):
      #Calculando a probabilidade
      yscores = (1 / (1 + np.exp(-X_b.dot(theta))))

      #Calculando o gradiente do logloss
      gradient = X_b.T.dot(yscores -y)

      #Atualizando os pesos
      theta = theta - self.learning_rate * gradient

      #Calculando o logloss nos passos
      self.logloss_step = ((y * np.log(yscores) + (1 - y) * np.log(1 - yscores)).mean() * -1)

      #Printa as informações
      if self.info:
        print(f'Step: {step}')
        print(f'Theta: {theta.reshape(-1,)}')
        print(f'LogLoss: {self.logloss_step}')
        print()
        print('Model Trained!')
        print('-' * 60)
        print()

      self.theta_final = theta

  def predict(self, X):
    self.ypred = np.where(X > self.limiar, 1, 0)
    return self.ypred
  
  def predict_proba(self, X):
    m = X.shape[0]
    X_b = np.c_[np.ones((m, 1)), X]
    probs = (X_b.dot(self.theta_final))
    return probs


###Parte 2

Usando a função getData2(), carregue o dataset disponibilizado.

Use a regLogistica, classe criada na parte 1 do exercício, para treinar modelos nestes dados. Use validação cruzada para seleção dos parâmetros. Considere diferentes métricas de classificação e justifique as escolhas.


In [None]:
X, y = getData2()
X, y

(array([[-0.82380715, -0.59163837,  0.13041933, -0.40345475,  1.16360785],
        [ 0.7091986 ,  0.60606127, -0.37678226,  0.39654936, -1.15961369],
        [ 1.61194498,  0.36486859,  1.91264129,  0.38601731, -0.31972146],
        ...,
        [ 1.38015938,  1.43125078, -1.42179351,  0.89985272, -0.70967569],
        [-1.63030207, -0.23544436, -2.29968645, -0.32243952, -1.49535664],
        [ 1.07627839,  1.178116  , -1.27826779,  0.73327205, -1.27906183]]),
 array([0, 1, 1, ..., 0, 0, 0]))

In [None]:
reg_log = regLogistica(learning_rate=0.0001, num_steps=10, limiar=0.9, info=True)
reg_log.fit(X, y)


Step: 0
Theta: [ 1.6368773   0.55419767  0.65963913  0.53650266 -1.96427246  2.30179449]
LogLoss: 1.4392522469901945

Model Trained!
------------------------------------------------------------

Step: 1
Theta: [ 1.43346586  0.53159113  0.52498652  0.86388505 -2.03560216  2.03639424]
LogLoss: 1.086414380811293

Model Trained!
------------------------------------------------------------

Step: 2
Theta: [ 1.25557412  0.52542086  0.4403946   1.08405542 -2.07941488  1.79777894]
LogLoss: 0.8714401593430686

Model Trained!
------------------------------------------------------------

Step: 3
Theta: [ 1.10107583  0.52890078  0.38906493  1.23069582 -2.10510221  1.58785682]
LogLoss: 0.7406615580116819

Model Trained!
------------------------------------------------------------

Step: 4
Theta: [ 0.96589911  0.53678385  0.35845947  1.32861716 -2.11969646  1.40353153]
LogLoss: 0.6563525785354732

Model Trained!
------------------------------------------------------------

Step: 5
Theta: [ 0.8466067

In [None]:
probs = reg_log.predict_proba(X)


In [None]:
ypred = reg_log.predict(probs)

In [None]:
ypred.shape, y.shape

((10000, 1), (10000,))

In [None]:
from sklearn.metrics import roc_auc_score

def auc_score(y, ypred):
  accuracy = roc_auc_score(y, ypred)
  print(f'Precisão: {(accuracy *100).round(2)}%')
  return accuracy

In [None]:
auc_score(y, ypred)

Precisão: 80.24%


0.8024330755036286

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
def search_best(X_train, y_train, steps, rates):
  for rate in rates:
    for step in steps:
        reg_log = regLogistica(learning_rate=rate, num_steps=step, limiar=0.8)
        reg_log.fit(X_train, y_train)

        ypred = reg_log.predict(reg_log.predict_proba(X_train))
        score = reg_log.logloss_step

        print(f'Steps: {step}')
        print(f'Learning Rate: {rate}')
        print(f'LogLoss: {score}')
        print()  

In [None]:
steps = [2, 8, 16, 20]
rates = [0.0001, 0.0005, 0.001, 0.005]

search_best(X_train, y_train, steps=steps, rates=rates)

Steps: 2
Learning Rate: 0.0001
LogLoss: 2.075086670841956

Steps: 8
Learning Rate: 0.0001
LogLoss: 0.46352867279704374

Steps: 16
Learning Rate: 0.0001
LogLoss: 0.4087133914178898

Steps: 20
Learning Rate: 0.0001
LogLoss: 0.40324484806727995

Steps: 2
Learning Rate: 0.0005
LogLoss: 0.46608173628076827

Steps: 8
Learning Rate: 0.0005
LogLoss: 0.4025219485870364

Steps: 16
Learning Rate: 0.0005
LogLoss: 0.40251871192896804

Steps: 20
Learning Rate: 0.0005
LogLoss: 0.402518617859431

Steps: 2
Learning Rate: 0.001
LogLoss: 0.6361259037496491

Steps: 8
Learning Rate: 0.001
LogLoss: 0.4025294542887039

Steps: 16
Learning Rate: 0.001
LogLoss: 0.40251862101588926

Steps: 20
Learning Rate: 0.001
LogLoss: 0.40251863035387037

Steps: 2
Learning Rate: 0.005
LogLoss: nan

Steps: 8
Learning Rate: 0.005
LogLoss: nan

Steps: 16
Learning Rate: 0.005
LogLoss: nan

Steps: 20
Learning Rate: 0.005
LogLoss: nan





In [None]:
reg_log = regLogistica(learning_rate=0.0005, num_steps=20, limiar=0.9, info=True)
reg_log.fit(X_train, y_train)

Step: 0
Theta: [ 0.10348466 -0.37595466 -1.13612361  2.17649411  0.28787232  0.44417244]
LogLoss: 2.0982811623026487

Model Trained!
------------------------------------------------------------

Step: 1
Theta: [-0.0198917  -0.12127501 -0.90496857  2.00423798  0.43715305  0.28461909]
LogLoss: 0.475258960666341

Model Trained!
------------------------------------------------------------

Step: 2
Theta: [-0.08285961 -0.01043048 -0.77283717  1.84306239  0.51816689  0.16010748]
LogLoss: 0.42947712293708

Model Trained!
------------------------------------------------------------

Step: 3
Theta: [-0.10276518  0.01148275 -0.71376956  1.7211102   0.5509479   0.07305722]
LogLoss: 0.4123800530338971

Model Trained!
------------------------------------------------------------

Step: 4
Theta: [-1.03604113e-01  1.17184769e-03 -6.89863872e-01  1.63712553e+00
  5.61831110e-01  1.88002299e-02]
LogLoss: 0.40641306547927214

Model Trained!
------------------------------------------------------------

St

In [None]:
probs = reg_log.predict_proba(X_train)
ypred = reg_log.predict(probs)

In [None]:
auc_score(y_train, ypred)


Precisão: 80.6%


0.8060482688119472

In [None]:
# Validando o modelo

reg_log = regLogistica(learning_rate=0.0005, num_steps=20, limiar=0.9, info=False)
reg_log.fit(X_test, y_test)

In [None]:
test_probs = reg_log.predict_proba(X_test)
test_predict = reg_log.predict(test_probs)

In [None]:
auc_score(y_test, test_predict)

Precisão: 78.04%


0.7803793388424461

In [None]:
X_train

array([[ 0.21802299,  0.47654171, -0.90943223,  0.26959324,  0.88071171],
       [ 1.06258502,  0.30328486,  1.08917429,  0.28639959,  0.3806207 ],
       [ 0.99291911,  0.31148497,  0.94097061,  0.28191356, -0.46596404],
       ...,
       [-1.26708786, -0.55511661, -0.76978544, -0.43996549,  0.66656636],
       [ 0.73507723,  0.40140819,  0.22955105,  0.29562512, -1.32870576],
       [-0.82854793,  0.3254663 , -2.38589945,  0.06263802, -0.29400079]])

In [None]:
reg_log.predict(reg_log.predict_proba(np.array([[ 0.21802299,  0.47654171, -0.90943223,  0.26959324,  0.88071171]])))

array([[0]])