In [None]:
# Regresion Lineal
import pandas as pd
import numpy as np
import scipy.stats as st
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

In [None]:
fichero="./vehiculos_procesado.csv"

In [None]:
df=pd.read_csv(fichero)

In [None]:
df.head()

In [None]:
df=df[["consumo","co2","cilindros","desplazamiento"]]

In [None]:
df.shape

In [None]:
modelo=smf.ols('co2~desplazamiento+cilindros+consumo',data=df).fit()

In [None]:
modelo.params

In [None]:
modelo.summary()

$$Y=817+11.7\times desplazamiento + 1.23\times cilindros -19.8 \times consumo$$

In [None]:
# Turn the fitted coefficients into a plain dict keyed by term name so they
# can be interpolated by name into the template below.
parametros = modelo.params.to_dict()

# Human-readable version of the fitted equation. Every coefficient must be
# followed by the variable it multiplies; the displacement term previously
# lacked its variable name and read like a second constant.
modelo_formula = "y~{Intercept:.3f} + {cilindros:.2f}* cilindros + {consumo:.2f}*consumo + {desplazamiento:.2f}*desplazamiento".format(**parametros)

# Last expression of the cell, so the resulting string is actually displayed.
modelo_formula

In [None]:
import matplotlib.pyplot as plt

# Observed CO2 against the values predicted by the statsmodels fit.
# Points on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(df.co2, modelo.predict(df))
# Label the axes so the figure can be read without looking at the code.
ax.set(
    xlabel="co2 observado",
    ylabel="co2 predicho",
    title="OLS (statsmodels): observado vs. predicho",
)
# Render the figure without echoing the scatter object's repr.
plt.show()

In [None]:
variable_independiente=["desplazamiento","cilindros","consumo"]
variable_dependiente=["co2"]

In [None]:
X=df[variable_independiente]
y=df[variable_dependiente]

In [None]:
modelo=LinearRegression()

In [None]:
modelo_ajustad=modelo.fit(X,y)

In [None]:
y_pred=modelo_ajustad.predict(X)

In [None]:
# Observed response against the scikit-learn predictions.
# Points on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y, y_pred)
# Label the axes so the figure can be read without looking at the code.
ax.set(
    xlabel="co2 observado",
    ylabel="co2 predicho",
    title="LinearRegression (scikit-learn): observado vs. predicho",
)
# Render the figure without echoing the scatter object's repr.
plt.show()

In [None]:
modelo_ajustad.coef_

In [None]:
modelo_ajustad.intercept_

In [None]:
modelo_ajustad.feature_names_in_

In [None]:
(y-y_pred).mean()

#### Regresión Lineal
$$\hat{y}= \beta_0 + \beta_1 x_1 + \ldots + \beta_n x_n $$

$$Y=X \beta$$

$$\hat{\beta}=(X^tX)^{-1}X^tY$$

* Ridge
* Lasso

### Ridge

$$\hat{\beta}_{ridge}=(X^tX + \alpha I)^{-1}X^tY$$

$$\begin{pmatrix}
1&2\\
2&4
\end{pmatrix}$$


$$\begin{pmatrix}
1&2\\
2&4
\end{pmatrix} + 2 \times I = \begin{pmatrix}
3&2\\
2&6
\end{pmatrix}$$



In [None]:
from sklearn.linear_model import Ridge

In [None]:
from sklearn.datasets import make_regression
import pandas as pd

In [None]:
# Synthetic regression problem with 1000 samples. The generator is seeded so
# that every run of the notebook produces the same data and therefore the
# same coefficients and error metrics. 1234 is the seed the notebook already
# uses for train_test_split.
X, y = make_regression(n_samples=1000, random_state=1234)

In [None]:
ridge=Ridge()

In [None]:
ridge.fit(X,y)

* Error Cuadrático Medio **MSE** $\to$  **RMSE**
* Error Absoluto Medio **MAE**
* Error Absoluto Medio Porcentual **MAPE**

Se entrenará con el 70% de los datos y se evaluará con el 30% restante.

In [None]:
muestra=len(X)*0.70
X_Entrenamiento=pd.DataFrame(X).loc[:muestra-1,:]
y_Entrenamiento=pd.DataFrame(y).loc[:muestra-1]

In [None]:
X_Test=pd.DataFrame(X).loc[muestra:,:]
y_Test=pd.DataFrame(y).loc[muestra:]

In [None]:
len(X_Test) + len(X_Entrenamiento)==len(X)

In [None]:
## What we will do instead is use the train_test_split function

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_Entrenamiento,X_Test,y_Entrenamiento,y_Test=train_test_split(X,y,random_state=1234,test_size=0.3)

In [None]:
train_test_split(X,y,random_state=1234,test_size=0.3)

In [None]:
ridge=Ridge()

In [None]:
ridge.fit(X_Entrenamiento,y_Entrenamiento)

In [None]:
ridge.intercept_

In [None]:
ridge.coef_

$$\cfrac{\sum_{i=1}^n{(y_i-\hat{y}_i)^2}}{n}$$

In [None]:
# Predictions of the Ridge model for the held-out rows.
y_pred = ridge.predict(X=X_Test)

***MSE***

In [None]:
# Mean squared error by hand: sum of squared residuals over the number of test rows.
np.square(y_Test - y_pred).sum() / len(y_Test)

* ***RMSE***

In [None]:
(((y_Test - y_pred)**2).sum()/len(y_Test))**(1/2)

In [None]:
y_pred[1]

In [None]:

### aunque sea un array vedlo como si fuera un intervalo
[y_pred[1]-(((y_Test - y_pred)**2).sum()/len(y_Test))**(1/2),y_pred[1]+(((y_Test - y_pred)**2).sum()/len(y_Test))**(1/2)]

In [None]:
y_Test[1]

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(y_Test,y_pred);

In [None]:
ridge.score(X_Test,y_Test)

In [None]:
ridge_10=Ridge(alpha=10)

In [None]:
ridge_10.fit(X_Entrenamiento,y_Entrenamiento)

In [None]:
y_pred_10=ridge_10.predict(X_Test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(y_Test,y_pred_10)**(1/2)

In [None]:
ridge_01=Ridge(alpha=0.1)
ridge_01.fit(X_Entrenamiento,y_Entrenamiento)
y_pred_01=ridge_01.predict(X_Test)
mean_squared_error(y_Test,y_pred_01)**(1/2)

### Lasso

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso with default hyper-parameters: fit on train, predict on test, report the test RMSE.
lasso = Lasso()
lasso.fit(X=X_Entrenamiento, y=y_Entrenamiento)
y_pred_lasso = lasso.predict(X=X_Test)
mean_squared_error(y_true=y_Test, y_pred=y_pred_lasso) ** 0.5

In [None]:
import numpy as np

In [None]:
np.sum(lasso.coef_!=0)

In [None]:
lasso=Lasso(alpha=0.001)
lasso.fit(X_Entrenamiento,y_Entrenamiento)
y_pred_lasso=lasso.predict(X_Test)
mean_squared_error(y_Test,y_pred_lasso)**(1/2)

In [None]:
np.sum(lasso.coef_!=0)

### Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet 

In [None]:
elastic=ElasticNet()
elastic.fit(X_Entrenamiento,y_Entrenamiento)
y_pred_elastic=elastic.predict(X_Test)
mean_squared_error(y_Test,y_pred_elastic)**(1/2)

In [None]:
elastic=ElasticNet(alpha=10,l1_ratio=0.8)
elastic.fit(X_Entrenamiento,y_Entrenamiento)
y_pred_elastic=elastic.predict(X_Test)
mean_squared_error(y_Test,y_pred_elastic)**(1/2)

In [None]:
from sklearn.datasets import fetch_california_housing

In [None]:
fetch_california_housing?

In [None]:
datos=fetch_california_housing().data

In [None]:
datos

In [None]:
Target=fetch_california_housing().target

In [None]:
Target.T

In [None]:
fetch_california_housing().feature_names

### Posible Solución

In [None]:
### Empty DataFrame that will collect the metrics of each model as it is evaluated
Metricas = pd.DataFrame(columns=["Modelo", "RMSE", "MAE", "MAPE"])
# Feature matrix wrapped in a DataFrame whose columns carry the dataset's
# feature names. This rebinds `df`, which earlier held the vehicles data.
df = pd.DataFrame(
    data=datos,
    columns=fetch_california_housing().feature_names,
)

In [None]:
import seaborn as sns
# Scatter of Longitude against Latitude with a fitted regression line; the
# trailing semicolon hides the returned grid object's repr.
sns.lmplot(data=df, x="Latitude", y="Longitude");