In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Maldición de la dimensionalidad

$$ \text{Area}_{n\text{-esfera}} = \frac{\pi^{\frac{n}{2}}}{\Gamma\left(\frac{n}{2} + 1\right)} R^n $$

In [None]:
from scipy.special import gamma

In [None]:
# Evaluate pi^(n/2) / Gamma(n/2 + 1) -- the n-sphere formula above with R = 1 --
# for every dimension n = 0..29. The list index is the dimension.
area_nesfera = [
    (np.pi ** (dim / 2)) / gamma(dim / 2 + 1)
    for dim in range(30)
]

In [None]:
# Plot the sphere values against their list position (the dimension n).
fig, ax = plt.subplots()
ax.plot(area_nesfera)
plt.show()

In [None]:
# 2**n for n = 0..29: the volume of an n-cube with side length 2.
area_ncubo = [2 ** dim for dim in range(30)]
    
# Overlay both curves; the y-axis is clipped to [0, 10] so the sphere curve
# stays visible next to the exponentially growing cube curve.
fig, ax = plt.subplots()
ax.plot(area_ncubo, 'r', label='area_cubo')
ax.plot(area_nesfera, 'b', label='area_esfera')
ax.set_ylim([0, 10])
ax.legend()
plt.show()

El volumen se concentra en las esquinas.

# Regularización

## Ridge

Usa un costo cuadrático para los parámetros

$$ \frac{1}{n} \sum_{i=1}^n \left(\theta_0 + \sum_{j=1}^p \theta_j x_{ij}  - y_i \right)^2 + \alpha \sum_{j=1}^p \theta_j^2$$

In [None]:
# Reproducible toy data: 100 points with x in [0, 2) and
# y = -3x^2 + 5x plus standard-normal noise.
np.random.seed(42)
x = 2 * np.random.rand(100)
noise = np.random.randn(100)  # drawn after x, so the RNG stream order is unchanged
y = -3 * x**2 + 5 * x + noise

# Scatter plot of the raw (x, y) sample.
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [None]:
# Polynomial design matrix: column 'x' holds the raw values and
# 'x_2' .. 'x_5' hold the corresponding powers of x.
features = {'x': x}
features.update({f'x_{power}': x ** power for power in range(2, 6)})
X = pd.DataFrame(features)
X.head(5)

In [None]:
# Fit an unregularized linear model on the polynomial features.
lin_reg = LinearRegression().fit(X, y)

# Sort by x once so the prediction is drawn as a left-to-right curve.
order = np.argsort(x)

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x[order], lin_reg.predict(X.iloc[order, :]), 'r')
plt.show()

In [None]:
# Show the coefficients of the unregularized fit (one per column of X).
lin_reg.coef_

In [None]:
# Fit a Ridge model (default regularization strength) on the same features.
ridge = Ridge().fit(X, y)

# Sort by x once so the prediction is drawn as a left-to-right curve.
order = np.argsort(x)

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x[order], ridge.predict(X.iloc[order, :]), 'r')
plt.show()

In [None]:
# Show the Ridge coefficients, for comparison with lin_reg.coef_.
ridge.coef_

In [None]:
# x-coordinate of the extra observation added to the design matrix.
new_x = 0

# Feature row for the new observation, with the same columns as X.
new_row = pd.DataFrame([{
    'x': new_x,
    'x_2': new_x ** 2,
    'x_3': new_x ** 3,
    'x_4': new_x ** 4,
    'x_5': new_x ** 5
}])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add rows. ignore_index=True renumbers the rows 0..n as append did.
# NOTE: this cell reassigns X, so re-running it adds the row again.
X = pd.concat([X, new_row], ignore_index=True)

In [None]:
# Extend the target and raw-x arrays with the new observation (x = new_x, y = -4)
# so they stay aligned with the row added to X.
y = np.concatenate([y, [-4]])
x = np.concatenate([x, [new_x]])

In [None]:
# Scatter plot of the sample including the newly added point.
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Refit both models on the extended data (same estimator objects, refit in place).
ridge.fit(X, y)
lin_reg.fit(X, y)

# Sort once and reuse the ordering for both prediction curves.
order = np.argsort(x)
x_sorted = x[order]
X_sorted = X.iloc[order, :]

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x_sorted, ridge.predict(X_sorted), 'g', label='Ridge')
ax.plot(x_sorted, lin_reg.predict(X_sorted), 'r', label='LinReg')
ax.legend()
plt.show()

## Nuevos datos

In [None]:
# Second reproducible dataset: 100 points with x2 in [-1, 1) and a
# high-degree polynomial signal plus Gaussian noise scaled by 50.
np.random.seed(10)
x2 = 2 * np.random.rand(100) - 1
signal = x2 * (x2 - 3) * (x2 - 1) ** 2 * (x2 + 4) ** 3 * (x2 + 1) ** 4
y2 = signal + 50 * np.random.randn(100)

# Scatter plot of the second dataset.
fig, ax = plt.subplots()
ax.scatter(x2, y2)
plt.show()

In [None]:
# Polynomial design matrix for the second dataset: 'x' is the raw value and
# 'x_2' .. 'x_5' are the corresponding powers of x2.
features2 = {'x': x2}
features2.update({f'x_{power}': x2 ** power for power in range(2, 6)})
X2 = pd.DataFrame(features2)

In [None]:
# The sort order does not depend on alpha, so compute it once.
order = np.argsort(x2)

# One figure per regularization strength: data plus the fitted Ridge curve.
for alpha in [0, 5, 10, 20]:
    ridge_a = Ridge(alpha=alpha).fit(X2, y2)
    fig, ax = plt.subplots()
    ax.scatter(x2, y2)
    ax.plot(x2[order], ridge_a.predict(X2.iloc[order, :]), 'g')
    ax.set_title(f'alpha = {alpha}')
    plt.show()

In [None]:
# Coefficient vector of a Ridge fit for each alpha = 0..19; the list position
# equals alpha.
coefs = [Ridge(alpha=alpha).fit(X2, y2).coef_ for alpha in range(20)]

In [None]:
# One line per feature showing how its Ridge coefficient changes along the
# alpha sweep (row index = position in `coefs`).
ridge_paths = pd.DataFrame(coefs, columns=X2.columns)
ridge_paths.plot()
plt.show()

## Lasso

Usa un costo absoluto (L1) para los parámetros

$$ \frac{1}{n} \sum_{i=1}^n \left(\theta_0 + \sum_{j=1}^p \theta_j x_{ij}  - y_i \right)^2 + \alpha \sum_{j=1}^p |\theta_j|$$

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# The sort order does not depend on alpha, so compute it once.
order = np.argsort(x2)

# One figure per regularization strength: data plus the fitted Lasso curve.
for alpha in [1, 3, 5, 10]:
    lasso_a = Lasso(alpha=alpha).fit(X2, y2)
    fig, ax = plt.subplots()
    ax.scatter(x2, y2)
    ax.plot(x2[order], lasso_a.predict(X2.iloc[order, :]), 'g')
    ax.set_title(f'alpha = {alpha}')
    plt.show()

In [None]:
# Regularization strengths to sweep (alpha = 1..29).
alphas_lasso = range(1, 30)

# Coefficient vector of a Lasso fit for each alpha, in sweep order.
coefs_lasso = []
for a in alphas_lasso:
    lasso_a = Lasso(alpha=a)
    lasso_a.fit(X2, y2)
    coefs_lasso.append(lasso_a.coef_)

# Index the frame by the actual alpha values. With the default 0-based index
# the x-axis would run 0..28 and every point would sit one unit to the left of
# the alpha that produced it. The named index also labels the x-axis.
lasso_paths = pd.DataFrame(
    coefs_lasso,
    columns=X2.columns,
    index=pd.Index(alphas_lasso, name='alpha'),
)
lasso_paths.plot()
plt.show()

In [None]:
# First ten Lasso coefficient vectors, labelled with the feature names and the
# alpha that produced each row. The sweep that fills coefs_lasso starts at
# alpha = 1 with step 1, so list position k corresponds to alpha = k + 1.
pd.DataFrame(
    coefs_lasso,
    columns=X2.columns,
    index=pd.RangeIndex(1, len(coefs_lasso) + 1, name='alpha'),
).head(10)