# Modelo de Regresión Lineal 

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw players table from the CSV file that sits next to the notebook.
bd5 = pd.read_csv("Players.csv", sep=",")
# Bare last expression so the notebook renders the frame.
bd5

In [None]:
# Bucket the player weight into ordinal classes stored in the 'peso' column:
#   1 -> (50, 70], 2 -> (70, 90], 3 -> (90, 110].
# Rows outside those ranges (or with a missing weight) stay NaN at this point.
bd5.loc[(bd5['weight'] > 50) & (bd5['weight'] <= 70), 'peso'] = 1
bd5.loc[(bd5['weight'] > 70) & (bd5['weight'] <= 90), 'peso'] = 2
bd5.loc[(bd5['weight'] > 90) & (bd5['weight'] <= 110), 'peso'] = 3

# Fill every gap in ONE call. The earlier version rebuilt bd6 from bd5 four
# times in a row, so each fillna threw away the previous one and only the
# 'birth_city' fill survived (the class-4 fallback for 'peso' was lost).
bd6 = bd5.fillna({
    'peso': 4,          # catch-all class for rows that matched no bucket
    'birth_state': 0,
    'collage': 0,
    'birth_city': 0,
})
bd6

## Creación de Nuevas Variables - Feature Extraction

In [None]:
# Make sure the catch-all weight class is present in the cleaned frame before
# deriving the indicators. Rows that matched no weight bucket belong to class 4.
bd6['peso'] = bd6['peso'].fillna(4)

# One-hot indicators Peso1..Peso4. They are read from bd6 (the cleaned frame):
# the earlier version read bd5['peso'], which never contains the value 4, so
# Peso4 was always 0.
for weight_class in (1, 2, 3, 4):
    bd6[f'Peso{weight_class}'] = (bd6['peso'] == weight_class).astype('int')

# Replace the remaining missing values with 0 so the integer casts below
# cannot fail on NaN.
# NOTE(review): a 0 in weight/height/born is a placeholder, not a measurement,
# and these rows are fed to the regressions further down - consider dropping
# them instead; confirm with the notebook author.
bd6 = bd6.fillna({
    'birth_city': 0,
    'collage': 0,
    'born': 0,
    'weight': 0,
    'height': 0,
    'Player': 0,
})

# Cast the numeric columns to plain integers (fractional parts are truncated).
bd6 = bd6.astype({'height': int, 'weight': int, 'born': int, 'peso': int})

bd6



### Creación manual de nuevas variables

### Resumen Gráfico

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.jointplot(x="height", y="weight", data=bd6)

# Regresión Lineal Multivariable

El objetivo de la Regresión Lineal Multivariable es estimar los parámetros $(\beta_0,\beta_1,\beta_2,...,\beta_n)$ tales que el error cuadrático medio (MSE) entre los datos modelados $y_{model}$ y los datos observados $y_{obs}$ sea mínimo.

Los datos modelados se obtienen con la expresión:

$y_{model}= \beta_0 x_0+\beta_1 x_1+\beta_2 x_2 + ...+ \beta_n x_n$

Donde $(x_0,x_1,x_2,...,x_n)$ representan las características (features), normalmente expresadas como columnas, con $x_0 = 1$ para que $\beta_0$ actúe como término independiente (intercepto).

### Configuración de la base de datos (Selección de variables)

In [None]:
# Predictors: weight and birth year. Target: height.
# Both are kept as DataFrames (double brackets) because later cells add
# columns to the target frame.
X = bd6.loc[:, ['weight', 'born']]
y = bd6.loc[:, ['height']]
X.head()

### Partición de datos

- 70% Entrenamiento (train)
- 30% Prueba (test)

In [None]:
from sklearn.model_selection import train_test_split

train_ratio = 0.7
test_ratio = 0.3

# Hold out `test_ratio` of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=test_ratio,
)
# Summary statistics of the training target.
y_train.describe()


### Estandarización de variables

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_test

### Ajuste del modelo

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the standardized training features.
model = LinearRegression()
model = model.fit(X_train, y_train)


### Coeficientes del modelo

In [None]:
# Fitted parameters: one coefficient per standardized feature, plus the intercept.
print(f"model coefficients: {model.coef_}")
print(f"model intercept: {model.intercept_}")

### Resultado del modelo

In [None]:
# score() is the estimator's default metric (R^2 for scikit-learn regressors),
# shown here multiplied by 100.
print(f"Training set score: {100 * model.score(X_train, y_train):.4f} %")
print(f"Test set score: {100 * model.score(X_test, y_test):.4f} %")

### Representación gráfica de resultados

In [None]:
import matplotlib.pyplot as plt
# Same positional window (rows 1..99) for both curves so they line up.
window = slice(1, 100)
observed = y_test[window].to_numpy()
predicted = model.predict(X_test[window])
plt.plot(observed)
plt.plot(predicted)
plt.legend(['Real', 'Prediccion'])

In [None]:
# Work on a copy: `pred = y_test` only creates an alias, so adding columns
# would silently turn y_test into a multi-column frame (breaking any later
# model.score(X_test, y_test) call) and can trigger pandas' chained-assignment
# warning.
pred = y_test.copy()
# predict() returns one column per target column; flatten it to 1-D so it is
# stored as a single plain column.
pred['prediction'] = model.predict(X_test).ravel()
sns.jointplot(x="prediction", y="height", data=pred)

In [None]:
# Residual = observed height minus predicted height.
pred['residual'] = pred['height'].sub(pred['prediction'])
sns.jointplot(data=pred, x="prediction", y="residual")

# Regresión Lineal Multivariable Regularizada

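Para el caso de la Regresión Lineal Multivariable Regularizada (Ridge), los datos modelados se obtienen con la misma expresión lineal:

$y_{model}= \beta_0 x_0+\beta_1 x_1+\beta_2 x_2 + ...+ \beta_n x_n$

La diferencia está en la función de costo que se minimiza, a la cual se le añade una penalización sobre la magnitud de los coeficientes:

$J(\beta)= \sum_i \left(y_{obs,i}-y_{model,i}\right)^2 + \alpha\left(\beta_1^2+\beta_2^2+ ...+ \beta_n^2\right)$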

El parámetro $\alpha$ permite controlar la complejidad del modelo.

### Creación de variables polinómicas

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the two raw predictors into all polynomial terms up to degree 3.
poly = PolynomialFeatures(3)
# Build the expansion from the source columns rather than from X itself:
# `X = poly.fit_transform(X)` is self-referential, so re-running the cell
# would expand the already-expanded matrix again.
X = poly.fit_transform(bd6[['weight', 'born']])

### Partición de datos

- 60% Entrenamiento (train)
- 20% Validación (validation)
- 20% Prueba (test)

In [None]:
from sklearn.model_selection import train_test_split

train_ratio = 0.6
validation_ratio = 0.2
test_ratio = 0.2

# Two-stage split: first carve out the training rows, then divide the
# remainder between validation and test.
# Both calls are seeded (same seed as the earlier 70/30 split) so that the
# scores reported below are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=42,
)

### Estandarización de variables

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaling statistics come from the training partition only; the same fitted
# scaler is then applied to the three partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test, X_val = scaler.transform(X_test), scaler.transform(X_val)

### Ajuste de modelos para validación

In [None]:
from sklearn.linear_model import Ridge
# One Ridge fit per regularization strength, from weakest to strongest.
model_1, model_2, model_3, model_4 = (
    Ridge(alpha=strength).fit(X_train, y_train)
    for strength in (0.1, 1, 10, 100)
)

### Resultados del entrenamiento

In [None]:
# Training-set score of each candidate, one line per model.
print("\n".join(
    f"Training set score model_{number}: {100 * candidate.score(X_train, y_train):.4f} %"
    for number, candidate in enumerate((model_1, model_2, model_3, model_4), start=1)
))

### Resultado de validación

In [None]:
# Validation-set score of each candidate, one line per model.
print("\n".join(
    f"Validation set score model_{number}: {100 * candidate.score(X_val, y_val):.4f} %"
    for number, candidate in enumerate((model_1, model_2, model_3, model_4), start=1)
))

### Resultado del modelo final

In [None]:
# Pick the final model from the validation scores instead of hard-coding
# model_1: the best regularization strength depends on the data split, and the
# validation partition exists precisely to make this choice.
# On ties max() keeps the first candidate, i.e. the weakest regularization.
model_final = max(
    (model_1, model_2, model_3, model_4),
    key=lambda candidate: candidate.score(X_val, y_val),
)
print("Selected alpha: {}".format(model_final.alpha))
print("Training set score: {:.2f} %".format(100*model_final.score(X_train, y_train)))
print("Validation set score: {:.2f} %".format(100*model_final.score(X_val, y_val)))
# The test partition is only touched here, after the model has been chosen.
print("Test set score: {:.2f} %".format(100*model_final.score(X_test, y_test)))