# Regresión Lineal Múltiple


# Importar las librerías


In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import mean_squared_error

# Importar el data set


Este conjunto de datos contiene información sobre los años de antigüedad, distancia a la estación más cercana, número de tiendas de primera necesidad, latitud y longitud de algunas casas, además del precio por metro cuadrado que será nuestra variable objetivo.

In [None]:
dataset = pd.read_csv("../data/house_price_unit_area.csv")

In [31]:
dataset.head()

Unnamed: 0,house_age,distance_to_the_nearest_station,number_of_convenience_stores,latitude,longitude,house_price_of_unit_area
0,32.0,84.87882,10,24.98298,121.54024,37.9
1,19.5,306.5947,9,24.98034,121.53951,42.2
2,13.3,561.9845,5,24.98746,121.54391,47.3
3,13.3,561.9845,5,24.98746,121.54391,54.8
4,5.0,390.5684,5,24.97937,121.54245,43.1


# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [78]:
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, [0,1,2,3,4]].values
# Variable objetivo
y = dataset['house_price_of_unit_area']

In [81]:
X.shape

(414, 5)

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(289, 5)
(125, 5)
(289,)
(125,)


# Escalado de variables

In [84]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# Ajustar el modelo de Regresión lineal múltiple con el conjunto de entrenamiento


In [85]:
from sklearn.linear_model import LinearRegression

In [86]:
regression = LinearRegression()
regression.fit(X_train, y_train)

# Predicción de los resultados en el conjunto de testing


In [87]:
y_pred = regression.predict(X_test)

## Calculamos el R2 score o coeficiente de determinación

In [88]:
# Sacamos el coeficiente de determinación
from sklearn.metrics import r2_score

r2 = r2_score(y_pred = y_pred, y_true = y_test)
r2

0.5681018510026279

In [89]:
# Calculamos el coeficiente de R cuadrado ajustado
def r2_adjusted(y_pred, X_test, r2):    
    N=y_pred.shape[0]
    p= X_test.shape[1]
    x = (1-r2)
    y = (N-1) / (N-p-1)
    adj_rsquared = (1 - (x * y))
    return adj_rsquared

In [90]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.5499548699523181


## Selección de variables automatizada

In [91]:
dataset = pd.read_csv("house_price_unit_area.csv")

# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [92]:
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, 0:5].values
# Variable objetivo
y = dataset['house_price_of_unit_area']

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(276, 5)
(138, 5)
(276,)
(138,)


# Escalado de variables

In [95]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [96]:
X_train_df = pd.DataFrame(X_train, columns = dataset.drop('house_price_of_unit_area', axis = 1).columns)
X_test_df = pd.DataFrame(X_test, columns = dataset.drop('house_price_of_unit_area', axis = 1).columns)

In [None]:
def r2_adjusted(y_test, y_pred, r2, X_test):
    N = y_pred.shape[0]
    p = X_test.shape[1]
    t = (1-r2)
    k = (N-1) / (N-p-1)
    adj_rsquared = (1-(t*k))
    return adj_rsquared

In [97]:
from sklearn.metrics import r2_score
dataset_seleccion = pd.DataFrame()
for i in range(1, len(X_train_df.columns)+1):
    print(i)
    sbs = SFS(LinearRegression(),k_features=i,forward=False,floating=False)
    sbs.fit(np.array(X_train), y_train)
    regression = LinearRegression()
    regression.fit(X_train_df[[X_train_df.columns[int(index)] for index in list(sbs.k_feature_idx_)]],
                   y_train)
    y_pred = regression.predict(X_test_df[[X_train_df.columns[int(index)]
                                           for index in list(sbs.k_feature_idx_)]])
    r2 = r2_score(y_pred = y_pred, y_true = y_test)
    r2_adj = r2_adjusted(y_test, y_pred, r2, X_test)
    dataset_seleccion = pd.concat([dataset_seleccion,pd.DataFrame({
        "variables":[[X_train_df.columns[int(index)] for index in list(sbs.k_feature_idx_)]],
        "r2_adj": r2_adj})],ignore_index = True)


1
2
3
4
5


In [98]:
X_train

array([[ 0.42762197, -0.57119674,  2.10153899,  1.12187022,  0.29117841],
       [-1.44478818, -0.71811023,  0.70641028, -0.26735354,  0.50517682],
       [-0.36726913, -0.48218758,  0.3576281 , -0.31125677,  0.27481767],
       ...,
       [-0.33194064,  2.44475207, -1.38628278, -2.37314078, -1.92013857],
       [ 1.63762287, -0.36487056, -0.33993625,  0.51036089,  0.26042022],
       [-0.95018927, -0.79957882,  1.75275681,  0.40844267,  0.64980572]])

In [99]:
dataset_seleccion.sort_values(by = "r2", ascending = False, ignore_index = True)

Unnamed: 0,variables,r2
0,"[house_age, distance_to_the_nearest_station, n...",0.567708
1,"[house_age, distance_to_the_nearest_station, n...",0.567685
2,"[house_age, distance_to_the_nearest_station, n...",0.55998
3,"[house_age, distance_to_the_nearest_station]",0.482721
4,[distance_to_the_nearest_station],0.451544


In [22]:
X = dataset[dataset_seleccion.sort_values(by = "r2", ascending = False, ignore_index = True)["variables"][0]]
y = dataset['house_price_of_unit_area']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [24]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [25]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [26]:
y_pred = regression.predict(X_test)

In [27]:
r2 = regression.score(X_test, y_test)
r2

0.6420314016343813

In [28]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.6236740376156316
