In [1]:
import pandas as pd
from sklearn import datasets, linear_model

In [2]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on scikit-learn < 1.2 — confirm the pinned version.
boston = datasets.load_boston()

In [3]:
# Wrap the raw feature matrix in a DataFrame so columns can be addressed by name.
X = pd.DataFrame(boston.data, columns=boston.feature_names)
X.head(4)  # preview the first four rows

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94


In [4]:
# Regression target wrapped in a Series (default integer index).
y = pd.Series(boston.target)
y.head(4)  # preview the first four target values

0    24.0
1    21.6
2    34.7
3    33.4
dtype: float64

In [5]:
# Check what kind of object `head` hands back.
type(X.head(4))

pandas.core.frame.DataFrame

In [6]:
# Positional slicing with `iloc` selects the same first four rows as `X.head(4)`.
X.iloc[:4]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94


In [7]:
# Sequential hold-out split: the first 400 rows train the model, the remaining rows test it.
# No shuffling happens here, so the split follows the dataset's row order.
X_train, X_test = X.iloc[:400], X.iloc[400:]
y_train, y_test = y.iloc[:400], y.iloc[400:]

# Sanity-check the four shapes in the order: X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((400, 13), (400,), (106, 13), (106,))

In [8]:
# RAD is a categorical code, so this first model simply leaves it out of both splits.
X_train_no_RAD = X_train.drop("RAD", axis=1)
X_test_no_RAD = X_test.drop("RAD", axis=1)

# Both frames should now have one column fewer than before.
(X_train_no_RAD.shape, X_test_no_RAD.shape)

((400, 12), (106, 12))

In [9]:
# Baseline: ordinary least squares on the 12 remaining features (RAD dropped).
model = linear_model.LinearRegression().fit(X_train_no_RAD, y_train)

In [10]:
# Predict on the held-out rows; `predict` returns a NumPy array, hence the slice below.
y_pred = model.predict(X_test_no_RAD)
y_pred[:4]

array([12.23802871, 18.49172552, 19.09451984, 12.58283002])

In [11]:
# One prediction per test row.
y_pred.shape

(106,)

In [12]:
# Mean squared error by hand: average of the squared residuals on the test rows.
((y_test - y_pred) ** 2).mean()

23.58360653008989

Alternativa com o scikit-learn:

In [13]:
from sklearn import metrics

In [14]:
# Same metric via scikit-learn; the value should match the manual computation.
metrics.mean_squared_error(y_test, y_pred)

23.58360653008989

Usando variáveis *dummy* em vez de descartar a coluna categórica:

In [15]:
# Distinct values taken by RAD — few enough to treat the column as categorical.
set(X["RAD"])

{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 24.0}

In [16]:
# pandas route: one indicator column per distinct RAD value.
pd.get_dummies(X["RAD"]).head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,24.0
0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0


One-hot encoding:

In [17]:
import numpy as np
from sklearn import preprocessing

In [18]:
# Learn the RAD categories from the training rows only, so nothing about the test
# rows leaks into preprocessing. `handle_unknown="ignore"` makes a category that
# never occurs in training encode as an all-zero row instead of raising.
# NOTE(review): `sparse=False` is kept because `load_boston` ties this notebook to
# scikit-learn < 1.2; on newer releases the argument is spelled `sparse_output`.
ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore").fit(X_train[["RAD"]])
# Encode every row (train and test) with the fitted categories; rows keep X's order.
rad = ohe.transform(X[["RAD"]])
# NOTE(review): the column count equals the number of RAD values seen in training;
# it matches the recorded output only if every RAD value occurs in the first 400 rows — confirm.
rad.shape

(506, 9)

In [19]:
# Peek at the first five encoded rows; each row has a single 1 marking its RAD category.
rad[:5]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [20]:
# Put the RAD indicator columns in front of the remaining numeric features.
# The row count comes from the training frame itself rather than a second hard-coded
# 400, so the encoded rows stay aligned with the split if its size ever changes.
# This relies on the training rows being the leading rows of X (a positional split).
X_train_full = np.concatenate([rad[:len(X_train_no_RAD)], X_train_no_RAD.values], axis=1)
X_train_full

array([[  1.  ,   0.  ,   0.  , ...,  15.3 , 396.9 ,   4.98],
       [  0.  ,   1.  ,   0.  , ...,  17.8 , 396.9 ,   9.14],
       [  0.  ,   1.  ,   0.  , ...,  17.8 , 392.83,   4.03],
       ...,
       [  0.  ,   0.  ,   0.  , ...,  20.2 , 393.1 ,  19.92],
       [  0.  ,   0.  ,   0.  , ...,  20.2 , 396.9 ,  30.59],
       [  0.  ,   0.  ,   0.  , ...,  20.2 , 338.16,  29.97]])

In [21]:
# Same layout for the held-out rows: RAD indicators first, then the numeric features.
# The test rows are everything after the training rows, so the offset is taken from
# the training frame's length rather than a second hard-coded 400.
# This relies on the test rows being the trailing rows of X (a positional split).
X_test_full = np.concatenate([rad[len(X_train_no_RAD):], X_test_no_RAD.values], axis=1)
X_test_full

array([[  0.  ,   0.  ,   0.  , ...,  20.2 , 396.9 ,  26.77],
       [  0.  ,   0.  ,   0.  , ...,  20.2 , 396.9 ,  20.32],
       [  0.  ,   0.  ,   0.  , ...,  20.2 , 376.11,  20.31],
       ...,
       [  1.  ,   0.  ,   0.  , ...,  21.  , 396.9 ,   5.64],
       [  1.  ,   0.  ,   0.  , ...,  21.  , 393.45,   6.48],
       [  1.  ,   0.  ,   0.  , ...,  21.  , 396.9 ,   7.88]])

In [22]:
# Same linear model, now trained on the RAD indicators plus the 12 numeric features.
full_model = linear_model.LinearRegression().fit(X_train_full, y_train)

In [23]:
# Predictions of the one-hot model on the held-out rows.
y_pred_full = full_model.predict(X_test_full)

In [24]:
# Test MSE of the model that includes one-hot encoded RAD.
# NOTE(review): the recorded value is higher than the MSE of the model without RAD;
# the notebook gives no interpretation of this — worth adding a sentence explaining it.
metrics.mean_squared_error(y_test, y_pred_full)

42.01654057477302