In [1]:
# evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

In [2]:
# read the housing CSV; header=None means the first line is treated as data, not column names
df = read_csv(filepath_or_buffer='housing.csv', header=None)

In [3]:
# convert the DataFrame to a NumPy array
# (to_numpy() is the accessor the pandas docs recommend over the older .values attribute)
data = df.to_numpy()

In [4]:
# inputs are every column except the last one
X = data[:, :-1]
# the last column is the output to predict
y = data[:, -1]

In [5]:
# hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [6]:
# report the shapes of the training inputs and outputs before any rows are dropped
print(*(arr.shape for arr in (X_train, y_train)))

(339, 13) (339,)


In [7]:
# label each training row with Local Outlier Factor (default settings);
# the filtering step that follows treats a label of -1 as an outlier
# NOTE: `yhat` holds outlier labels here and is rebound to model predictions later on
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X=X_train)

In [8]:
# keep only the rows whose outlier label is not -1
# NOTE: X_train / y_train are rebound here, so this cell is not idempotent --
# re-run the split and the outlier-detection cells before running it again,
# otherwise the mask no longer matches the number of training rows
mask = yhat != -1
X_train = X_train[mask, :]
y_train = y_train[mask]

In [9]:
# report the shapes of the training inputs and outputs after the outlier rows were dropped
print(*(arr.shape for arr in (X_train, y_train)))

(305, 13) (305,)


In [10]:
# train a linear regression on the outlier-filtered training data
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [11]:
# predict outputs for the held-out test inputs (scoring happens in the next step)
yhat = model.predict(X=X_test)

In [12]:
# score the test-set predictions with mean absolute error and print it to 3 decimals
mae = mean_absolute_error(y_true=y_test, y_pred=yhat)
print('MAE : %.3f' % mae)

MAE : 3.356
